direct-io.hg

changeset 4373:8396f6da60b4

bitkeeper revision 1.1236.1.154 (4249c430s6iKHaP4AAIWnJQScN1CyA)

Fix lazy state switching when context-switching to/from the idle
domain. Track which domain's state is on each CPU and, for each
domain, which CPUs are running on its page tables.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Mar 29 21:10:08 2005 +0000 (2005-03-29)
parents d58e731924a7
children 817e74623cf4
files xen/arch/ia64/xenmisc.c xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/arch/x86/shadow.c xen/arch/x86/smp.c xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_64/mm.c xen/common/dom0_ops.c xen/common/page_alloc.c xen/common/schedule.c xen/include/asm-x86/mm.h xen/include/public/xen.h xen/include/xen/sched.h xen/include/xen/smp.h
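
The change replaces the old synchronise_pagetables() approach with two pieces of bookkeeping: each physical CPU remembers which exec_domain's state it currently has loaded (the idle domain deliberately keeps borrowing the previous domain's page tables), and each domain keeps a cpuset bitmask of the CPUs that still hold its state. The standalone C sketch below models only that idea; the names and structure are illustrative, not the Xen symbols added in the diff.

    /*
     * Minimal model of the bookkeeping introduced by this changeset (not
     * Xen code): loaded[] plays the role of percpu_ctxt[].curr_ed and
     * domain.cpuset the role of the new d->cpuset bitmask.
     */
    #include <stdio.h>

    #define NR_CPUS 4

    struct domain {
        const char   *name;
        unsigned long cpuset;   /* bit n set => CPU n holds this domain's state */
        int           is_idle;
    };

    struct vcpu {
        struct domain *domain;
    };

    static struct vcpu *loaded[NR_CPUS];   /* whose state each CPU really holds */

    /* Heavyweight switch: update the bitmasks and (in Xen) reload page tables. */
    static void real_switch(int cpu, struct vcpu *next)
    {
        struct vcpu *prev = loaded[cpu];
        next->domain->cpuset |=  (1UL << cpu);   /* set before using new tables */
        /* write_ptbase(next) would happen here */
        prev->domain->cpuset &= ~(1UL << cpu);   /* clear only afterwards       */
        loaded[cpu] = next;
    }

    /* Lazy switch: skip the heavy work when switching to the idle domain or
     * when this CPU already holds the target's state. */
    static void lazy_switch(int cpu, struct vcpu *next)
    {
        if ( next->domain->is_idle || loaded[cpu] == next )
            return;
        real_switch(cpu, next);
    }

    int main(void)
    {
        struct domain idle = { "idle", 1UL << 0, 1 }, dom0 = { "dom0", 0, 0 };
        struct vcpu v_idle = { &idle }, v0 = { &dom0 };

        loaded[0] = &v_idle;

        lazy_switch(0, &v0);      /* real switch: CPU0 now holds dom0 state    */
        lazy_switch(0, &v_idle);  /* lazy: idle keeps borrowing dom0's tables  */
        printf("dom0 cpuset = %#lx\n", dom0.cpuset);   /* bit 0 still set      */
        return 0;
    }
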
line diff
     1.1 --- a/xen/arch/ia64/xenmisc.c	Tue Mar 29 14:52:44 2005 +0000
     1.2 +++ b/xen/arch/ia64/xenmisc.c	Tue Mar 29 21:10:08 2005 +0000
     1.3 @@ -53,7 +53,7 @@ platform_is_hp_ski(void)
     1.4  }
     1.5  
     1.6  /* calls in xen/common code that are unused on ia64 */
     1.7 -void synchronise_pagetables(unsigned long cpu_mask) { return; }
     1.8 +void synchronise_execution_state(unsigned long cpu_mask) { }
     1.9  
    1.10  int grant_table_create(struct domain *d) { return 0; }
    1.11  void grant_table_destroy(struct domain *d)
     2.1 --- a/xen/arch/x86/domain.c	Tue Mar 29 14:52:44 2005 +0000
     2.2 +++ b/xen/arch/x86/domain.c	Tue Mar 29 21:10:08 2005 +0000
     2.3 @@ -45,13 +45,18 @@
     2.4  static int opt_noreboot = 0;
     2.5  boolean_param("noreboot", opt_noreboot);
     2.6  
     2.7 +struct percpu_ctxt {
     2.8 +    struct exec_domain *curr_ed;
     2.9 +} __cacheline_aligned;
    2.10 +static struct percpu_ctxt percpu_ctxt[NR_CPUS];
    2.11 +
    2.12  static void default_idle(void)
    2.13  {
    2.14 -    __cli();
    2.15 +    local_irq_disable();
    2.16      if ( !softirq_pending(smp_processor_id()) )
    2.17          safe_halt();
    2.18      else
    2.19 -        __sti();
    2.20 +        local_irq_enable();
    2.21  }
    2.22  
    2.23  static __attribute_used__ void idle_loop(void)
    2.24 @@ -73,6 +78,8 @@ void startup_cpu_idle_loop(void)
    2.25  {
    2.26      /* Just some sanity to ensure that the scheduler is set up okay. */
    2.27      ASSERT(current->domain->id == IDLE_DOMAIN_ID);
    2.28 +    percpu_ctxt[smp_processor_id()].curr_ed = current;
    2.29 +    set_bit(smp_processor_id(), &current->domain->cpuset);
    2.30      domain_unpause_by_systemcontroller(current->domain);
    2.31      raise_softirq(SCHEDULE_SOFTIRQ);
    2.32      do_softirq();
    2.33 @@ -110,7 +117,7 @@ void machine_restart(char * __unused)
    2.34              safe_halt();
    2.35      }
    2.36  
    2.37 -    __sti();
    2.38 +    local_irq_enable();
    2.39  
    2.40      /* Ensure we are the boot CPU. */
    2.41      if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
    2.42 @@ -307,10 +314,10 @@ unsigned long alloc_monitor_pagetable(st
    2.43      struct pfn_info *mmfn_info;
    2.44      struct domain *d = ed->domain;
    2.45  
    2.46 -    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
    2.47 +    ASSERT(pagetable_val(ed->arch.monitor_table) == 0);
    2.48  
    2.49      mmfn_info = alloc_domheap_page(NULL);
    2.50 -    ASSERT( mmfn_info ); 
    2.51 +    ASSERT(mmfn_info != NULL); 
    2.52  
    2.53      mmfn = (unsigned long) (mmfn_info - frame_table);
    2.54      mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
    2.55 @@ -326,7 +333,7 @@ unsigned long alloc_monitor_pagetable(st
    2.56  
    2.57      ed->arch.monitor_vtable = mpl2e;
    2.58  
    2.59 -    // map the phys_to_machine map into the Read-Only MPT space for this domain
    2.60 +    /* Map the p2m map into the Read-Only MPT space for this domain. */
    2.61      mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
    2.62          mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
    2.63  
    2.64 @@ -578,19 +585,10 @@ void toggle_guest_mode(struct exec_domai
    2.65          : "=r" (__r) : "r" (value), "0" (__r) );\
    2.66      __r; })
    2.67  
    2.68 -static void switch_segments(
    2.69 -    struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n)
    2.70 +static void load_segments(struct exec_domain *p, struct exec_domain *n)
    2.71  {
    2.72      int all_segs_okay = 1;
    2.73  
    2.74 -    if ( !is_idle_task(p->domain) )
    2.75 -    {
    2.76 -        __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
    2.77 -        __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
    2.78 -        __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
    2.79 -        __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
    2.80 -    }
    2.81 -
    2.82      /* Either selector != 0 ==> reload. */
    2.83      if ( unlikely(p->arch.user_ctxt.ds |
    2.84                    n->arch.user_ctxt.ds) )
    2.85 @@ -654,7 +652,8 @@ static void switch_segments(
    2.86  
    2.87      if ( unlikely(!all_segs_okay) )
    2.88      {
    2.89 -        unsigned long *rsp =
    2.90 +        struct xen_regs *regs = get_execution_context();
    2.91 +        unsigned long   *rsp =
    2.92              (n->arch.flags & TF_kernel_mode) ?
    2.93              (unsigned long *)regs->rsp : 
    2.94              (unsigned long *)n->arch.kernel_sp;
    2.95 @@ -689,6 +688,24 @@ static void switch_segments(
    2.96      }
    2.97  }
    2.98  
    2.99 +static void save_segments(struct exec_domain *p)
   2.100 +{
   2.101 +    __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
   2.102 +    __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
   2.103 +    __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
   2.104 +    __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
   2.105 +}
   2.106 +
   2.107 +static void clear_segments(void)
   2.108 +{
   2.109 +    __asm__ __volatile__ (
   2.110 +        "movl %0,%%ds; "
   2.111 +        "movl %0,%%es; "
   2.112 +        "movl %0,%%fs; "
   2.113 +        "movl %0,%%gs; swapgs; movl %0,%%gs"
   2.114 +        : : "r" (0) );
   2.115 +}
   2.116 +
   2.117  long do_switch_to_user(void)
   2.118  {
   2.119      struct xen_regs       *regs = get_execution_context();
   2.120 @@ -720,80 +737,96 @@ long do_switch_to_user(void)
   2.121  
   2.122  #elif defined(__i386__)
   2.123  
   2.124 -#define switch_segments(_r, _p, _n) ((void)0)
   2.125 +#define load_segments(_p, _n) ((void)0)
   2.126 +#define save_segments(_p)     ((void)0)
   2.127 +#define clear_segments()      ((void)0)
   2.128  
   2.129  #endif
   2.130  
   2.131 -/*
   2.132 - * This special macro can be used to load a debugging register
   2.133 - */
   2.134  #define loaddebug(_ed,_reg) \
   2.135 -		__asm__("mov %0,%%db" #_reg  \
   2.136 -			: /* no output */ \
   2.137 -			:"r" ((_ed)->debugreg[_reg]))
   2.138 +	__asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg]))
   2.139  
   2.140 -void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p)
   2.141 +static void __context_switch(void)
   2.142  {
   2.143 -#ifdef __i386__
   2.144 -    struct tss_struct *tss = init_tss + smp_processor_id();
   2.145 -#endif
   2.146      execution_context_t *stack_ec = get_execution_context();
   2.147 +    unsigned int         cpu = smp_processor_id();
   2.148 +    struct exec_domain  *p = percpu_ctxt[cpu].curr_ed;
   2.149 +    struct exec_domain  *n = current;
   2.150  
   2.151 -    __cli();
   2.152 -
   2.153 -    /* Switch guest general-register state. */
   2.154 -    if ( !is_idle_task(prev_p->domain) )
   2.155 +    if ( !is_idle_task(p->domain) )
   2.156      {
   2.157 -        memcpy(&prev_p->arch.user_ctxt,
   2.158 +        memcpy(&p->arch.user_ctxt,
   2.159                 stack_ec, 
   2.160                 sizeof(*stack_ec));
   2.161 -        unlazy_fpu(prev_p);
   2.162 -        CLEAR_FAST_TRAP(&prev_p->arch);
   2.163 +        unlazy_fpu(p);
   2.164 +        CLEAR_FAST_TRAP(&p->arch);
   2.165 +        save_segments(p);
   2.166      }
   2.167  
   2.168 -    if ( !is_idle_task(next_p->domain) )
   2.169 -    {
   2.170 -        memcpy(stack_ec,
   2.171 -               &next_p->arch.user_ctxt,
   2.172 -               sizeof(*stack_ec));
   2.173 +    memcpy(stack_ec,
   2.174 +           &n->arch.user_ctxt,
   2.175 +           sizeof(*stack_ec));
   2.176  
   2.177 -        /* Maybe switch the debug registers. */
   2.178 -        if ( unlikely(next_p->arch.debugreg[7]) )
   2.179 -        {
   2.180 -            loaddebug(&next_p->arch, 0);
   2.181 -            loaddebug(&next_p->arch, 1);
   2.182 -            loaddebug(&next_p->arch, 2);
   2.183 -            loaddebug(&next_p->arch, 3);
   2.184 -            /* no 4 and 5 */
   2.185 -            loaddebug(&next_p->arch, 6);
   2.186 -            loaddebug(&next_p->arch, 7);
   2.187 -        }
   2.188 +    /* Maybe switch the debug registers. */
   2.189 +    if ( unlikely(n->arch.debugreg[7]) )
   2.190 +    {
   2.191 +        loaddebug(&n->arch, 0);
   2.192 +        loaddebug(&n->arch, 1);
   2.193 +        loaddebug(&n->arch, 2);
   2.194 +        loaddebug(&n->arch, 3);
   2.195 +        /* no 4 and 5 */
   2.196 +        loaddebug(&n->arch, 6);
   2.197 +        loaddebug(&n->arch, 7);
   2.198 +    }
   2.199  
   2.200 -        if ( !VMX_DOMAIN(next_p) )
   2.201 -        {
   2.202 -            SET_FAST_TRAP(&next_p->arch);
   2.203 +    if ( !VMX_DOMAIN(n) )
   2.204 +    {
   2.205 +        SET_FAST_TRAP(&n->arch);
   2.206  
   2.207  #ifdef __i386__
   2.208 +        {
   2.209              /* Switch the kernel ring-1 stack. */
   2.210 -            tss->esp1 = next_p->arch.kernel_sp;
   2.211 -            tss->ss1  = next_p->arch.kernel_ss;
   2.212 +            struct tss_struct *tss = &init_tss[cpu];
   2.213 +            tss->esp1 = n->arch.kernel_sp;
   2.214 +            tss->ss1  = n->arch.kernel_ss;
   2.215 +        }
   2.216  #endif
   2.217 -        }
   2.218 -
   2.219 -        /* Switch page tables. */
   2.220 -        write_ptbase(next_p);
   2.221      }
   2.222  
   2.223 -    set_current(next_p);
   2.224 -
   2.225 -    __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->arch.gdt));
   2.226 +    set_bit(cpu, &n->domain->cpuset);
   2.227 +    write_ptbase(n);
   2.228 +    clear_bit(cpu, &p->domain->cpuset);
   2.229  
   2.230 -    __sti();
   2.231 +    __asm__ __volatile__ ( "lgdt %0" : "=m" (*n->arch.gdt) );
   2.232  
   2.233 -    if ( !VMX_DOMAIN(next_p) )
   2.234 +    percpu_ctxt[cpu].curr_ed = n;
   2.235 +}
   2.236 +
   2.237 +
   2.238 +void context_switch(struct exec_domain *prev, struct exec_domain *next)
   2.239 +{
   2.240 +    struct exec_domain *realprev;
   2.241 +
   2.242 +    local_irq_disable();
   2.243 +
   2.244 +    set_current(next);
   2.245 +
   2.246 +    if ( ((realprev = percpu_ctxt[smp_processor_id()]. curr_ed) == next) || 
   2.247 +         is_idle_task(next->domain) )
   2.248      {
   2.249 -        load_LDT(next_p);
   2.250 -        switch_segments(stack_ec, prev_p, next_p);
   2.251 +        local_irq_enable();
   2.252 +    }
   2.253 +    else
   2.254 +    {
   2.255 +        __context_switch();
   2.256 +
   2.257 +        local_irq_enable();
   2.258 +        
   2.259 +        if ( !VMX_DOMAIN(next) )
   2.260 +        {
   2.261 +            load_LDT(next);
   2.262 +            load_segments(realprev, next);
   2.263 +        }
   2.264      }
   2.265  
   2.266      /*
   2.267 @@ -802,13 +835,27 @@ void context_switch(struct exec_domain *
   2.268       * 'prev' (after this point, a dying domain's info structure may be freed
   2.269       * without warning). 
   2.270       */
   2.271 -    clear_bit(EDF_RUNNING, &prev_p->ed_flags);
   2.272 +    clear_bit(EDF_RUNNING, &prev->ed_flags);
   2.273  
   2.274 -    schedule_tail(next_p);
   2.275 +    schedule_tail(next);
   2.276  
   2.277      BUG();
   2.278  }
   2.279  
   2.280 +static void __synchronise_lazy_execstate(void *unused)
   2.281 +{
   2.282 +    if ( percpu_ctxt[smp_processor_id()].curr_ed != current )
   2.283 +    {
   2.284 +        __context_switch();
   2.285 +        load_LDT(current);
   2.286 +        clear_segments();
   2.287 +    }
   2.288 +}
   2.289 +void synchronise_lazy_execstate(unsigned long cpuset)
   2.290 +{
   2.291 +    smp_subset_call_function(__synchronise_lazy_execstate, NULL, 1, cpuset);
   2.292 +}
   2.293 +
   2.294  unsigned long __hypercall_create_continuation(
   2.295      unsigned int op, unsigned int nr_args, ...)
   2.296  {
   2.297 @@ -947,13 +994,11 @@ void domain_relinquish_memory(struct dom
   2.298  {
   2.299      struct exec_domain *ed;
   2.300  
   2.301 -    /* Ensure that noone is running over the dead domain's page tables. */
   2.302 -    synchronise_pagetables(~0UL);
   2.303 +    BUG_ON(d->cpuset != 0);
   2.304  
   2.305      /* Release device mappings of other domains */
   2.306      gnttab_release_dev_mappings( d->grant_table );
   2.307  
   2.308 -
   2.309      /* Exit shadow mode before deconstructing final guest page table. */
   2.310      shadow_mode_disable(d);
   2.311  
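
The new context_switch() above defers the real state switch whenever the target is the idle domain or the state is already loaded; synchronise_lazy_execstate() is the escape hatch that forces any CPU in a given cpuset to complete that deferred switch (Xen delivers it as an IPI through smp_subset_call_function()). Below is a hedged standalone sketch of that idea, with a plain loop standing in for the IPI and invented names for the per-CPU arrays.

    #include <stdio.h>

    #define NR_CPUS 4

    struct vcpu { const char *name; };

    static struct vcpu *loaded[NR_CPUS];    /* like percpu_ctxt[cpu].curr_ed */
    static struct vcpu *scheduled[NR_CPUS]; /* like each CPU's 'current'     */

    /* Stand-in for __context_switch(): actually load 'current' state. */
    static void finish_lazy_switch(int cpu)
    {
        loaded[cpu] = scheduled[cpu];
        printf("cpu%d reloaded its own state (%s)\n", cpu, loaded[cpu]->name);
    }

    /* Force CPUs in 'cpuset' that still run on borrowed state to catch up. */
    static void synchronise_lazy_execstate(unsigned long cpuset)
    {
        for ( int cpu = 0; cpu < NR_CPUS; cpu++ )
            if ( (cpuset & (1UL << cpu)) && loaded[cpu] != scheduled[cpu] )
                finish_lazy_switch(cpu);
    }

    int main(void)
    {
        struct vcpu dom0 = { "dom0" }, idle = { "idle" };

        loaded[0] = &dom0;       /* CPU0 went idle but kept dom0's state */
        scheduled[0] = &idle;

        synchronise_lazy_execstate(1UL << 0);   /* e.g. before dom0 is torn down */
        return 0;
    }

This is why domain_relinquish_memory() can now simply assert d->cpuset == 0 instead of broadcasting synchronise_pagetables(~0UL): the pause paths are expected to have drained the lazy state already.
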
     3.1 --- a/xen/arch/x86/domain_build.c	Tue Mar 29 14:52:44 2005 +0000
     3.2 +++ b/xen/arch/x86/domain_build.c	Tue Mar 29 21:10:08 2005 +0000
     3.3 @@ -421,7 +421,7 @@ int construct_dom0(struct domain *d,
     3.4      update_pagetables(ed);
     3.5  
     3.6      /* Install the new page tables. */
     3.7 -    __cli();
     3.8 +    local_irq_disable();
     3.9      write_ptbase(ed);
    3.10  
    3.11      /* Copy the OS image and free temporary buffer. */
    3.12 @@ -498,7 +498,7 @@ int construct_dom0(struct domain *d,
    3.13  
    3.14      /* Reinstate the caller's page tables. */
    3.15      write_ptbase(current);
    3.16 -    __sti();
    3.17 +    local_irq_enable();
    3.18  
    3.19  #if defined(__i386__)
    3.20      /* Destroy low mappings - they were only for our convenience. */
     4.1 --- a/xen/arch/x86/mm.c	Tue Mar 29 14:52:44 2005 +0000
     4.2 +++ b/xen/arch/x86/mm.c	Tue Mar 29 21:10:08 2005 +0000
     4.3 @@ -1147,16 +1147,13 @@ int get_page_type(struct pfn_info *page,
     4.4                   * may be unnecessary (e.g., page was GDT/LDT) but those
     4.5                   * circumstances should be very rare.
     4.6                   */
     4.7 -                struct exec_domain *ed;
     4.8 -                unsigned long mask = 0;
     4.9 -                for_each_exec_domain ( page_get_owner(page), ed )
    4.10 -                    mask |= 1 << ed->processor;
    4.11 -                mask = tlbflush_filter_cpuset(mask, page->tlbflush_timestamp);
    4.12 -
    4.13 -                if ( unlikely(mask != 0) )
    4.14 +                unsigned long cpuset = tlbflush_filter_cpuset(
    4.15 +                    page_get_owner(page)->cpuset, page->tlbflush_timestamp);
    4.16 +
    4.17 +                if ( unlikely(cpuset != 0) )
    4.18                  {
    4.19                      perfc_incrc(need_flush_tlb_flush);
    4.20 -                    flush_tlb_mask(mask);
    4.21 +                    flush_tlb_mask(cpuset);
    4.22                  }
    4.23  
    4.24                  /* We lose existing type, back pointer, and validity. */
    4.25 @@ -2842,7 +2839,7 @@ void audit_domain(struct domain *d)
    4.26  
    4.27      if ( d != current->domain )
    4.28          domain_pause(d);
    4.29 -    synchronise_pagetables(~0UL);
    4.30 +    synchronise_lazy_execstate(~0UL);
    4.31  
    4.32      printk("pt base=%lx sh_info=%x\n",
    4.33             pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
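
The get_page_type() hunk above now derives the set of CPUs to IPI directly from the owning domain's cpuset and filters it against the page's TLB-flush timestamp. The sketch below models that filtering in plain C; Xen's tlbflush_filter_cpuset() also handles wrap of the global flush clock, which is omitted here, and all names are illustrative.

    #include <stdio.h>

    #define NR_CPUS 4

    /* Per-CPU time of the last full TLB flush, in 'tlbflush clock' ticks. */
    static unsigned int tlbflush_time[NR_CPUS];

    /* Keep only the CPUs that have not flushed since the page's timestamp,
     * i.e. the CPUs that might still cache stale mappings of the page. */
    static unsigned long filter_cpuset(unsigned long cpuset, unsigned int page_stamp)
    {
        unsigned long need_flush = 0;

        for ( int cpu = 0; cpu < NR_CPUS; cpu++ )
            if ( (cpuset & (1UL << cpu)) && tlbflush_time[cpu] <= page_stamp )
                need_flush |= 1UL << cpu;
        return need_flush;
    }

    int main(void)
    {
        tlbflush_time[0] = 10;   /* CPU0 flushed after the page changed type */
        tlbflush_time[1] = 3;    /* CPU1 has not                             */

        /* Owning domain runs on CPUs 0 and 1; page timestamp is 5. */
        printf("flush mask = %#lx\n", filter_cpuset(0x3, 5));   /* -> 0x2 */
        return 0;
    }
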
     5.1 --- a/xen/arch/x86/shadow.c	Tue Mar 29 14:52:44 2005 +0000
     5.2 +++ b/xen/arch/x86/shadow.c	Tue Mar 29 21:10:08 2005 +0000
     5.3 @@ -384,7 +384,6 @@ int shadow_mode_control(struct domain *d
     5.4      }   
     5.5  
     5.6      domain_pause(d);
     5.7 -    synchronise_pagetables(~0UL);
     5.8  
     5.9      shadow_lock(d);
    5.10  
     6.1 --- a/xen/arch/x86/smp.c	Tue Mar 29 14:52:44 2005 +0000
     6.2 +++ b/xen/arch/x86/smp.c	Tue Mar 29 21:10:08 2005 +0000
     6.3 @@ -59,9 +59,7 @@
     6.4   */
     6.5  
     6.6  /*
     6.7 - * the following functions deal with sending IPIs between CPUs.
     6.8 - *
     6.9 - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
    6.10 + * The following functions deal with sending IPIs between CPUs.
    6.11   */
    6.12  
    6.13  static inline int __prepare_ICR (unsigned int shortcut, int vector)
    6.14 @@ -82,22 +80,22 @@ static inline void __send_IPI_shortcut(u
    6.15       * of the value read we use an atomic rmw access to avoid costly
    6.16       * cli/sti.  Otherwise we use an even cheaper single atomic write
    6.17       * to the APIC.
    6.18 -	 */
    6.19 +     */
    6.20      unsigned int cfg;
    6.21  
    6.22      /*
    6.23 -	 * Wait for idle.
    6.24 -	 */
    6.25 +     * Wait for idle.
    6.26 +     */
    6.27      apic_wait_icr_idle();
    6.28  
    6.29      /*
    6.30 -	 * No need to touch the target chip field
    6.31 -	 */
    6.32 +     * No need to touch the target chip field
    6.33 +     */
    6.34      cfg = __prepare_ICR(shortcut, vector);
    6.35  
    6.36      /*
    6.37 -	 * Send the IPI. The write to APIC_ICR fires this off.
    6.38 -	 */
    6.39 +     * Send the IPI. The write to APIC_ICR fires this off.
    6.40 +     */
    6.41      apic_write_around(APIC_ICR, cfg);
    6.42  }
    6.43  
    6.44 @@ -111,106 +109,44 @@ static inline void send_IPI_mask(int mas
    6.45      unsigned long cfg;
    6.46      unsigned long flags;
    6.47  
    6.48 -    __save_flags(flags);
    6.49 -    __cli();
    6.50 +    local_irq_save(flags);
    6.51  
    6.52 -		
    6.53      /*
    6.54       * Wait for idle.
    6.55       */
    6.56      apic_wait_icr_idle();
    6.57 -		
    6.58 +
    6.59      /*
    6.60       * prepare target chip field
    6.61       */
    6.62      cfg = __prepare_ICR2(mask);
    6.63      apic_write_around(APIC_ICR2, cfg);
    6.64 -		
    6.65 +
    6.66      /*
    6.67       * program the ICR 
    6.68       */
    6.69      cfg = __prepare_ICR(0, vector);
    6.70 -			
    6.71 +
    6.72      /*
    6.73       * Send the IPI. The write to APIC_ICR fires this off.
    6.74       */
    6.75      apic_write_around(APIC_ICR, cfg);
    6.76  
    6.77 -    __restore_flags(flags);
    6.78 +    local_irq_restore(flags);
    6.79  }
    6.80  
    6.81  static inline void send_IPI_allbutself(int vector)
    6.82  {
    6.83      /*
    6.84 -     * if there are no other CPUs in the system then
    6.85 -     * we get an APIC send error if we try to broadcast.
    6.86 -     * thus we have to avoid sending IPIs in this case.
    6.87 +     * If there are no other CPUs in the system then we get an APIC send error 
    6.88 +     * if we try to broadcast. thus we have to avoid sending IPIs in this case.
    6.89       */
    6.90 -    if (!(smp_num_cpus > 1))
    6.91 +    if ( smp_num_cpus <= 1 )
    6.92          return;
    6.93  
    6.94      __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
    6.95  }
    6.96  
    6.97 -/*
    6.98 - * ********* XEN NOTICE **********
    6.99 - * I've left the following comments lying around as they look liek they might
   6.100 - * be useful to get multiprocessor guest OSes going. However, I suspect the
   6.101 - * issues we face will be quite different so I've ripped out all the
   6.102 - * TLBSTATE logic (I didn't understand it anyway :-). These comments do
   6.103 - * not apply to Xen, therefore! -- Keir (8th Oct 2003).
   6.104 - */
   6.105 -/*
   6.106 - *	Smarter SMP flushing macros. 
   6.107 - *		c/o Linus Torvalds.
   6.108 - *
   6.109 - *	These mean you can really definitely utterly forget about
   6.110 - *	writing to user space from interrupts. (Its not allowed anyway).
   6.111 - *
   6.112 - *	Optimizations Manfred Spraul <manfred@colorfullife.com>
   6.113 - *
   6.114 - * The flush IPI assumes that a thread switch happens in this order:
   6.115 - * [cpu0: the cpu that switches]
   6.116 - * 1) switch_mm() either 1a) or 1b)
   6.117 - * 1a) thread switch to a different mm
   6.118 - * 1a1) clear_bit(cpu, &old_mm.cpu_vm_mask);
   6.119 - * 	Stop ipi delivery for the old mm. This is not synchronized with
   6.120 - * 	the other cpus, but smp_invalidate_interrupt ignore flush ipis
   6.121 - * 	for the wrong mm, and in the worst case we perform a superflous
   6.122 - * 	tlb flush.
   6.123 - * 1a2) set cpu_tlbstate to TLBSTATE_OK
   6.124 - * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
   6.125 - *	was in lazy tlb mode.
   6.126 - * 1a3) update cpu_tlbstate[].active_mm
   6.127 - * 	Now cpu0 accepts tlb flushes for the new mm.
   6.128 - * 1a4) set_bit(cpu, &new_mm.cpu_vm_mask);
   6.129 - * 	Now the other cpus will send tlb flush ipis.
   6.130 - * 1a4) change cr3.
   6.131 - * 1b) thread switch without mm change
   6.132 - *	cpu_tlbstate[].active_mm is correct, cpu0 already handles
   6.133 - *	flush ipis.
   6.134 - * 1b1) set cpu_tlbstate to TLBSTATE_OK
   6.135 - * 1b2) test_and_set the cpu bit in cpu_vm_mask.
   6.136 - * 	Atomically set the bit [other cpus will start sending flush ipis],
   6.137 - * 	and test the bit.
   6.138 - * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
   6.139 - * 2) switch %%esp, ie current
   6.140 - *
   6.141 - * The interrupt must handle 2 special cases:
   6.142 - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
   6.143 - * - the cpu performs speculative tlb reads, i.e. even if the cpu only
   6.144 - *   runs in kernel space, the cpu could load tlb entries for user space
   6.145 - *   pages.
   6.146 - *
   6.147 - * The good news is that cpu_tlbstate is local to each cpu, no
   6.148 - * write/read ordering problems.
   6.149 - *
   6.150 - * TLB flush IPI:
   6.151 - *
   6.152 - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
   6.153 - * 2) Leave the mm if we are in the lazy tlb mode.
   6.154 - */
   6.155 -
   6.156  static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED;
   6.157  static unsigned long flush_cpumask;
   6.158  
   6.159 @@ -226,21 +162,19 @@ void flush_tlb_mask(unsigned long mask)
   6.160  {
   6.161      ASSERT(local_irq_is_enabled());
   6.162      
   6.163 -    if ( mask & (1 << smp_processor_id()) )
   6.164 +    if ( mask & (1UL << smp_processor_id()) )
   6.165      {
   6.166          local_flush_tlb();
   6.167 -        mask &= ~(1 << smp_processor_id());
   6.168 +        mask &= ~(1UL << smp_processor_id());
   6.169      }
   6.170  
   6.171      if ( mask != 0 )
   6.172      {
   6.173          spin_lock(&flush_lock);
   6.174 -
   6.175          flush_cpumask = mask;
   6.176          send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
   6.177          while ( flush_cpumask != 0 )
   6.178              cpu_relax();
   6.179 -
   6.180          spin_unlock(&flush_lock);
   6.181      }
   6.182  }
   6.183 @@ -254,7 +188,8 @@ void new_tlbflush_clock_period(void)
   6.184      if ( smp_num_cpus > 1 )
   6.185      {
   6.186          spin_lock(&flush_lock);
   6.187 -        flush_cpumask = ((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id());
   6.188 +        flush_cpumask  = (1UL << smp_num_cpus) - 1;
   6.189 +        flush_cpumask &= ~(1UL << smp_processor_id());
   6.190          send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
   6.191          while ( flush_cpumask != 0 )
   6.192              cpu_relax();
   6.193 @@ -266,124 +201,138 @@ void new_tlbflush_clock_period(void)
   6.194      tlbflush_clock++;
   6.195  }
   6.196  
   6.197 -static void flush_tlb_all_pge_ipi(void* info)
   6.198 +static void flush_tlb_all_pge_ipi(void *info)
   6.199  {
   6.200      __flush_tlb_pge();
   6.201  }
   6.202  
   6.203  void flush_tlb_all_pge(void)
   6.204  {
   6.205 -    smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
   6.206 +    smp_call_function(flush_tlb_all_pge_ipi, 0, 1, 1);
   6.207      __flush_tlb_pge();
   6.208  }
   6.209  
   6.210  void smp_send_event_check_mask(unsigned long cpu_mask)
   6.211  {
   6.212 -    cpu_mask &= ~(1<<smp_processor_id());
   6.213 +    cpu_mask &= ~(1UL << smp_processor_id());
   6.214      if ( cpu_mask != 0 )
   6.215          send_IPI_mask(cpu_mask, EVENT_CHECK_VECTOR);
   6.216  }
   6.217  
   6.218  /*
   6.219 - * Structure and data for smp_call_function(). This is designed to minimise
   6.220 - * static memory requirements. It also looks cleaner.
   6.221 + * Structure and data for smp_call_function().
   6.222   */
   6.223 -static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
   6.224  
   6.225  struct call_data_struct {
   6.226      void (*func) (void *info);
   6.227      void *info;
   6.228 -    atomic_t started;
   6.229 -    atomic_t finished;
   6.230 +    unsigned long started;
   6.231 +    unsigned long finished;
   6.232      int wait;
   6.233  };
   6.234  
   6.235 -static struct call_data_struct * call_data;
   6.236 +static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
   6.237 +static struct call_data_struct *call_data;
   6.238  
   6.239  /*
   6.240 - * this function sends a 'generic call function' IPI to all other CPUs
   6.241 - * in the system.
   6.242 + * Run a function on all other CPUs.
   6.243 + *  @func: The function to run. This must be fast and non-blocking.
   6.244 + *  @info: An arbitrary pointer to pass to the function.
   6.245 + *  @wait: If true, spin until function has completed on other CPUs.
   6.246 + *  Returns: 0 on success, else a negative status code.
   6.247   */
   6.248 -
   6.249 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
   6.250 -                       int wait)
   6.251 -/*
   6.252 - * [SUMMARY] Run a function on all other CPUs.
   6.253 - * <func> The function to run. This must be fast and non-blocking.
   6.254 - * <info> An arbitrary pointer to pass to the function.
   6.255 - * <nonatomic> currently unused.
   6.256 - * <wait> If true, wait (atomically) until function has completed on other CPUs.
   6.257 - * [RETURNS] 0 on success, else a negative status code. Does not return until
   6.258 - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
   6.259 - *
   6.260 - * You must not call this function with disabled interrupts or from a
   6.261 - * hardware interrupt handler, or bottom halfs.
   6.262 - */
   6.263 +int smp_call_function(
   6.264 +    void (*func) (void *info), void *info, int unused, int wait)
   6.265  {
   6.266      struct call_data_struct data;
   6.267 -    int cpus = smp_num_cpus-1;
   6.268 +    unsigned long cpuset;
   6.269  
   6.270 -    if (!cpus)
   6.271 +    ASSERT(local_irq_is_enabled());
   6.272 +
   6.273 +    cpuset = ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id());
   6.274 +    if ( cpuset == 0 )
   6.275          return 0;
   6.276  
   6.277      data.func = func;
   6.278      data.info = info;
   6.279 -    atomic_set(&data.started, 0);
   6.280 +    data.started = data.finished = 0;
   6.281      data.wait = wait;
   6.282 -    if (wait)
   6.283 -        atomic_set(&data.finished, 0);
   6.284 -
   6.285 -    ASSERT(local_irq_is_enabled());
   6.286  
   6.287      spin_lock(&call_lock);
   6.288  
   6.289      call_data = &data;
   6.290      wmb();
   6.291 -    /* Send a message to all other CPUs and wait for them to respond */
   6.292 +
   6.293      send_IPI_allbutself(CALL_FUNCTION_VECTOR);
   6.294  
   6.295 -    /* Wait for response */
   6.296 -    while (atomic_read(&data.started) != cpus)
   6.297 -        barrier();
   6.298 -
   6.299 -    if (wait)
   6.300 -        while (atomic_read(&data.finished) != cpus)
   6.301 -            barrier();
   6.302 +    while ( (wait ? data.finished : data.started) != cpuset )
   6.303 +        cpu_relax();
   6.304  
   6.305      spin_unlock(&call_lock);
   6.306  
   6.307      return 0;
   6.308  }
   6.309  
   6.310 -static void stop_this_cpu (void * dummy)
   6.311 +/* Run a function on a subset of CPUs (may include local CPU). */
   6.312 +int smp_subset_call_function(
   6.313 +    void (*func) (void *info), void *info, int wait, unsigned long cpuset)
   6.314  {
   6.315 -    /*
   6.316 -     * Remove this CPU:
   6.317 -     */
   6.318 -    clear_bit(smp_processor_id(), &cpu_online_map);
   6.319 -    __cli();
   6.320 -    disable_local_APIC();
   6.321 -    for(;;) __asm__("hlt");
   6.322 +    struct call_data_struct data;
   6.323 +
   6.324 +    ASSERT(local_irq_is_enabled());
   6.325 +
   6.326 +    if ( cpuset & (1UL << smp_processor_id()) )
   6.327 +    {
   6.328 +        local_irq_disable();
   6.329 +        (*func)(info);
   6.330 +        local_irq_enable();
   6.331 +    }
   6.332 +
   6.333 +    cpuset &= ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id());
   6.334 +    if ( cpuset == 0 )
   6.335 +        return 0;
   6.336 +
   6.337 +    data.func = func;
   6.338 +    data.info = info;
   6.339 +    data.started = data.finished = 0;
   6.340 +    data.wait = wait;
   6.341 +
   6.342 +    spin_lock(&call_lock);
   6.343 +
   6.344 +    call_data = &data;
   6.345 +    wmb();
   6.346 +
   6.347 +    send_IPI_mask(cpuset, CALL_FUNCTION_VECTOR);
   6.348 +
   6.349 +    while ( (wait ? data.finished : data.started) != cpuset )
   6.350 +        cpu_relax();
   6.351 +
   6.352 +    spin_unlock(&call_lock);
   6.353 +
   6.354 +    return 0;
   6.355  }
   6.356  
   6.357 -/*
   6.358 - * this function calls the 'stop' function on all other CPUs in the system.
   6.359 - */
   6.360 +static void stop_this_cpu (void *dummy)
   6.361 +{
   6.362 +    clear_bit(smp_processor_id(), &cpu_online_map);
   6.363 +
   6.364 +    disable_local_APIC();
   6.365 +
   6.366 +    for ( ; ; )
   6.367 +        __asm__ __volatile__ ( "hlt" );
   6.368 +}
   6.369  
   6.370  void smp_send_stop(void)
   6.371  {
   6.372 +    /* Stop all other CPUs in the system. */
   6.373      smp_call_function(stop_this_cpu, NULL, 1, 0);
   6.374      smp_num_cpus = 1;
   6.375  
   6.376 -    __cli();
   6.377 +    local_irq_disable();
   6.378      disable_local_APIC();
   6.379 -    __sti();
   6.380 +    local_irq_enable();
   6.381  }
   6.382  
   6.383 -/*
   6.384 - * Nothing to do, as all the work is done automatically when
   6.385 - * we return from the interrupt.
   6.386 - */
   6.387  asmlinkage void smp_event_check_interrupt(void)
   6.388  {
   6.389      ack_APIC_irq();
   6.390 @@ -394,23 +343,20 @@ asmlinkage void smp_call_function_interr
   6.391  {
   6.392      void (*func) (void *info) = call_data->func;
   6.393      void *info = call_data->info;
   6.394 -    int wait = call_data->wait;
   6.395  
   6.396      ack_APIC_irq();
   6.397      perfc_incrc(ipis);
   6.398  
   6.399 -    /*
   6.400 -     * Notify initiating CPU that I've grabbed the data and am
   6.401 -     * about to execute the function
   6.402 -     */
   6.403 -    mb();
   6.404 -    atomic_inc(&call_data->started);
   6.405 -    /*
   6.406 -     * At this point the info structure may be out of scope unless wait==1
   6.407 -     */
   6.408 -    (*func)(info);
   6.409 -    if (wait) {
   6.410 +    if ( call_data->wait )
   6.411 +    {
   6.412 +        (*func)(info);
   6.413          mb();
   6.414 -        atomic_inc(&call_data->finished);
   6.415 +        set_bit(smp_processor_id(), &call_data->finished);
   6.416 +    }
   6.417 +    else
   6.418 +    {
   6.419 +        mb();
   6.420 +        set_bit(smp_processor_id(), &call_data->started);
   6.421 +        (*func)(info);
   6.422      }
   6.423  }
     7.1 --- a/xen/arch/x86/x86_32/mm.c	Tue Mar 29 14:52:44 2005 +0000
     7.2 +++ b/xen/arch/x86/x86_32/mm.c	Tue Mar 29 21:10:08 2005 +0000
     7.3 @@ -180,22 +180,6 @@ void subarch_init_memory(struct domain *
     7.4      }
     7.5  }
     7.6  
     7.7 -/*
     7.8 - * Allows shooting down of borrowed page-table use on specific CPUs.
     7.9 - * Specifically, we borrow page tables when running the idle domain.
    7.10 - */
    7.11 -static void __synchronise_pagetables(void *mask)
    7.12 -{
    7.13 -    struct exec_domain *ed = current;
    7.14 -    if ( ((unsigned long)mask & (1 << ed->processor)) &&
    7.15 -         is_idle_task(ed->domain) )
    7.16 -        write_ptbase(ed);
    7.17 -}
    7.18 -void synchronise_pagetables(unsigned long cpu_mask)
    7.19 -{
    7.20 -    __synchronise_pagetables((void *)cpu_mask);
    7.21 -    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
    7.22 -}
    7.23  
    7.24  long do_stack_switch(unsigned long ss, unsigned long esp)
    7.25  {
     8.1 --- a/xen/arch/x86/x86_64/mm.c	Tue Mar 29 14:52:44 2005 +0000
     8.2 +++ b/xen/arch/x86/x86_64/mm.c	Tue Mar 29 21:10:08 2005 +0000
     8.3 @@ -236,23 +236,6 @@ void subarch_init_memory(struct domain *
     8.4      }
     8.5  }
     8.6  
     8.7 -/*
     8.8 - * Allows shooting down of borrowed page-table use on specific CPUs.
     8.9 - * Specifically, we borrow page tables when running the idle domain.
    8.10 - */
    8.11 -static void __synchronise_pagetables(void *mask)
    8.12 -{
    8.13 -    struct exec_domain *ed = current;
    8.14 -    if ( ((unsigned long)mask & (1 << ed->processor)) &&
    8.15 -         is_idle_task(ed->domain) )
    8.16 -        write_ptbase(ed);
    8.17 -}
    8.18 -void synchronise_pagetables(unsigned long cpu_mask)
    8.19 -{
    8.20 -    __synchronise_pagetables((void *)cpu_mask);
    8.21 -    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
    8.22 -}
    8.23 -
    8.24  long do_stack_switch(unsigned long ss, unsigned long esp)
    8.25  {
    8.26      if ( (ss & 3) != 3 )
     9.1 --- a/xen/common/dom0_ops.c	Tue Mar 29 14:52:44 2005 +0000
     9.2 +++ b/xen/common/dom0_ops.c	Tue Mar 29 21:10:08 2005 +0000
     9.3 @@ -266,7 +266,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
     9.4          else
     9.5          {
     9.6              exec_domain_pause(ed);
     9.7 -            synchronise_pagetables(~0UL);
     9.8              if ( ed->processor != (cpu % smp_num_cpus) )
     9.9                  set_bit(EDF_MIGRATED, &ed->ed_flags);
    9.10              set_bit(EDF_CPUPINNED, &ed->ed_flags);
    10.1 --- a/xen/common/page_alloc.c	Tue Mar 29 14:52:44 2005 +0000
    10.2 +++ b/xen/common/page_alloc.c	Tue Mar 29 21:10:08 2005 +0000
    10.3 @@ -534,8 +534,6 @@ void free_domheap_pages(struct pfn_info 
    10.4  {
    10.5      int            i, drop_dom_ref;
    10.6      struct domain *d = page_get_owner(pg);
    10.7 -    struct exec_domain *ed;
    10.8 -    int cpu_mask = 0;
    10.9  
   10.10      ASSERT(!in_irq());
   10.11  
   10.12 @@ -557,14 +555,11 @@ void free_domheap_pages(struct pfn_info 
   10.13          /* NB. May recursively lock from domain_relinquish_memory(). */
   10.14          spin_lock_recursive(&d->page_alloc_lock);
   10.15  
   10.16 -        for_each_exec_domain ( d, ed )
   10.17 -            cpu_mask |= 1 << ed->processor;
   10.18 -
   10.19          for ( i = 0; i < (1 << order); i++ )
   10.20          {
   10.21              ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
   10.22              pg[i].tlbflush_timestamp  = tlbflush_current_time();
   10.23 -            pg[i].u.free.cpu_mask     = cpu_mask;
   10.24 +            pg[i].u.free.cpu_mask     = d->cpuset;
   10.25              list_del(&pg[i].list);
   10.26          }
   10.27  
    11.1 --- a/xen/common/schedule.c	Tue Mar 29 14:52:44 2005 +0000
    11.2 +++ b/xen/common/schedule.c	Tue Mar 29 21:10:08 2005 +0000
    11.3 @@ -192,7 +192,6 @@ void sched_add_domain(struct exec_domain
    11.4  
    11.5  void sched_rem_domain(struct exec_domain *ed) 
    11.6  {
    11.7 -
    11.8      rem_ac_timer(&ed->timer);
    11.9      SCHED_OP(rem_task, ed);
   11.10      TRACE_3D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid, ed);
    12.1 --- a/xen/include/asm-x86/mm.h	Tue Mar 29 14:52:44 2005 +0000
    12.2 +++ b/xen/include/asm-x86/mm.h	Tue Mar 29 21:10:08 2005 +0000
    12.3 @@ -206,12 +206,6 @@ static inline int get_page_and_type(stru
    12.4  int check_descriptor(struct desc_struct *d);
    12.5  
    12.6  /*
    12.7 - * Use currently-executing domain's pagetables on the specified CPUs.
    12.8 - * i.e., stop borrowing someone else's tables if you are the idle domain.
    12.9 - */
   12.10 -void synchronise_pagetables(unsigned long cpu_mask);
   12.11 -
   12.12 -/*
   12.13   * The MPT (machine->physical mapping table) is an array of word-sized
   12.14   * values, indexed on machine frame number. It is expected that guest OSes
   12.15   * will use it to store a "physical" frame number to give the appearance of
    13.1 --- a/xen/include/public/xen.h	Tue Mar 29 14:52:44 2005 +0000
    13.2 +++ b/xen/include/public/xen.h	Tue Mar 29 21:10:08 2005 +0000
    13.3 @@ -124,11 +124,11 @@
    13.4   *   ptr[:2]  -- Machine address of new page-table base to install in MMU
    13.5   *               when in user space.
    13.6   * 
    13.7 - *   val[7:0] == MMUEXT_TLB_FLUSH:
    13.8 - *   No additional arguments.
    13.9 + *   val[7:0] == MMUEXT_TLB_FLUSH_LOCAL:
   13.10 + *   No additional arguments. Flushes local TLB.
   13.11   * 
   13.12 - *   val[7:0] == MMUEXT_INVLPG:
   13.13 - *   ptr[:2]  -- Linear address to be flushed from the TLB.
   13.14 + *   val[7:0] == MMUEXT_INVLPG_LOCAL:
   13.15 + *   ptr[:2]  -- Linear address to be flushed from the local TLB.
   13.16   * 
   13.17   *   val[7:0] == MMUEXT_FLUSH_CACHE:
   13.18   *   No additional arguments. Writes back and flushes cache contents.
   13.19 @@ -154,6 +154,12 @@
   13.20   *   val[7:0] == MMUEXT_REASSIGN_PAGE:
   13.21   *   ptr[:2]  -- A machine address within the page to be reassigned to the FD.
   13.22   *               (NB. page must currently belong to the calling domain).
   13.23 + * 
   13.24 + *   val[7:0] == MMUEXT_TLB_FLUSH_MULTI:
   13.25 + *   Flush TLBs of VCPUs specified in @mask.
   13.26 + * 
   13.27 + *   val[7:0] == MMUEXT_INVLPG_MULTI:
   13.28 + *   ptr[:2]  -- Linear address to be flushed from TLB of VCPUs in @mask.
   13.29   */
   13.30  #define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
   13.31  #define MMU_MACHPHYS_UPDATE      2 /* ptr = MA of frame to modify entry for  */
   13.32 @@ -164,8 +170,8 @@
   13.33  #define MMUEXT_PIN_L4_TABLE      3 /* ptr = MA of frame to pin               */
   13.34  #define MMUEXT_UNPIN_TABLE       4 /* ptr = MA of frame to unpin             */
   13.35  #define MMUEXT_NEW_BASEPTR       5 /* ptr = MA of new pagetable base         */
   13.36 -#define MMUEXT_TLB_FLUSH         6 /* ptr = NULL                             */
   13.37 -#define MMUEXT_INVLPG            7 /* ptr = VA to invalidate                 */
   13.38 +#define MMUEXT_TLB_FLUSH_LOCAL   6 /* ptr = NULL                             */
   13.39 +#define MMUEXT_INVLPG_LOCAL      7 /* ptr = VA to invalidate                 */
   13.40  #define MMUEXT_FLUSH_CACHE       8
   13.41  #define MMUEXT_SET_LDT           9 /* ptr = VA of table; val = # entries     */
   13.42  #define MMUEXT_SET_FOREIGNDOM   10 /* val[31:16] = dom                       */
   13.43 @@ -173,6 +179,8 @@
   13.44  #define MMUEXT_TRANSFER_PAGE    12 /* ptr = MA of frame; val[31:16] = dom    */
   13.45  #define MMUEXT_REASSIGN_PAGE    13
   13.46  #define MMUEXT_NEW_USER_BASEPTR 14
   13.47 +#define MMUEXT_TLB_FLUSH_MULTI  15 /* ptr = NULL; mask = VCPUs to flush      */
   13.48 +#define MMUEXT_INVLPG_MULTI     16 /* ptr = VA to inval.; mask = VCPUs       */
   13.49  #define MMUEXT_CMD_MASK        255
   13.50  #define MMUEXT_CMD_SHIFT         8
   13.51  
   13.52 @@ -180,6 +188,9 @@
   13.53  #define UVMF_FLUSH_TLB          1 /* Flush entire TLB. */
   13.54  #define UVMF_INVLPG             2 /* Flush the VA mapping being updated. */
   13.55  
   13.56 +/* Backwards source compatibility. */
   13.57 +#define MMUEXT_TLB_FLUSH        MMUEXT_TLB_FLUSH_LOCAL
   13.58 +#define MMUEXT_INVLPG           MMUEXT_INVLPG_LOCAL
   13.59  
   13.60  /*
   13.61   * Commands to HYPERVISOR_sched_op().
   13.62 @@ -257,8 +268,9 @@ typedef u16 domid_t;
   13.63   */
   13.64  typedef struct
   13.65  {
   13.66 -    memory_t ptr;    /* Machine address of PTE. */
   13.67 -    memory_t val;    /* New contents of PTE.    */
   13.68 +    memory_t ptr;       /* Machine address of PTE. */
   13.69 +    memory_t val;       /* New contents of PTE.    */
   13.70 +    /*unsigned long mask;*/ /* VCPU mask (certain extended commands). */
   13.71  } PACKED mmu_update_t;
   13.72  
   13.73  /*
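
The public-interface hunk above adds MMUEXT_TLB_FLUSH_MULTI and MMUEXT_INVLPG_MULTI alongside the renamed _LOCAL variants, with the extended command carried in val[7:0] as described in the comment block. Below is a hedged decoder-style sketch using only those constants; the VCPU mask is left as a plain parameter, since the corresponding mmu_update_t field is still commented out in this changeset, and translating a VCPU mask to physical CPUs is not shown here.

    #include <stdint.h>
    #include <stdio.h>

    #define MMUEXT_TLB_FLUSH_LOCAL   6
    #define MMUEXT_INVLPG_LOCAL      7
    #define MMUEXT_TLB_FLUSH_MULTI  15
    #define MMUEXT_INVLPG_MULTI     16
    #define MMUEXT_CMD_MASK        255

    /* Placeholders for the real flush primitives. */
    static void local_flush_tlb(void)               { printf("local flush\n"); }
    static void flush_vcpu_mask(unsigned long mask) { printf("flush %#lx\n", mask); }

    /* Dispatch on the extended command held in val[7:0]. */
    static void do_mmuext_cmd(uint64_t val, unsigned long vcpu_mask)
    {
        switch ( val & MMUEXT_CMD_MASK )
        {
        case MMUEXT_TLB_FLUSH_LOCAL:
            local_flush_tlb();
            break;
        case MMUEXT_TLB_FLUSH_MULTI:
            flush_vcpu_mask(vcpu_mask);
            break;
        /* The INVLPG variants would additionally take the address from ptr[:2]. */
        }
    }

    int main(void)
    {
        do_mmuext_cmd(MMUEXT_TLB_FLUSH_LOCAL, 0);
        do_mmuext_cmd(MMUEXT_TLB_FLUSH_MULTI, 0x3);   /* VCPUs 0 and 1 */
        return 0;
    }
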
    14.1 --- a/xen/include/xen/sched.h	Tue Mar 29 14:52:44 2005 +0000
    14.2 +++ b/xen/include/xen/sched.h	Tue Mar 29 21:10:08 2005 +0000
    14.3 @@ -143,6 +143,9 @@ struct domain
    14.4  
    14.5      struct exec_domain *exec_domain[MAX_VIRT_CPUS];
    14.6  
    14.7 +    /* Bitmask of CPUs on which this domain is running. */
    14.8 +    unsigned long cpuset;
    14.9 +
   14.10      struct arch_domain arch;
   14.11  };
   14.12  
   14.13 @@ -250,6 +253,12 @@ void init_idle_task(void);
   14.14  void domain_wake(struct exec_domain *d);
   14.15  void domain_sleep(struct exec_domain *d);
   14.16  
   14.17 +/*
   14.18 + * Force loading of currently-executing domain state on the specified set
   14.19 + * of CPUs. This is used to counteract lazy state switching where required.
   14.20 + */
   14.21 +void synchronise_lazy_execstate(unsigned long cpuset);
   14.22 +
   14.23  extern void context_switch(
   14.24      struct exec_domain *prev, 
   14.25      struct exec_domain *next);
   14.26 @@ -330,14 +339,21 @@ static inline void exec_domain_pause(str
   14.27      ASSERT(ed != current);
   14.28      atomic_inc(&ed->pausecnt);
   14.29      domain_sleep(ed);
   14.30 +    synchronise_lazy_execstate(ed->domain->cpuset & (1UL << ed->processor));
   14.31  }
   14.32  
   14.33  static inline void domain_pause(struct domain *d)
   14.34  {
   14.35      struct exec_domain *ed;
   14.36  
   14.37 -    for_each_exec_domain(d, ed)
   14.38 -        exec_domain_pause(ed);
   14.39 +    for_each_exec_domain( d, ed )
   14.40 +    {
   14.41 +        ASSERT(ed != current);
   14.42 +        atomic_inc(&ed->pausecnt);
   14.43 +        domain_sleep(ed);
   14.44 +    }
   14.45 +
   14.46 +    synchronise_lazy_execstate(d->cpuset);
   14.47  }
   14.48  
   14.49  static inline void exec_domain_unpause(struct exec_domain *ed)
   14.50 @@ -351,7 +367,7 @@ static inline void domain_unpause(struct
   14.51  {
   14.52      struct exec_domain *ed;
   14.53  
   14.54 -    for_each_exec_domain(d, ed)
   14.55 +    for_each_exec_domain( d, ed )
   14.56          exec_domain_unpause(ed);
   14.57  }
   14.58  
   14.59 @@ -361,30 +377,26 @@ static inline void exec_domain_unblock(s
   14.60          domain_wake(ed);
   14.61  }
   14.62  
   14.63 -static inline void domain_unblock(struct domain *d)
   14.64 -{
   14.65 -    struct exec_domain *ed;
   14.66 -
   14.67 -    for_each_exec_domain(d, ed)
   14.68 -        exec_domain_unblock(ed);
   14.69 -}
   14.70 -
   14.71  static inline void domain_pause_by_systemcontroller(struct domain *d)
   14.72  {
   14.73      struct exec_domain *ed;
   14.74  
   14.75 -    for_each_exec_domain(d, ed) {
   14.76 +    for_each_exec_domain ( d, ed )
   14.77 +    {
   14.78          ASSERT(ed != current);
   14.79          if ( !test_and_set_bit(EDF_CTRLPAUSE, &ed->ed_flags) )
   14.80              domain_sleep(ed);
   14.81      }
   14.82 +
   14.83 +    synchronise_lazy_execstate(d->cpuset);
   14.84  }
   14.85  
   14.86  static inline void domain_unpause_by_systemcontroller(struct domain *d)
   14.87  {
   14.88      struct exec_domain *ed;
   14.89  
   14.90 -    for_each_exec_domain(d, ed) {
   14.91 +    for_each_exec_domain ( d, ed )
   14.92 +    {
   14.93          if ( test_and_clear_bit(EDF_CTRLPAUSE, &ed->ed_flags) )
   14.94              domain_wake(ed);
   14.95      }
    15.1 --- a/xen/include/xen/smp.h	Tue Mar 29 14:52:44 2005 +0000
    15.2 +++ b/xen/include/xen/smp.h	Tue Mar 29 21:10:08 2005 +0000
    15.3 @@ -43,8 +43,10 @@ extern void smp_commence(void);
    15.4  /*
    15.5   * Call a function on all other processors
    15.6   */
    15.7 -extern int smp_call_function (void (*func) (void *info), void *info,
    15.8 -			      int retry, int wait);
    15.9 +extern int smp_call_function(
   15.10 +    void (*func) (void *info), void *info, int retry, int wait);
   15.11 +extern int smp_subset_call_function(
   15.12 +    void (*func) (void *info), void *info, int wait, unsigned long cpuset);
   15.13  
   15.14  /*
   15.15   * True once the per process idle is forked
   15.16 @@ -84,7 +86,8 @@ extern volatile int smp_msg_id;
   15.17  #define kernel_lock()
   15.18  #define cpu_logical_map(cpu)			0
   15.19  #define cpu_number_map(cpu)			0
   15.20 -#define smp_call_function(func,info,retry,wait)	({ 0; })
   15.21 +#define smp_call_function(func,info,retry,wait)	0
   15.22 +#define smp_subset_call_function(f,i,w,c)	({ if ( (c&1) ) (*f)(i); 0; })
   15.23  #define cpu_online_map				1
   15.24  
   15.25  #endif