ia64/xen-unstable

changeset 18523:7f1a36b834e1

x86: make GDT per-CPU

The major obstacle to supporting a significantly larger number of
physical CPUs is the use of per-CPU GDT entries: at present, x86-64
can support only up to 126 CPUs (254 with code changes to also use
the top-most GDT page). Rather than taking incremental steps here,
this change makes the GDT itself per-CPU, removing that limitation
entirely.
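The key mechanism is a reserved descriptor, PER_CPU_GDT_ENTRY, in each
CPU's private GDT: a present, ring-0, byte-granular data segment
(0x0000910000000000) whose 16-bit limit field is patched at CPU
bring-up to hold the CPU number. A minimal sketch of the idea follows;
the wrapper helper is hypothetical, but the descriptor write and the
asm are taken verbatim from the patch:

    /*
     * Illustrative sketch, not part of the patch. do_boot_cpu()
     * stores the CPU id in the descriptor's low word (the limit
     * field) when it copies the boot GDT for a secondary CPU:
     *
     *     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
     *
     * BUILD_BUG_ON(NR_CPUS > 0x10000) guards the 16-bit width of the
     * limit field. Code running on that CPU can then read the id back
     * with LSL (load segment limit), as the reworked double-fault
     * handlers below do:
     */
    static inline unsigned int cpu_from_gdt(void) /* hypothetical name */
    {
        unsigned int cpu;
        asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
        return cpu;
    }

This replaces the old scheme of deriving the CPU number from the TR
selector (reversing the __TSS() macro), whose per-CPU TSS/LDT slots in
a shared GDT were what tied the GDT's size to the CPU count.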

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Sep 22 13:46:57 2008 +0100 (2008-09-22)
parents 3c42b5ad0a4f
children 6d3b932cbeca
files xen/arch/x86/boot/wakeup.S xen/arch/x86/boot/x86_32.S xen/arch/x86/boot/x86_64.S xen/arch/x86/cpu/common.c xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/arch/x86/hvm/vmx/vmcs.c xen/arch/x86/setup.c xen/arch/x86/smpboot.c xen/arch/x86/traps.c xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_32/supervisor_mode_kernel.S xen/arch/x86/x86_32/traps.c xen/arch/x86/x86_64/mm.c xen/arch/x86/x86_64/traps.c xen/include/asm-x86/desc.h xen/include/asm-x86/ldt.h xen/include/asm-x86/page.h
--- a/xen/arch/x86/boot/wakeup.S	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/boot/wakeup.S	Mon Sep 22 13:46:57 2008 +0100
@@ -168,7 +168,7 @@ 1:
         .word   0,0,0
 lgdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
 wakeup_64:
         lgdt    lgdt_descr(%rip)
--- a/xen/arch/x86/boot/x86_32.S	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/boot/x86_32.S	Mon Sep 22 13:46:57 2008 +0100
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
 
         .align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
         .align PAGE_SIZE,0
--- a/xen/arch/x86/boot/x86_64.S	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/boot/x86_64.S	Mon Sep 22 13:46:57 2008 +0100
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack
 
         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /*     (compatibility) machine->physical mapping table lives there.       */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
         .align PAGE_SIZE, 0
--- a/xen/arch/x86/cpu/common.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/cpu/common.c	Mon Sep 22 13:46:57 2008 +0100
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
 	if (cpu_has_pat)
 		wrmsrl(MSR_IA32_CR_PAT, host_pat);
 
+	/* Install correct page table. */
+	write_ptbase(current);
+
 	*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
 	*(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
 	asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
 	CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-	/* Install correct page table. */
-	write_ptbase(current);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
--- a/xen/arch/x86/domain.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/domain.c	Mon Sep 22 13:46:57 2008 +0100
@@ -211,7 +211,6 @@ static inline int may_switch_mode(struct
 
 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -223,12 +222,8 @@ int switch_native(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
        if (d->vcpu[vcpuid])
            release_compat_l4(d->vcpu[vcpuid]);
     }
@@ -238,7 +233,6 @@ int switch_native(struct domain *d)
 
 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -250,15 +244,11 @@ int switch_compat(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
 
     domain_set_alloc_bitsize(d);
@@ -267,13 +257,10 @@ int switch_compat(struct domain *d)
 
  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -322,7 +309,12 @@ int vcpu_initialise(struct vcpu *v)
         if ( is_idle_domain(d) )
         {
             v->arch.schedule_tail = continue_idle_domain;
-            v->arch.cr3           = __pa(idle_pg_table);
+            if ( v->vcpu_id )
+                v->arch.cr3 = d->vcpu[0]->arch.cr3;
+            else if ( !*idle_vcpu )
+                v->arch.cr3 = __pa(idle_pg_table);
+            else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+                return -ENOMEM;
         }
 
         v->arch.guest_context.ctrlreg[4] =
@@ -349,8 +341,7 @@ int arch_domain_create(struct domain *d,
 #ifdef __x86_64__
     struct page_info *pg;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int i, vcpuid, pdpt_order, paging_initialised = 0;
+    int i, pdpt_order, paging_initialised = 0;
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled =
@@ -369,18 +360,6 @@ int arch_domain_create(struct domain *d,
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
 
-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)
 
     mapcache_domain_init(d);
@@ -1193,9 +1172,12 @@ static void paravirt_ctxt_switch_to(stru
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int          cpu = smp_processor_id();
+    unsigned int          i, cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct page_info     *page;
+    struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1221,14 +1203,30 @@ static void __context_switch(void)
         cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);
 
+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    page = virt_to_page(gdt);
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+    {
+        l1e_write(n->domain->arch.mm_perdomain_pt +
+                  (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                  FIRST_RESERVED_GDT_PAGE + i,
+                  l1e_from_page(page + i, __PAGE_HYPERVISOR));
+    }
+
+    if ( p->vcpu_id != n->vcpu_id )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    }
+
     write_ptbase(n);
 
     if ( p->vcpu_id != n->vcpu_id )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.base = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }
 
     if ( p->domain != n->domain )
@@ -1279,8 +1277,6 @@ void context_switch(struct vcpu *prev, s
             uint64_t efer = read_efer();
             if ( !(efer & EFER_SCE) )
                 write_efer(efer | EFER_SCE);
-            flush_tlb_one_local(GDT_VIRT_START(next) +
-                                FIRST_RESERVED_GDT_BYTE);
         }
 #endif
 
--- a/xen/arch/x86/domain_build.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/domain_build.c	Mon Sep 22 13:46:57 2008 +0100
@@ -314,24 +314,11 @@ int __init construct_dom0(
 #if defined(__x86_64__)
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
 
         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif
 
--- a/xen/arch/x86/hvm/vmx/vmcs.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c	Mon Sep 22 13:46:57 2008 +0100
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
 
     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
 
-    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
     __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
 
     __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
--- a/xen/arch/x86/setup.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/setup.c	Mon Sep 22 13:46:57 2008 +0100
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);
 
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];
 
 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
 static void __init init_idle_domain(void)
 {
     struct domain *idle_domain;
+    unsigned int i;
 
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+        idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+            l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+                          __PAGE_HYPERVISOR);
+
 }
 
 static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
     parse_video_info();
 
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
-    idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
     if ( cpu_has_efer )
         rdmsrl(MSR_EFER, this_cpu(efer));
--- a/xen/arch/x86/smpboot.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/smpboot.c	Mon Sep 22 13:46:57 2008 +0100
@@ -836,10 +836,15 @@ static int __devinit do_boot_cpu(int api
  */
 {
 	unsigned long boot_error;
+	unsigned int i;
 	int timeout;
	unsigned long start_eip;
	unsigned short nmi_high = 0, nmi_low = 0;
	struct vcpu *v;
+	struct desc_struct *gdt;
+#ifdef __x86_64__
+        struct page_info *page;
+#endif
 
	/*
	 * Save current MTRR state in case it was changed since early boot
@@ -865,6 +870,37 @@ static int __devinit do_boot_cpu(int api
	/* Debug build: detect stack overflow by setting up a guard page. */
	memguard_guard_stack(stack_start.esp);
 
+	gdt = per_cpu(gdt_table, cpu);
+	if (gdt == boot_cpu_gdt_table) {
+		i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+		page = alloc_domheap_pages(NULL, i,
+					   MEMF_node(cpu_to_node(cpu)));
+		per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+		memcpy(gdt, boot_cpu_compat_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+		page = alloc_domheap_pages(NULL, i,
+					   MEMF_node(cpu_to_node(cpu)));
+		per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+		per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+		memcpy(gdt, boot_cpu_gdt_table,
+		       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+		BUILD_BUG_ON(NR_CPUS > 0x10000);
+		gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+	}
+
+	for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+		v->domain->arch.mm_perdomain_pt
+			[(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+			 FIRST_RESERVED_GDT_PAGE + i]
+			= l1e_from_page(virt_to_page(gdt) + i,
+					__PAGE_HYPERVISOR);
+
	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
--- a/xen/arch/x86/traps.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/traps.c	Mon Sep 22 13:46:57 2008 +0100
@@ -2965,13 +2965,13 @@ void set_intr_gate(unsigned int n, void 
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
--- a/xen/arch/x86/x86_32/mm.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/x86_32/mm.c	Mon Sep 22 13:46:57 2008 +0100
@@ -132,6 +132,30 @@ void __init setup_idle_pagetable(void)
                                 __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    unsigned int i;
+    struct domain *d = v->domain;
+    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+    l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+    if ( !l2_table )
+        return 0;
+
+    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+    copy_page(l2_table, idle_pg_table_l2 +
+              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+                          __PAGE_HYPERVISOR);
+
+    return __pa(l3_table);
+}
+
 void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
 {
     int i;
@@ -186,7 +210,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }
--- a/xen/arch/x86/x86_32/supervisor_mode_kernel.S	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S	Mon Sep 22 13:46:57 2008 +0100
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occured.
 
-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
-        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx
 
         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
--- a/xen/arch/x86/x86_32/traps.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/x86_32/traps.c	Mon Sep 22 13:46:57 2008 +0100
@@ -194,13 +194,15 @@ static unsigned char doublefault_stack[D
 
 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -325,7 +327,7 @@ void __devinit subarch_percpu_traps_init
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);
 
     set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
--- a/xen/arch/x86/x86_64/mm.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/x86_64/mm.c	Mon Sep 22 13:46:57 2008 +0100
@@ -21,6 +21,7 @@
 #include <xen/lib.h>
 #include <xen/init.h>
 #include <xen/mm.h>
+#include <xen/numa.h>
 #include <xen/sched.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -206,6 +207,24 @@ void __init setup_idle_pagetable(void)
                   __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *page = alloc_domheap_page(NULL,
+                                                MEMF_node(vcpu_to_node(v)));
+    l4_pgentry_t *l4_table = page_to_virt(page);
+
+    if ( !page )
+        return 0;
+
+    copy_page(l4_table, idle_pg_table);
+    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+                      __PAGE_HYPERVISOR);
+
+    return __pa(l4_table);
+}
+
 void __init zap_low_mappings(void)
 {
     BUG_ON(num_online_cpus() != 1);
--- a/xen/arch/x86/x86_64/traps.c	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/arch/x86/x86_64/traps.c	Mon Sep 22 13:46:57 2008 +0100
@@ -213,15 +213,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm volatile ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();
--- a/xen/include/asm-x86/desc.h	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/include/asm-x86/desc.h	Mon Sep 22 13:46:57 2008 +0100
@@ -34,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS   FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS   FLAT_COMPAT_RING3_SS
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
 
 #elif defined(__i386__)
 
@@ -51,17 +49,15 @@
 
 #define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
 
 #endif
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
 
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {
 
 #endif
 
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+	unsigned short limit;
+	unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif
 
 extern void set_intr_gate(unsigned int irq, void * addr);
--- a/xen/include/asm-x86/ldt.h	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/include/asm-x86/ldt.h	Mon Sep 22 13:46:57 2008 +0100
@@ -6,7 +6,6 @@
 
 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
     struct desc_struct *desc;
     unsigned long ents;
 
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu 
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-               + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+               + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }
 
--- a/xen/include/asm-x86/page.h	Mon Sep 22 13:41:07 2008 +0100
+++ b/xen/include/asm-x86/page.h	Mon Sep 22 13:46:57 2008 +0100
@@ -278,6 +278,7 @@ extern unsigned int   m2p_compat_vstart;
 #endif
 void paging_init(void);
 void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
 #endif /* !defined(__ASSEMBLY__) */
 
 #define _PAGE_PRESENT  0x001U