ia64/xen-unstable

changeset 3631:677cb76cff18

bitkeeper revision 1.1159.212.78 (4202391ehUS0T4TJglUpPqBH3oGjNQ)

Move domain builder to be subarch-specific. Fix pfn_info structure and
page reference-counting to be 64-bit clean.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@scramble.cl.cam.ac.uk
date Thu Feb 03 14:45:50 2005 +0000 (2005-02-03)
parents d55d523078f7
children 1c43dbcfc46f
files .rootkeys xen/arch/x86/domain.c xen/arch/x86/memory.c xen/arch/x86/shadow.c xen/arch/x86/x86_32/domain_build.c xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_64/domain_build.c xen/arch/x86/x86_64/mm.c xen/common/page_alloc.c xen/include/asm-x86/mm.h xen/include/asm-x86/shadow.h
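
Note on the accessor change: this changeset replaces every direct access to page->u.inuse.domain with page_get_owner()/page_set_owner(). The committed definitions live in xen/include/asm-x86/mm.h (that hunk is not shown below); the following is only a sketch of the idea, with the u32 _domain field inferred from the offsetof()/sizeof() checks in xen/arch/x86/x86_32/mm.c and the packing macros assumed for illustration:

    /* Sketch, not the committed code. On 64-bit builds a plain cast would
     * truncate the pointer, so the real encoding must be a reversible
     * 32-bit value (e.g. an offset from a known base). */
    #define PICKLE_DOMPTR(d)    ((u32)(unsigned long)(d))
    #define UNPICKLE_DOMPTR(x)  ((struct domain *)(unsigned long)(x))

    #define page_get_owner(_p)     (UNPICKLE_DOMPTR((_p)->u.inuse._domain))
    #define page_set_owner(_p,_d)  ((_p)->u.inuse._domain = PICKLE_DOMPTR(_d))

Hiding the representation behind these accessors is what makes the reference-counting code 64-bit clean: callers never see how the owner pointer is stored.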
line diff
     1.1 --- a/.rootkeys	Thu Feb 03 13:07:34 2005 +0000
     1.2 +++ b/.rootkeys	Thu Feb 03 14:45:50 2005 +0000
     1.3 @@ -897,6 +897,7 @@ 41c0c411ODt8uEmV-yUxpQLpqimE5Q xen/arch/
     1.4  41f97ef5139vN42cOYHfX_Ac8WOOjA xen/arch/x86/vmx_platform.c
     1.5  41c0c4128URE0dxcO15JME_MuKBPfg xen/arch/x86/vmx_vmcs.c
     1.6  419cbedeQDg8IrO3izo3o5rQNlo0kQ xen/arch/x86/x86_32/asm-offsets.c
     1.7 +4202391dkvdTZ8GhWXe3Gqf9EOgWXg xen/arch/x86/x86_32/domain_build.c
     1.8  3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c
     1.9  3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S
    1.10  3ddb79bcHwuCQDjBICDTSis52hWguw xen/arch/x86/x86_32/mm.c
    1.11 @@ -905,6 +906,7 @@ 42000d3ckiFc1qxa4AWqsd0t3lxuyw xen/arch/
    1.12  3ddb79bc4nTpGQOe6_-MbyZzkhlhFQ xen/arch/x86/x86_32/usercopy.c
    1.13  3ddb79bcOMCu9-5mKpjIh5d0qqBDPg xen/arch/x86/x86_32/xen.lds
    1.14  41bf1717Ty3hwN3E9swdu8QfnvGqww xen/arch/x86/x86_64/asm-offsets.c
    1.15 +4202391dA91ZovYX9d_5zJi9yGvLoQ xen/arch/x86/x86_64/domain_build.c
    1.16  40e96d3aLDI-nViMuYneD7VKYlZrVg xen/arch/x86/x86_64/entry.S
    1.17  41bf1717XhPz_dNT5OKSjgmbFuWBuA xen/arch/x86/x86_64/mm.c
    1.18  42000d3cMb8o1WuFBXC07c8i3lPZBw xen/arch/x86/x86_64/traps.c
     2.1 --- a/xen/arch/x86/domain.c	Thu Feb 03 13:07:34 2005 +0000
     2.2 +++ b/xen/arch/x86/domain.c	Thu Feb 03 14:45:50 2005 +0000
     2.3 @@ -43,20 +43,6 @@
     2.4  static int opt_noreboot = 0;
     2.5  boolean_param("noreboot", opt_noreboot);
     2.6  
     2.7 -#if !defined(CONFIG_X86_64BITMODE)
     2.8 -/* No ring-3 access in initial page tables. */
     2.9 -#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
    2.10 -#else
    2.11 -/* Allow ring-3 access in long mode as guest cannot use ring 1. */
    2.12 -#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
    2.13 -#endif
    2.14 -#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    2.15 -#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    2.16 -#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    2.17 -
    2.18 -#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
    2.19 -#define round_pgdown(_p)  ((_p)&PAGE_MASK)
    2.20 -
    2.21  static void default_idle(void)
    2.22  {
    2.23      __cli();
    2.24 @@ -795,364 +781,3 @@ void domain_relinquish_memory(struct dom
    2.25      relinquish_list(d, &d->page_list);
    2.26  }
    2.27  
    2.28 -
    2.29 -int construct_dom0(struct domain *p, 
    2.30 -                   unsigned long alloc_start,
    2.31 -                   unsigned long alloc_end,
    2.32 -                   char *image_start, unsigned long image_len, 
    2.33 -                   char *initrd_start, unsigned long initrd_len,
    2.34 -                   char *cmdline)
    2.35 -{
    2.36 -    char *dst;
    2.37 -    int i, rc;
    2.38 -    unsigned long pfn, mfn;
    2.39 -    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    2.40 -    unsigned long nr_pt_pages;
    2.41 -    unsigned long count;
    2.42 -    l2_pgentry_t *l2tab, *l2start;
    2.43 -    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    2.44 -    struct pfn_info *page = NULL;
    2.45 -    start_info_t *si;
    2.46 -    struct exec_domain *ed = p->exec_domain[0];
    2.47 -
    2.48 -    /*
    2.49 -     * This fully describes the memory layout of the initial domain. All 
    2.50 -     * *_start address are page-aligned, except v_start (and v_end) which are 
    2.51 -     * superpage-aligned.
    2.52 -     */
    2.53 -    struct domain_setup_info dsi;
    2.54 -    unsigned long vinitrd_start;
    2.55 -    unsigned long vinitrd_end;
    2.56 -    unsigned long vphysmap_start;
    2.57 -    unsigned long vphysmap_end;
    2.58 -    unsigned long vstartinfo_start;
    2.59 -    unsigned long vstartinfo_end;
    2.60 -    unsigned long vstack_start;
    2.61 -    unsigned long vstack_end;
    2.62 -    unsigned long vpt_start;
    2.63 -    unsigned long vpt_end;
    2.64 -    unsigned long v_end;
    2.65 -
    2.66 -    /* Machine address of next candidate page-table page. */
    2.67 -    unsigned long mpt_alloc;
    2.68 -
    2.69 -    extern void physdev_init_dom0(struct domain *);
    2.70 -
    2.71 -    /* Sanity! */
    2.72 -    if ( p->id != 0 ) 
    2.73 -        BUG();
    2.74 -    if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) 
    2.75 -        BUG();
    2.76 -
    2.77 -    memset(&dsi, 0, sizeof(struct domain_setup_info));
    2.78 -
    2.79 -    printk("*** LOADING DOMAIN 0 ***\n");
    2.80 -
    2.81 -    /*
    2.82 -     * This is all a bit grim. We've moved the modules to the "safe" physical 
    2.83 -     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this 
    2.84 -     * routine we're going to copy it down into the region that's actually 
    2.85 -     * been allocated to domain 0. This is highly likely to be overlapping, so 
    2.86 -     * we use a forward copy.
    2.87 -     * 
    2.88 -     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with 
    2.89 -     * 4GB and lots of network/disk cards that allocate loads of buffers. 
    2.90 -     * We'll have to revisit this if we ever support PAE (64GB).
    2.91 -     */
    2.92 -
    2.93 -    rc = parseelfimage(image_start, image_len, &dsi);
    2.94 -    if ( rc != 0 )
    2.95 -        return rc;
    2.96 -
    2.97 -    /* Set up domain options */
    2.98 -    if ( dsi.use_writable_pagetables )
    2.99 -        vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
   2.100 -
   2.101 -    if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
   2.102 -    {
   2.103 -        printk("Initial guest OS must load to a page boundary.\n");
   2.104 -        return -EINVAL;
   2.105 -    }
   2.106 -
   2.107 -    /*
   2.108 -     * Why do we need this? The number of page-table frames depends on the 
   2.109 -     * size of the bootstrap address space. But the size of the address space 
   2.110 -     * depends on the number of page-table frames (since each one is mapped 
   2.111 -     * read-only). We have a pair of simultaneous equations in two unknowns, 
   2.112 -     * which we solve by exhaustive search.
   2.113 -     */
   2.114 -    vinitrd_start    = round_pgup(dsi.v_kernend);
   2.115 -    vinitrd_end      = vinitrd_start + initrd_len;
   2.116 -    vphysmap_start   = round_pgup(vinitrd_end);
   2.117 -    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
   2.118 -    vpt_start        = round_pgup(vphysmap_end);
   2.119 -    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
   2.120 -    {
   2.121 -        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
   2.122 -        vstartinfo_start = vpt_end;
   2.123 -        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
   2.124 -        vstack_start     = vstartinfo_end;
   2.125 -        vstack_end       = vstack_start + PAGE_SIZE;
   2.126 -        v_end            = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
   2.127 -        if ( (v_end - vstack_end) < (512 << 10) )
   2.128 -            v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
   2.129 -        if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> 
   2.130 -               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
   2.131 -            break;
   2.132 -    }
   2.133 -
   2.134 -    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
   2.135 -           " Kernel image:  %p->%p\n"
   2.136 -           " Initrd image:  %p->%p\n"
   2.137 -           " Dom0 alloc.:   %08lx->%08lx\n",
   2.138 -           image_start, image_start + image_len,
   2.139 -           initrd_start, initrd_start + initrd_len,
   2.140 -           alloc_start, alloc_end);
   2.141 -    printk("VIRTUAL MEMORY ARRANGEMENT:\n"
   2.142 -           " Loaded kernel: %08lx->%08lx\n"
   2.143 -           " Init. ramdisk: %08lx->%08lx\n"
   2.144 -           " Phys-Mach map: %08lx->%08lx\n"
   2.145 -           " Page tables:   %08lx->%08lx\n"
   2.146 -           " Start info:    %08lx->%08lx\n"
   2.147 -           " Boot stack:    %08lx->%08lx\n"
   2.148 -           " TOTAL:         %08lx->%08lx\n",
   2.149 -           dsi.v_kernstart, dsi.v_kernend, 
   2.150 -           vinitrd_start, vinitrd_end,
   2.151 -           vphysmap_start, vphysmap_end,
   2.152 -           vpt_start, vpt_end,
   2.153 -           vstartinfo_start, vstartinfo_end,
   2.154 -           vstack_start, vstack_end,
   2.155 -           dsi.v_start, v_end);
   2.156 -    printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
   2.157 -
   2.158 -    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
   2.159 -    {
   2.160 -        printk("Initial guest OS requires too much space\n"
   2.161 -               "(%luMB is greater than %luMB limit)\n",
   2.162 -               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
   2.163 -        return -ENOMEM;
   2.164 -    }
   2.165 -
   2.166 -    /*
   2.167 -     * Protect the lowest 1GB of memory. We use a temporary mapping there
   2.168 -     * from which we copy the kernel and ramdisk images.
   2.169 -     */
   2.170 -    if ( dsi.v_start < (1<<30) )
   2.171 -    {
   2.172 -        printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
   2.173 -        return -EINVAL;
   2.174 -    }
   2.175 -
   2.176 -    /* Paranoia: scrub DOM0's memory allocation. */
   2.177 -    printk("Scrubbing DOM0 RAM: ");
   2.178 -    dst = (char *)alloc_start;
   2.179 -    while ( dst < (char *)alloc_end )
   2.180 -    {
   2.181 -#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
   2.182 -        printk(".");
   2.183 -        touch_nmi_watchdog();
   2.184 -        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
   2.185 -        {
   2.186 -            memset(dst, 0, SCRUB_BYTES);
   2.187 -            dst += SCRUB_BYTES;
   2.188 -        }
   2.189 -        else
   2.190 -        {
   2.191 -            memset(dst, 0, (char *)alloc_end - dst);
   2.192 -            break;
   2.193 -        }
   2.194 -    }
   2.195 -    printk("done.\n");
   2.196 -
   2.197 -    /* Construct a frame-allocation list for the initial domain. */
   2.198 -    for ( mfn = (alloc_start>>PAGE_SHIFT); 
   2.199 -          mfn < (alloc_end>>PAGE_SHIFT); 
   2.200 -          mfn++ )
   2.201 -    {
   2.202 -        page = &frame_table[mfn];
   2.203 -        page->u.inuse.domain    = p;
   2.204 -        page->u.inuse.type_info = 0;
   2.205 -        page->count_info        = PGC_allocated | 1;
   2.206 -        list_add_tail(&page->list, &p->page_list);
   2.207 -        p->tot_pages++; p->max_pages++;
   2.208 -    }
   2.209 -
   2.210 -    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
   2.211 -
   2.212 -    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
   2.213 -    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
   2.214 -
   2.215 -    /*
   2.216 -     * We're basically forcing default RPLs to 1, so that our "what privilege
   2.217 -     * level are we returning to?" logic works.
   2.218 -     */
   2.219 -    ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
   2.220 -    ed->thread.event_selector    = FLAT_GUESTOS_CS;
   2.221 -    ed->thread.guestos_ss = FLAT_GUESTOS_DS;
   2.222 -    for ( i = 0; i < 256; i++ ) 
   2.223 -        ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
   2.224 -
   2.225 -    /* WARNING: The new domain must have its 'processor' field filled in! */
   2.226 -    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
   2.227 -    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
   2.228 -    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   2.229 -        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
   2.230 -    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   2.231 -        mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
   2.232 -    ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
   2.233 -
   2.234 -    l2tab += l2_table_offset(dsi.v_start);
   2.235 -    mfn = alloc_start >> PAGE_SHIFT;
   2.236 -    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
   2.237 -    {
   2.238 -        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
   2.239 -        {
   2.240 -            l1start = l1tab = (l1_pgentry_t *)mpt_alloc; 
   2.241 -            mpt_alloc += PAGE_SIZE;
   2.242 -            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
   2.243 -            clear_page(l1tab);
   2.244 -            if ( count == 0 )
   2.245 -                l1tab += l1_table_offset(dsi.v_start);
   2.246 -        }
   2.247 -        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
   2.248 -        
   2.249 -        page = &frame_table[mfn];
   2.250 -        if ( !get_page_and_type(page, p, PGT_writable_page) )
   2.251 -            BUG();
   2.252 -
   2.253 -        mfn++;
   2.254 -    }
   2.255 -
   2.256 -    /* Pages that are part of page tables must be read only. */
   2.257 -    l2tab = l2start + l2_table_offset(vpt_start);
   2.258 -    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   2.259 -    l1tab += l1_table_offset(vpt_start);
   2.260 -    l2tab++;
   2.261 -    for ( count = 0; count < nr_pt_pages; count++ ) 
   2.262 -    {
   2.263 -        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
   2.264 -        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
   2.265 -        if ( count == 0 )
   2.266 -        {
   2.267 -            page->u.inuse.type_info &= ~PGT_type_mask;
   2.268 -            page->u.inuse.type_info |= PGT_l2_page_table;
   2.269 -
   2.270 -            /*
   2.271 -             * No longer writable: decrement the type_count.
   2.272 -             * Installed as CR3: increment both the ref_count and type_count.
   2.273 -             * Net: just increment the ref_count.
   2.274 -             */
   2.275 -            get_page(page, p); /* an extra ref because of readable mapping */
   2.276 -
   2.277 -            /* Get another ref to L2 page so that it can be pinned. */
   2.278 -            if ( !get_page_and_type(page, p, PGT_l2_page_table) )
   2.279 -                BUG();
   2.280 -            set_bit(_PGT_pinned, &page->u.inuse.type_info);
   2.281 -        }
   2.282 -        else
   2.283 -        {
   2.284 -            page->u.inuse.type_info &= ~PGT_type_mask;
   2.285 -            page->u.inuse.type_info |= PGT_l1_page_table;
   2.286 -	    page->u.inuse.type_info |= 
   2.287 -		((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
   2.288 -
   2.289 -            /*
   2.290 -             * No longer writable: decrement the type_count.
   2.291 -             * This is an L1 page, installed in a validated L2 page:
   2.292 -             * increment both the ref_count and type_count.
   2.293 -             * Net: just increment the ref_count.
   2.294 -             */
   2.295 -            get_page(page, p); /* an extra ref because of readable mapping */
   2.296 -        }
   2.297 -        l1tab++;
   2.298 -        if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
   2.299 -            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   2.300 -    }
   2.301 -
   2.302 -    /* Set up shared-info area. */
   2.303 -    update_dom_time(p);
   2.304 -    p->shared_info->domain_time = 0;
   2.305 -    /* Mask all upcalls... */
   2.306 -    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
   2.307 -        p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
   2.308 -    p->shared_info->n_vcpu = smp_num_cpus;
   2.309 -
   2.310 -    /* Install the new page tables. */
   2.311 -    __cli();
   2.312 -    write_ptbase(&ed->mm);
   2.313 -
   2.314 -    /* Copy the OS image. */
   2.315 -    (void)loadelfimage(image_start);
   2.316 -
   2.317 -    /* Copy the initial ramdisk. */
   2.318 -    if ( initrd_len != 0 )
   2.319 -        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
   2.320 -    
   2.321 -    /* Set up start info area. */
   2.322 -    si = (start_info_t *)vstartinfo_start;
   2.323 -    memset(si, 0, PAGE_SIZE);
   2.324 -    si->nr_pages     = p->tot_pages;
   2.325 -    si->shared_info  = virt_to_phys(p->shared_info);
   2.326 -    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
   2.327 -    si->pt_base      = vpt_start;
   2.328 -    si->nr_pt_frames = nr_pt_pages;
   2.329 -    si->mfn_list     = vphysmap_start;
   2.330 -
   2.331 -    /* Write the phys->machine and machine->phys table entries. */
   2.332 -    for ( pfn = 0; pfn < p->tot_pages; pfn++ )
   2.333 -    {
   2.334 -        mfn = pfn + (alloc_start>>PAGE_SHIFT);
   2.335 -#ifndef NDEBUG
   2.336 -#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
   2.337 -        if ( pfn > REVERSE_START )
   2.338 -            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
   2.339 -#endif
   2.340 -        ((unsigned long *)vphysmap_start)[pfn] = mfn;
   2.341 -        machine_to_phys_mapping[mfn] = pfn;
   2.342 -    }
   2.343 -
   2.344 -    if ( initrd_len != 0 )
   2.345 -    {
   2.346 -        si->mod_start = vinitrd_start;
   2.347 -        si->mod_len   = initrd_len;
   2.348 -        printk("Initrd len 0x%lx, start at 0x%08lx\n",
   2.349 -               si->mod_len, si->mod_start);
   2.350 -    }
   2.351 -
   2.352 -    dst = si->cmd_line;
   2.353 -    if ( cmdline != NULL )
   2.354 -    {
   2.355 -        for ( i = 0; i < 255; i++ )
   2.356 -        {
   2.357 -            if ( cmdline[i] == '\0' )
   2.358 -                break;
   2.359 -            *dst++ = cmdline[i];
   2.360 -        }
   2.361 -    }
   2.362 -    *dst = '\0';
   2.363 -
   2.364 -    /* Reinstate the caller's page tables. */
   2.365 -    write_ptbase(&current->mm);
   2.366 -    __sti();
   2.367 -
   2.368 -    /* Destroy low mappings - they were only for our convenience. */
   2.369 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   2.370 -        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
   2.371 -            l2start[i] = mk_l2_pgentry(0);
   2.372 -    zap_low_mappings(); /* Do the same for the idle page tables. */
   2.373 -    
   2.374 -    /* DOM0 gets access to everything. */
   2.375 -    physdev_init_dom0(p);
   2.376 -
   2.377 -    set_bit(DF_CONSTRUCTED, &p->d_flags);
   2.378 -
   2.379 -    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
   2.380 -
   2.381 -#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
   2.382 -    shadow_lock(&p->mm);
   2.383 -    shadow_mode_enable(p, SHM_test); 
   2.384 -    shadow_unlock(&p->mm);
   2.385 -#endif
   2.386 -
   2.387 -    return 0;
   2.388 -}
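
The L1_PROT/L2_PROT definitions and the round_pgup()/round_pgdown() helpers removed above move verbatim into the two new subarch files (sections 5 and 7 below), with the #if on CONFIG_X86_64BITMODE resolved statically per file. As a worked example of the rounding macros, assuming the usual 4KB pages (PAGE_SIZE = 0x1000):

    #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
    #define round_pgdown(_p)  ((_p)&PAGE_MASK)

    /* round_pgup(0x12345)   == 0x13000   (next page boundary)     */
    /* round_pgup(0x12000)   == 0x12000   (already aligned)        */
    /* round_pgdown(0x12345) == 0x12000   (previous page boundary) */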
     3.1 --- a/xen/arch/x86/memory.c	Thu Feb 03 13:07:34 2005 +0000
     3.2 +++ b/xen/arch/x86/memory.c	Thu Feb 03 14:45:50 2005 +0000
     3.3 @@ -444,7 +444,7 @@ static void put_page_from_l1e(l1_pgentry
     3.4      if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
     3.5          return;
     3.6  
     3.7 -    e = page->u.inuse.domain;
     3.8 +    e = page_get_owner(page);
     3.9      if ( unlikely(e != d) )
    3.10      {
    3.11          /*
    3.12 @@ -493,7 +493,7 @@ static void put_page_from_l2e(l2_pgentry
    3.13  
    3.14  static int alloc_l2_table(struct pfn_info *page)
    3.15  {
    3.16 -    struct domain *d = page->u.inuse.domain;
    3.17 +    struct domain *d = page_get_owner(page);
    3.18      unsigned long  page_nr = page_to_pfn(page);
    3.19      l2_pgentry_t  *pl2e;
    3.20      int            i;
    3.21 @@ -512,7 +512,7 @@ static int alloc_l2_table(struct pfn_inf
    3.22      pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
    3.23          mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
    3.24      pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
    3.25 -        mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) | 
    3.26 +        mk_l2_pgentry(__pa(page_get_owner(page)->mm_perdomain_pt) | 
    3.27                        __PAGE_HYPERVISOR);
    3.28  #endif
    3.29  
    3.30 @@ -530,7 +530,7 @@ static int alloc_l2_table(struct pfn_inf
    3.31  
    3.32  static int alloc_l1_table(struct pfn_info *page)
    3.33  {
    3.34 -    struct domain *d = page->u.inuse.domain;
    3.35 +    struct domain *d = page_get_owner(page);
    3.36      unsigned long  page_nr = page_to_pfn(page);
    3.37      l1_pgentry_t  *pl1e;
    3.38      int            i;
    3.39 @@ -570,7 +570,7 @@ static void free_l2_table(struct pfn_inf
    3.40  
    3.41  static void free_l1_table(struct pfn_info *page)
    3.42  {
    3.43 -    struct domain *d = page->u.inuse.domain;
    3.44 +    struct domain *d = page_get_owner(page);
    3.45      unsigned long page_nr = page - frame_table;
    3.46      l1_pgentry_t *pl1e;
    3.47      int i;
    3.48 @@ -731,7 +731,7 @@ int alloc_page_type(struct pfn_info *pag
    3.49  
    3.50  void free_page_type(struct pfn_info *page, unsigned int type)
    3.51  {
    3.52 -    struct domain *d = page->u.inuse.domain;
    3.53 +    struct domain *d = page_get_owner(page);
    3.54  
    3.55      switch ( type )
    3.56      {
    3.57 @@ -774,7 +774,7 @@ void put_page_type(struct pfn_info *page
    3.58           * See domain.c:relinquish_list().
    3.59           */
    3.60          ASSERT((x & PGT_validated) || 
    3.61 -               test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
    3.62 +               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
    3.63  
    3.64          if ( unlikely((nx & PGT_count_mask) == 0) )
    3.65          {
    3.66 @@ -832,7 +832,7 @@ int get_page_type(struct pfn_info *page,
    3.67                   * may be unnecessary (e.g., page was GDT/LDT) but those
    3.68                   * circumstances should be very rare.
    3.69                   */
    3.70 -                struct domain *d = page->u.inuse.domain;
    3.71 +                struct domain *d = page_get_owner(page);
    3.72                  if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
    3.73                                           page->tlbflush_timestamp)) )
    3.74                  {
    3.75 @@ -987,7 +987,7 @@ static int do_extended_command(unsigned 
    3.76          if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
    3.77          {
    3.78              MEM_LOG("Page %08lx bad domain (dom=%p)",
    3.79 -                    ptr, page->u.inuse.domain);
    3.80 +                    ptr, page_get_owner(page));
    3.81          }
    3.82          else if ( likely(test_and_clear_bit(_PGT_pinned, 
    3.83                                              &page->u.inuse.type_info)) )
    3.84 @@ -1117,7 +1117,7 @@ static int do_extended_command(unsigned 
    3.85           * benign reference to the page (PGC_allocated). If that reference
    3.86           * disappears then the deallocation routine will safely spin.
    3.87           */
    3.88 -        nd = page->u.inuse.domain;
    3.89 +        nd = page_get_owner(page);
    3.90          y  = page->count_info;
    3.91          do {
    3.92              x = y;
    3.93 @@ -1173,7 +1173,7 @@ static int do_extended_command(unsigned 
    3.94          if ( unlikely(e->tot_pages++ == 0) )
    3.95              get_knownalive_domain(e);
    3.96          list_add_tail(&page->list, &e->page_list);
    3.97 -        page->u.inuse.domain = e;
    3.98 +        page_set_owner(page, e);
    3.99  
   3.100          spin_unlock(&e->page_alloc_lock);
   3.101  
   3.102 @@ -1229,7 +1229,7 @@ static int do_extended_command(unsigned 
   3.103           * benign reference to the page (PGC_allocated). If that reference
   3.104           * disappears then the deallocation routine will safely spin.
   3.105           */
   3.106 -        nd = page->u.inuse.domain;
   3.107 +        nd = page_get_owner(page);
   3.108          y  = page->count_info;
   3.109          do {
   3.110              x = y;
   3.111 @@ -2072,7 +2072,7 @@ void audit_domain(struct domain *d)
   3.112          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
   3.113          page = &frame_table[pfn];
   3.114  
   3.115 -        if ( page->u.inuse.domain != d )
   3.116 +        if ( page_get_owner(page) != d )
   3.117              BUG();
   3.118  
   3.119          if ( (page->u.inuse.type_info & PGT_count_mask) >
   3.120 @@ -2118,7 +2118,7 @@ void audit_domain(struct domain *d)
   3.121          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
   3.122          page = &frame_table[pfn];
   3.123  
   3.124 -        if ( page->u.inuse.domain != d )
   3.125 +        if ( page_get_owner(page) != d )
   3.126              BUG();
   3.127  
   3.128          switch ( page->u.inuse.type_info & PGT_type_mask )
   3.129 @@ -2144,10 +2144,10 @@ void audit_domain(struct domain *d)
   3.130                      unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
   3.131                      struct pfn_info *l1page = &frame_table[l1pfn];
   3.132  
   3.133 -                    if ( l1page->u.inuse.domain != d )
   3.134 +                    if ( page_get_owner(l1page) != d )
   3.135                      {
   3.136                          printk("L2: Skip bizarre page belonging to other "
   3.137 -                               "dom %p\n", l1page->u.inuse.domain);    
   3.138 +                               "dom %p\n", page_get_owner(l1page));
   3.139                          continue;
   3.140                      }
   3.141                      
   3.142 @@ -2222,12 +2222,12 @@ void audit_domain(struct domain *d)
   3.143  
   3.144                      }
   3.145  
   3.146 -                    if ( l1page->u.inuse.domain != d )
   3.147 +                    if ( page_get_owner(l1page) != d )
   3.148                      {
   3.149 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
   3.150 +                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
   3.151                                 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
   3.152                                 d->id, pfn, i,
   3.153 -                               (unsigned long)l1page->u.inuse.domain,
   3.154 +                               page_get_owner(l1page),
   3.155                                 l1pfn,
   3.156                                 l1page->count_info,
   3.157                                 l1page->u.inuse.type_info,
   3.158 @@ -2312,7 +2312,7 @@ void audit_domain(struct domain *d)
   3.159                      unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
   3.160                      struct pfn_info *l1page = &frame_table[l1pfn];
   3.161  
   3.162 -                    if ( l1page->u.inuse.domain == d)
   3.163 +                    if ( page_get_owner(l1page) == d )
   3.164                          adjust(l1page, 1, 1);
   3.165                  }
   3.166              }
   3.167 @@ -2333,7 +2333,7 @@ void audit_domain(struct domain *d)
   3.168                      unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
   3.169                      struct pfn_info *l1page = &frame_table[l1pfn];
   3.170  
   3.171 -                    if ( (l1page->u.inuse.domain != d) ||
   3.172 +                    if ( (page_get_owner(l1page) != d) ||
   3.173                           (l1pfn < 0x100) || (l1pfn > max_page) )
   3.174                          continue;
   3.175  
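
Two of the hunks above sit inside do_extended_command()'s lock-free ownership transfer: the owner is sampled once via page_get_owner(), then count_info is updated with a compare-and-swap loop, so the page cannot change hands while the benign PGC_allocated reference is being dropped. A standalone sketch of that pattern, using GCC's __sync_val_compare_and_swap in place of Xen's cmpxchg(), with the exact bit layout assumed and the failure paths elided:

    #include <stdint.h>

    #define PGC_allocated  (1u << 31)        /* assumed bit position */
    #define PGC_count_mask ((1u << 26) - 1)  /* assumed field width  */

    struct domain;
    struct pfn_info { uint32_t count_info; /* ... */ };

    /* Atomically drop the last countable reference, but only if the page
     * still looks like "allocated, refcount 1, owned by d". */
    static int drop_last_ref(struct pfn_info *page, struct domain *d,
                             struct domain *owner /* = page_get_owner(page) */)
    {
        uint32_t x, y = page->count_info;
        do {
            x = y;
            if ( ((x & (PGC_count_mask|PGC_allocated)) != (1|PGC_allocated)) ||
                 (owner != d) )
                return 0;  /* someone else holds a ref, or wrong owner */
            y = __sync_val_compare_and_swap(&page->count_info, x,
                                            x & ~PGC_count_mask);
        } while ( y != x );
        return 1;
    }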
     4.1 --- a/xen/arch/x86/shadow.c	Thu Feb 03 13:07:34 2005 +0000
     4.2 +++ b/xen/arch/x86/shadow.c	Thu Feb 03 14:45:50 2005 +0000
     4.3 @@ -420,7 +420,7 @@ static inline struct pfn_info *alloc_sha
     4.4  void unshadow_table(unsigned long gpfn, unsigned int type)
     4.5  {
     4.6      unsigned long  spfn;
     4.7 -    struct domain *d = frame_table[gpfn].u.inuse.domain;
     4.8 +    struct domain *d = page_get_owner(&frame_table[gpfn]);
     4.9  
    4.10      SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, gpfn);
    4.11  
    4.12 @@ -494,7 +494,7 @@ unsigned long shadow_l2_table(
    4.13          spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
    4.14              mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
    4.15          spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
    4.16 -            mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) |
    4.17 +            mk_l2_pgentry(__pa(page_get_owner(&frame_table[gpfn])->mm_perdomain_pt) |
    4.18  			  __PAGE_HYPERVISOR);
    4.19      }
    4.20  #endif
    4.21 @@ -924,7 +924,7 @@ int check_pagetable(struct mm_struct *m,
    4.22  
    4.23      if (m->shadow_mode != SHM_full_32) {
    4.24          if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
    4.25 -              ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | 
    4.26 +              ((__pa(page_get_owner(&frame_table[gpfn])->mm.perdomain_pt) | 
    4.27              __PAGE_HYPERVISOR))) )
    4.28              FAILPT("hypervisor per-domain map inconsistent");
    4.29      }
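
Both files above install the L2 table into its own LINEAR_PT_VIRT_START slot (and the shadow L2 into SH_LINEAR_PT_VIRT_START). That is the standard recursive page-table trick: one L2 entry points back at the L2 frame, so every L1 entry of the current address space becomes accessible through a fixed 4MB virtual window. The resulting address arithmetic on two-level x86_32, with the base value assumed for illustration:

    #define LINEAR_PT_VIRT_START 0xFC400000UL  /* assumed value */

    /* With l2[LINEAR_PT_VIRT_START >> 22] pointing at the L2 frame itself,
     * the hardware walk resolves the PTE mapping `va' to a fixed address: */
    #define linear_pte(va) \
        ((unsigned long *)(LINEAR_PT_VIRT_START + (((va) >> 12) << 2)))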
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/xen/arch/x86/x86_32/domain_build.c	Thu Feb 03 14:45:50 2005 +0000
     5.3 @@ -0,0 +1,389 @@
     5.4 +/******************************************************************************
     5.5 + * domain_build.c
     5.6 + * 
     5.7 + * Copyright (c) 2002-2005, K A Fraser
     5.8 + */
     5.9 +
    5.10 +#include <xen/config.h>
    5.11 +#include <xen/init.h>
    5.12 +#include <xen/lib.h>
    5.13 +#include <xen/sched.h>
    5.14 +#include <xen/smp.h>
    5.15 +#include <xen/delay.h>
    5.16 +#include <asm/regs.h>
    5.17 +#include <asm/system.h>
    5.18 +#include <asm/io.h>
    5.19 +#include <asm/processor.h>
    5.20 +#include <asm/desc.h>
    5.21 +#include <asm/i387.h>
    5.22 +#include <xen/event.h>
    5.23 +#include <xen/elf.h>
    5.24 +#include <xen/kernel.h>
    5.25 +
    5.26 +/* No ring-3 access in initial page tables. */
    5.27 +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
    5.28 +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    5.29 +
    5.30 +#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
    5.31 +#define round_pgdown(_p)  ((_p)&PAGE_MASK)
    5.32 +
    5.33 +int construct_dom0(struct domain *p, 
    5.34 +                   unsigned long alloc_start,
    5.35 +                   unsigned long alloc_end,
    5.36 +                   char *image_start, unsigned long image_len, 
    5.37 +                   char *initrd_start, unsigned long initrd_len,
    5.38 +                   char *cmdline)
    5.39 +{
    5.40 +    char *dst;
    5.41 +    int i, rc;
    5.42 +    unsigned long pfn, mfn;
    5.43 +    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    5.44 +    unsigned long nr_pt_pages;
    5.45 +    unsigned long count;
    5.46 +    l2_pgentry_t *l2tab, *l2start;
    5.47 +    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    5.48 +    struct pfn_info *page = NULL;
    5.49 +    start_info_t *si;
    5.50 +    struct exec_domain *ed = p->exec_domain[0];
    5.51 +
    5.52 +    /*
    5.53 +     * This fully describes the memory layout of the initial domain. All 
     5.54 +     * *_start addresses are page-aligned, except v_start (and v_end) which are 
    5.55 +     * superpage-aligned.
    5.56 +     */
    5.57 +    struct domain_setup_info dsi;
    5.58 +    unsigned long vinitrd_start;
    5.59 +    unsigned long vinitrd_end;
    5.60 +    unsigned long vphysmap_start;
    5.61 +    unsigned long vphysmap_end;
    5.62 +    unsigned long vstartinfo_start;
    5.63 +    unsigned long vstartinfo_end;
    5.64 +    unsigned long vstack_start;
    5.65 +    unsigned long vstack_end;
    5.66 +    unsigned long vpt_start;
    5.67 +    unsigned long vpt_end;
    5.68 +    unsigned long v_end;
    5.69 +
    5.70 +    /* Machine address of next candidate page-table page. */
    5.71 +    unsigned long mpt_alloc;
    5.72 +
    5.73 +    extern void physdev_init_dom0(struct domain *);
    5.74 +
    5.75 +    /* Sanity! */
    5.76 +    if ( p->id != 0 ) 
    5.77 +        BUG();
    5.78 +    if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) 
    5.79 +        BUG();
    5.80 +
    5.81 +    memset(&dsi, 0, sizeof(struct domain_setup_info));
    5.82 +
    5.83 +    printk("*** LOADING DOMAIN 0 ***\n");
    5.84 +
    5.85 +    /*
    5.86 +     * This is all a bit grim. We've moved the modules to the "safe" physical 
    5.87 +     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this 
     5.88 +     * routine we're going to copy them down into the region that's actually 
    5.89 +     * been allocated to domain 0. This is highly likely to be overlapping, so 
    5.90 +     * we use a forward copy.
    5.91 +     * 
    5.92 +     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with 
    5.93 +     * 4GB and lots of network/disk cards that allocate loads of buffers. 
    5.94 +     * We'll have to revisit this if we ever support PAE (64GB).
    5.95 +     */
    5.96 +
    5.97 +    rc = parseelfimage(image_start, image_len, &dsi);
    5.98 +    if ( rc != 0 )
    5.99 +        return rc;
   5.100 +
   5.101 +    /* Set up domain options */
   5.102 +    if ( dsi.use_writable_pagetables )
   5.103 +        vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
   5.104 +
   5.105 +    if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
   5.106 +    {
   5.107 +        printk("Initial guest OS must load to a page boundary.\n");
   5.108 +        return -EINVAL;
   5.109 +    }
   5.110 +
   5.111 +    /*
   5.112 +     * Why do we need this? The number of page-table frames depends on the 
   5.113 +     * size of the bootstrap address space. But the size of the address space 
   5.114 +     * depends on the number of page-table frames (since each one is mapped 
   5.115 +     * read-only). We have a pair of simultaneous equations in two unknowns, 
   5.116 +     * which we solve by exhaustive search.
   5.117 +     */
   5.118 +    vinitrd_start    = round_pgup(dsi.v_kernend);
   5.119 +    vinitrd_end      = vinitrd_start + initrd_len;
   5.120 +    vphysmap_start   = round_pgup(vinitrd_end);
   5.121 +    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
   5.122 +    vpt_start        = round_pgup(vphysmap_end);
   5.123 +    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
   5.124 +    {
   5.125 +        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
   5.126 +        vstartinfo_start = vpt_end;
   5.127 +        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
   5.128 +        vstack_start     = vstartinfo_end;
   5.129 +        vstack_end       = vstack_start + PAGE_SIZE;
   5.130 +        v_end            = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
   5.131 +        if ( (v_end - vstack_end) < (512 << 10) )
   5.132 +            v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
   5.133 +        if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> 
   5.134 +               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
   5.135 +            break;
   5.136 +    }
   5.137 +
   5.138 +    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
   5.139 +           " Kernel image:  %p->%p\n"
   5.140 +           " Initrd image:  %p->%p\n"
   5.141 +           " Dom0 alloc.:   %08lx->%08lx\n",
   5.142 +           image_start, image_start + image_len,
   5.143 +           initrd_start, initrd_start + initrd_len,
   5.144 +           alloc_start, alloc_end);
   5.145 +    printk("VIRTUAL MEMORY ARRANGEMENT:\n"
   5.146 +           " Loaded kernel: %08lx->%08lx\n"
   5.147 +           " Init. ramdisk: %08lx->%08lx\n"
   5.148 +           " Phys-Mach map: %08lx->%08lx\n"
   5.149 +           " Page tables:   %08lx->%08lx\n"
   5.150 +           " Start info:    %08lx->%08lx\n"
   5.151 +           " Boot stack:    %08lx->%08lx\n"
   5.152 +           " TOTAL:         %08lx->%08lx\n",
   5.153 +           dsi.v_kernstart, dsi.v_kernend, 
   5.154 +           vinitrd_start, vinitrd_end,
   5.155 +           vphysmap_start, vphysmap_end,
   5.156 +           vpt_start, vpt_end,
   5.157 +           vstartinfo_start, vstartinfo_end,
   5.158 +           vstack_start, vstack_end,
   5.159 +           dsi.v_start, v_end);
   5.160 +    printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
   5.161 +
   5.162 +    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
   5.163 +    {
   5.164 +        printk("Initial guest OS requires too much space\n"
   5.165 +               "(%luMB is greater than %luMB limit)\n",
   5.166 +               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
   5.167 +        return -ENOMEM;
   5.168 +    }
   5.169 +
   5.170 +    /*
   5.171 +     * Protect the lowest 1GB of memory. We use a temporary mapping there
   5.172 +     * from which we copy the kernel and ramdisk images.
   5.173 +     */
   5.174 +    if ( dsi.v_start < (1<<30) )
   5.175 +    {
    5.176 +        printk("Initial loading isn't allowed in the lowest 1GB of memory.\n");
   5.177 +        return -EINVAL;
   5.178 +    }
   5.179 +
   5.180 +    /* Paranoia: scrub DOM0's memory allocation. */
   5.181 +    printk("Scrubbing DOM0 RAM: ");
   5.182 +    dst = (char *)alloc_start;
   5.183 +    while ( dst < (char *)alloc_end )
   5.184 +    {
   5.185 +#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
   5.186 +        printk(".");
   5.187 +        touch_nmi_watchdog();
   5.188 +        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
   5.189 +        {
   5.190 +            memset(dst, 0, SCRUB_BYTES);
   5.191 +            dst += SCRUB_BYTES;
   5.192 +        }
   5.193 +        else
   5.194 +        {
   5.195 +            memset(dst, 0, (char *)alloc_end - dst);
   5.196 +            break;
   5.197 +        }
   5.198 +    }
   5.199 +    printk("done.\n");
   5.200 +
   5.201 +    /* Construct a frame-allocation list for the initial domain. */
   5.202 +    for ( mfn = (alloc_start>>PAGE_SHIFT); 
   5.203 +          mfn < (alloc_end>>PAGE_SHIFT); 
   5.204 +          mfn++ )
   5.205 +    {
   5.206 +        page = &frame_table[mfn];
   5.207 +        page_set_owner(page, p);
   5.208 +        page->u.inuse.type_info = 0;
   5.209 +        page->count_info        = PGC_allocated | 1;
   5.210 +        list_add_tail(&page->list, &p->page_list);
   5.211 +        p->tot_pages++; p->max_pages++;
   5.212 +    }
   5.213 +
   5.214 +    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
   5.215 +
   5.216 +    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
   5.217 +    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
   5.218 +
   5.219 +    /*
   5.220 +     * We're basically forcing default RPLs to 1, so that our "what privilege
   5.221 +     * level are we returning to?" logic works.
   5.222 +     */
   5.223 +    ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
   5.224 +    ed->thread.event_selector    = FLAT_GUESTOS_CS;
   5.225 +    ed->thread.guestos_ss = FLAT_GUESTOS_DS;
   5.226 +    for ( i = 0; i < 256; i++ ) 
   5.227 +        ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
   5.228 +
   5.229 +    /* WARNING: The new domain must have its 'processor' field filled in! */
   5.230 +    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
   5.231 +    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
   5.232 +    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   5.233 +        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
   5.234 +    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   5.235 +        mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
   5.236 +    ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
   5.237 +
   5.238 +    l2tab += l2_table_offset(dsi.v_start);
   5.239 +    mfn = alloc_start >> PAGE_SHIFT;
   5.240 +    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
   5.241 +    {
   5.242 +        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
   5.243 +        {
   5.244 +            l1start = l1tab = (l1_pgentry_t *)mpt_alloc; 
   5.245 +            mpt_alloc += PAGE_SIZE;
   5.246 +            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
   5.247 +            clear_page(l1tab);
   5.248 +            if ( count == 0 )
   5.249 +                l1tab += l1_table_offset(dsi.v_start);
   5.250 +        }
   5.251 +        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
   5.252 +        
   5.253 +        page = &frame_table[mfn];
   5.254 +        if ( !get_page_and_type(page, p, PGT_writable_page) )
   5.255 +            BUG();
   5.256 +
   5.257 +        mfn++;
   5.258 +    }
   5.259 +
   5.260 +    /* Pages that are part of page tables must be read only. */
   5.261 +    l2tab = l2start + l2_table_offset(vpt_start);
   5.262 +    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   5.263 +    l1tab += l1_table_offset(vpt_start);
   5.264 +    l2tab++;
   5.265 +    for ( count = 0; count < nr_pt_pages; count++ ) 
   5.266 +    {
   5.267 +        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
   5.268 +        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
   5.269 +        if ( count == 0 )
   5.270 +        {
   5.271 +            page->u.inuse.type_info &= ~PGT_type_mask;
   5.272 +            page->u.inuse.type_info |= PGT_l2_page_table;
   5.273 +
   5.274 +            /*
   5.275 +             * No longer writable: decrement the type_count.
   5.276 +             * Installed as CR3: increment both the ref_count and type_count.
   5.277 +             * Net: just increment the ref_count.
   5.278 +             */
   5.279 +            get_page(page, p); /* an extra ref because of readable mapping */
   5.280 +
   5.281 +            /* Get another ref to L2 page so that it can be pinned. */
   5.282 +            if ( !get_page_and_type(page, p, PGT_l2_page_table) )
   5.283 +                BUG();
   5.284 +            set_bit(_PGT_pinned, &page->u.inuse.type_info);
   5.285 +        }
   5.286 +        else
   5.287 +        {
   5.288 +            page->u.inuse.type_info &= ~PGT_type_mask;
   5.289 +            page->u.inuse.type_info |= PGT_l1_page_table;
    5.290 +            page->u.inuse.type_info |=
    5.291 +                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
   5.292 +
   5.293 +            /*
   5.294 +             * No longer writable: decrement the type_count.
   5.295 +             * This is an L1 page, installed in a validated L2 page:
   5.296 +             * increment both the ref_count and type_count.
   5.297 +             * Net: just increment the ref_count.
   5.298 +             */
   5.299 +            get_page(page, p); /* an extra ref because of readable mapping */
   5.300 +        }
   5.301 +        l1tab++;
   5.302 +        if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
   5.303 +            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   5.304 +    }
   5.305 +
   5.306 +    /* Set up shared-info area. */
   5.307 +    update_dom_time(p);
   5.308 +    p->shared_info->domain_time = 0;
   5.309 +    /* Mask all upcalls... */
   5.310 +    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
   5.311 +        p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
   5.312 +    p->shared_info->n_vcpu = smp_num_cpus;
   5.313 +
   5.314 +    /* Install the new page tables. */
   5.315 +    __cli();
   5.316 +    write_ptbase(&ed->mm);
   5.317 +
   5.318 +    /* Copy the OS image. */
   5.319 +    (void)loadelfimage(image_start);
   5.320 +
   5.321 +    /* Copy the initial ramdisk. */
   5.322 +    if ( initrd_len != 0 )
   5.323 +        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
   5.324 +    
   5.325 +    /* Set up start info area. */
   5.326 +    si = (start_info_t *)vstartinfo_start;
   5.327 +    memset(si, 0, PAGE_SIZE);
   5.328 +    si->nr_pages     = p->tot_pages;
   5.329 +    si->shared_info  = virt_to_phys(p->shared_info);
   5.330 +    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
   5.331 +    si->pt_base      = vpt_start;
   5.332 +    si->nr_pt_frames = nr_pt_pages;
   5.333 +    si->mfn_list     = vphysmap_start;
   5.334 +
   5.335 +    /* Write the phys->machine and machine->phys table entries. */
   5.336 +    for ( pfn = 0; pfn < p->tot_pages; pfn++ )
   5.337 +    {
   5.338 +        mfn = pfn + (alloc_start>>PAGE_SHIFT);
   5.339 +#ifndef NDEBUG
   5.340 +#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
   5.341 +        if ( pfn > REVERSE_START )
   5.342 +            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
   5.343 +#endif
   5.344 +        ((unsigned long *)vphysmap_start)[pfn] = mfn;
   5.345 +        machine_to_phys_mapping[mfn] = pfn;
   5.346 +    }
   5.347 +
   5.348 +    if ( initrd_len != 0 )
   5.349 +    {
   5.350 +        si->mod_start = vinitrd_start;
   5.351 +        si->mod_len   = initrd_len;
   5.352 +        printk("Initrd len 0x%lx, start at 0x%08lx\n",
   5.353 +               si->mod_len, si->mod_start);
   5.354 +    }
   5.355 +
   5.356 +    dst = si->cmd_line;
   5.357 +    if ( cmdline != NULL )
   5.358 +    {
   5.359 +        for ( i = 0; i < 255; i++ )
   5.360 +        {
   5.361 +            if ( cmdline[i] == '\0' )
   5.362 +                break;
   5.363 +            *dst++ = cmdline[i];
   5.364 +        }
   5.365 +    }
   5.366 +    *dst = '\0';
   5.367 +
   5.368 +    /* Reinstate the caller's page tables. */
   5.369 +    write_ptbase(&current->mm);
   5.370 +    __sti();
   5.371 +
   5.372 +    /* Destroy low mappings - they were only for our convenience. */
   5.373 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   5.374 +        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
   5.375 +            l2start[i] = mk_l2_pgentry(0);
   5.376 +    zap_low_mappings(); /* Do the same for the idle page tables. */
   5.377 +    
   5.378 +    /* DOM0 gets access to everything. */
   5.379 +    physdev_init_dom0(p);
   5.380 +
   5.381 +    set_bit(DF_CONSTRUCTED, &p->d_flags);
   5.382 +
   5.383 +    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
   5.384 +
   5.385 +#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
   5.386 +    shadow_lock(&p->mm);
   5.387 +    shadow_mode_enable(p, SHM_test); 
   5.388 +    shadow_unlock(&p->mm);
   5.389 +#endif
   5.390 +
   5.391 +    return 0;
   5.392 +}
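
The nr_pt_pages loop in the new builder resolves its circular dependency (a larger bootstrap address space needs more page-table frames, which themselves consume address space) by retrying with one more frame until the reservation is self-consistent. A standalone sketch with illustrative inputs shows the convergence; the constants mirror the loop above, with the start-info and boot-stack pages folded into vstack_end:

    #include <stdio.h>

    #define PAGE_SIZE 0x1000UL
    #define L2_PAGETABLE_SHIFT 22   /* one L1 page covers 4MB on x86_32 */

    int main(void)
    {
        /* Illustrative inputs: kernel+initrd+physmap end 6MB into the map. */
        unsigned long v_start = 0xC0000000UL, vpt_start = 0xC0600000UL;
        unsigned long nr_pt_pages, vpt_end, vstack_end, v_end;

        for ( nr_pt_pages = 2; ; nr_pt_pages++ )
        {
            vpt_end    = vpt_start + (nr_pt_pages * PAGE_SIZE);
            vstack_end = vpt_end + 2*PAGE_SIZE; /* start info + boot stack */
            v_end      = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
            if ( (v_end - vstack_end) < (512UL << 10) )
                v_end += 1UL << 22;
            /* One L1 frame per 4MB of [v_start,v_end), plus the L2 itself. */
            if ( (((v_end - v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
                   L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
                break;
        }
        printf("nr_pt_pages=%lu v_end=%#lx\n", nr_pt_pages, v_end);
        /* Prints: nr_pt_pages=3 v_end=0xc0800000 */
        return 0;
    }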
     6.1 --- a/xen/arch/x86/x86_32/mm.c	Thu Feb 03 13:07:34 2005 +0000
     6.2 +++ b/xen/arch/x86/x86_32/mm.c	Thu Feb 03 14:45:50 2005 +0000
     6.3 @@ -151,13 +151,13 @@ void subarch_init_memory(struct domain *
     6.4       * 64-bit operations on them. Also, just for sanity, we assert the size
     6.5       * of the structure here.
     6.6       */
     6.7 -    if ( (offsetof(struct pfn_info, u.inuse.domain) != 
     6.8 +    if ( (offsetof(struct pfn_info, u.inuse._domain) != 
     6.9            (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
    6.10           (sizeof(struct pfn_info) != 24) )
    6.11      {
    6.12          printk("Weird pfn_info layout (%ld,%ld,%d)\n",
    6.13                 offsetof(struct pfn_info, count_info),
    6.14 -               offsetof(struct pfn_info, u.inuse.domain),
    6.15 +               offsetof(struct pfn_info, u.inuse._domain),
    6.16                 sizeof(struct pfn_info));
    6.17          for ( ; ; ) ;
    6.18      }
    6.19 @@ -167,11 +167,11 @@ void subarch_init_memory(struct domain *
    6.20          idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]);
    6.21      for ( i = 0; i < 1024; i++ )
    6.22      {
    6.23 -        frame_table[m2p_start_mfn+i].count_info        = PGC_allocated | 1;
    6.24 +        frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
     6.25          /* Mark as a GDT page so that non-privileged domains can only
     6.26             map it read-only. */
    6.27          frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
    6.28 -        frame_table[m2p_start_mfn+i].u.inuse.domain    = dom_xen;
    6.29 +        page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen);
    6.30      }
    6.31  }
    6.32  
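
The assertions above pin down the x86_32 layout that the 64-bit-clean accounting relies on: _domain must directly follow count_info (so owner and reference count can be updated together through one aligned 64-bit operation) and the whole structure must stay at 24 bytes. A layout consistent with those checks follows; field names and ordering not covered by the checks are assumptions:

    #include <stdint.h>

    struct list_head { struct list_head *next, *prev; };  /* 8 bytes on x86_32 */

    struct pfn_info {
        struct list_head list;        /* offset  0 */
        uint32_t count_info;          /* offset  8 */
        union {
            struct {
                uint32_t _domain;     /* offset 12: adjacent to count_info */
                uint32_t type_info;   /* offset 16 */
            } inuse;
            /* free-list variant elided */
        } u;
        uint32_t tlbflush_timestamp;  /* offset 20 -> sizeof == 24 */
    };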
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/xen/arch/x86/x86_64/domain_build.c	Thu Feb 03 14:45:50 2005 +0000
     7.3 @@ -0,0 +1,391 @@
     7.4 +/******************************************************************************
     7.5 + * domain_build.c
     7.6 + * 
     7.7 + * Copyright (c) 2002-2005, K A Fraser
     7.8 + */
     7.9 +
    7.10 +#include <xen/config.h>
    7.11 +#include <xen/init.h>
    7.12 +#include <xen/lib.h>
    7.13 +#include <xen/sched.h>
    7.14 +#include <xen/smp.h>
    7.15 +#include <xen/delay.h>
    7.16 +#include <asm/regs.h>
    7.17 +#include <asm/system.h>
    7.18 +#include <asm/io.h>
    7.19 +#include <asm/processor.h>
    7.20 +#include <asm/desc.h>
    7.21 +#include <asm/i387.h>
    7.22 +#include <xen/event.h>
    7.23 +#include <xen/elf.h>
    7.24 +#include <xen/kernel.h>
    7.25 +
    7.26 +/* Allow ring-3 access in long mode as guest cannot use ring 1. */
    7.27 +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
    7.28 +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    7.29 +#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    7.30 +#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
    7.31 +
    7.32 +#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
    7.33 +#define round_pgdown(_p)  ((_p)&PAGE_MASK)
    7.34 +
    7.35 +int construct_dom0(struct domain *p, 
    7.36 +                   unsigned long alloc_start,
    7.37 +                   unsigned long alloc_end,
    7.38 +                   char *image_start, unsigned long image_len, 
    7.39 +                   char *initrd_start, unsigned long initrd_len,
    7.40 +                   char *cmdline)
    7.41 +{
    7.42 +    char *dst;
    7.43 +    int i, rc;
    7.44 +    unsigned long pfn, mfn;
    7.45 +    unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
    7.46 +    unsigned long nr_pt_pages;
    7.47 +    unsigned long count;
    7.48 +    l2_pgentry_t *l2tab, *l2start;
    7.49 +    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    7.50 +    struct pfn_info *page = NULL;
    7.51 +    start_info_t *si;
    7.52 +    struct exec_domain *ed = p->exec_domain[0];
    7.53 +
    7.54 +    /*
    7.55 +     * This fully describes the memory layout of the initial domain. All 
     7.56 +     * *_start addresses are page-aligned, except v_start (and v_end) which are 
    7.57 +     * superpage-aligned.
    7.58 +     */
    7.59 +    struct domain_setup_info dsi;
    7.60 +    unsigned long vinitrd_start;
    7.61 +    unsigned long vinitrd_end;
    7.62 +    unsigned long vphysmap_start;
    7.63 +    unsigned long vphysmap_end;
    7.64 +    unsigned long vstartinfo_start;
    7.65 +    unsigned long vstartinfo_end;
    7.66 +    unsigned long vstack_start;
    7.67 +    unsigned long vstack_end;
    7.68 +    unsigned long vpt_start;
    7.69 +    unsigned long vpt_end;
    7.70 +    unsigned long v_end;
    7.71 +
    7.72 +    /* Machine address of next candidate page-table page. */
    7.73 +    unsigned long mpt_alloc;
    7.74 +
    7.75 +    extern void physdev_init_dom0(struct domain *);
    7.76 +
    7.77 +    /* Sanity! */
    7.78 +    if ( p->id != 0 ) 
    7.79 +        BUG();
    7.80 +    if ( test_bit(DF_CONSTRUCTED, &p->d_flags) ) 
    7.81 +        BUG();
    7.82 +
    7.83 +    memset(&dsi, 0, sizeof(struct domain_setup_info));
    7.84 +
    7.85 +    printk("*** LOADING DOMAIN 0 ***\n");
    7.86 +
    7.87 +    /*
    7.88 +     * This is all a bit grim. We've moved the modules to the "safe" physical 
    7.89 +     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this 
     7.90 +     * routine we're going to copy them down into the region that's actually 
    7.91 +     * been allocated to domain 0. This is highly likely to be overlapping, so 
    7.92 +     * we use a forward copy.
    7.93 +     * 
    7.94 +     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with 
    7.95 +     * 4GB and lots of network/disk cards that allocate loads of buffers. 
    7.96 +     * We'll have to revisit this if we ever support PAE (64GB).
    7.97 +     */
    7.98 +
    7.99 +    rc = parseelfimage(image_start, image_len, &dsi);
   7.100 +    if ( rc != 0 )
   7.101 +        return rc;
   7.102 +
   7.103 +    /* Set up domain options */
   7.104 +    if ( dsi.use_writable_pagetables )
   7.105 +        vm_assist(p, VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
   7.106 +
   7.107 +    if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
   7.108 +    {
   7.109 +        printk("Initial guest OS must load to a page boundary.\n");
   7.110 +        return -EINVAL;
   7.111 +    }
   7.112 +
   7.113 +    /*
   7.114 +     * Why do we need this? The number of page-table frames depends on the 
   7.115 +     * size of the bootstrap address space. But the size of the address space 
   7.116 +     * depends on the number of page-table frames (since each one is mapped 
   7.117 +     * read-only). We have a pair of simultaneous equations in two unknowns, 
   7.118 +     * which we solve by exhaustive search.
   7.119 +     */
   7.120 +    vinitrd_start    = round_pgup(dsi.v_kernend);
   7.121 +    vinitrd_end      = vinitrd_start + initrd_len;
   7.122 +    vphysmap_start   = round_pgup(vinitrd_end);
   7.123 +    vphysmap_end     = vphysmap_start + (nr_pages * sizeof(unsigned long));
   7.124 +    vpt_start        = round_pgup(vphysmap_end);
   7.125 +    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
   7.126 +    {
   7.127 +        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
   7.128 +        vstartinfo_start = vpt_end;
   7.129 +        vstartinfo_end   = vstartinfo_start + PAGE_SIZE;
   7.130 +        vstack_start     = vstartinfo_end;
   7.131 +        vstack_end       = vstack_start + PAGE_SIZE;
   7.132 +        v_end            = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
   7.133 +        if ( (v_end - vstack_end) < (512 << 10) )
   7.134 +            v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
   7.135 +        if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> 
   7.136 +               L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
   7.137 +            break;
   7.138 +    }
   7.139 +
   7.140 +    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
   7.141 +           " Kernel image:  %p->%p\n"
   7.142 +           " Initrd image:  %p->%p\n"
   7.143 +           " Dom0 alloc.:   %08lx->%08lx\n",
   7.144 +           image_start, image_start + image_len,
   7.145 +           initrd_start, initrd_start + initrd_len,
   7.146 +           alloc_start, alloc_end);
   7.147 +    printk("VIRTUAL MEMORY ARRANGEMENT:\n"
   7.148 +           " Loaded kernel: %08lx->%08lx\n"
   7.149 +           " Init. ramdisk: %08lx->%08lx\n"
   7.150 +           " Phys-Mach map: %08lx->%08lx\n"
   7.151 +           " Page tables:   %08lx->%08lx\n"
   7.152 +           " Start info:    %08lx->%08lx\n"
   7.153 +           " Boot stack:    %08lx->%08lx\n"
   7.154 +           " TOTAL:         %08lx->%08lx\n",
   7.155 +           dsi.v_kernstart, dsi.v_kernend, 
   7.156 +           vinitrd_start, vinitrd_end,
   7.157 +           vphysmap_start, vphysmap_end,
   7.158 +           vpt_start, vpt_end,
   7.159 +           vstartinfo_start, vstartinfo_end,
   7.160 +           vstack_start, vstack_end,
   7.161 +           dsi.v_start, v_end);
   7.162 +    printk(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
   7.163 +
   7.164 +    if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
   7.165 +    {
   7.166 +        printk("Initial guest OS requires too much space\n"
   7.167 +               "(%luMB is greater than %luMB limit)\n",
   7.168 +               (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
   7.169 +        return -ENOMEM;
   7.170 +    }
   7.171 +
   7.172 +    /*
   7.173 +     * Protect the lowest 1GB of memory. We use a temporary mapping there
   7.174 +     * from which we copy the kernel and ramdisk images.
   7.175 +     */
   7.176 +    if ( dsi.v_start < (1<<30) )
   7.177 +    {
    7.178 +        printk("Initial guest OS must not be loaded below the 1GB boundary.\n");
   7.179 +        return -EINVAL;
   7.180 +    }
   7.181 +
   7.182 +    /* Paranoia: scrub DOM0's memory allocation. */
   7.183 +    printk("Scrubbing DOM0 RAM: ");
   7.184 +    dst = (char *)alloc_start;
   7.185 +    while ( dst < (char *)alloc_end )
   7.186 +    {
   7.187 +#define SCRUB_BYTES (100 * 1024 * 1024) /* 100MB */
   7.188 +        printk(".");
   7.189 +        touch_nmi_watchdog();
   7.190 +        if ( ((char *)alloc_end - dst) > SCRUB_BYTES )
   7.191 +        {
   7.192 +            memset(dst, 0, SCRUB_BYTES);
   7.193 +            dst += SCRUB_BYTES;
   7.194 +        }
   7.195 +        else
   7.196 +        {
   7.197 +            memset(dst, 0, (char *)alloc_end - dst);
   7.198 +            break;
   7.199 +        }
   7.200 +    }
   7.201 +    printk("done.\n");
   7.202 +
   7.203 +    /* Construct a frame-allocation list for the initial domain. */
   7.204 +    for ( mfn = (alloc_start>>PAGE_SHIFT); 
   7.205 +          mfn < (alloc_end>>PAGE_SHIFT); 
   7.206 +          mfn++ )
   7.207 +    {
   7.208 +        page = &frame_table[mfn];
   7.209 +        page_set_owner(page, p);
   7.210 +        page->u.inuse.type_info = 0;
   7.211 +        page->count_info        = PGC_allocated | 1;
   7.212 +        list_add_tail(&page->list, &p->page_list);
   7.213 +        p->tot_pages++; p->max_pages++;
   7.214 +    }
   7.215 +
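
Each dom0 frame starts life with the 'allocated' flag and a single general reference folded into the same word: count_info keeps its flags in the high bits and the reference count under PGC_count_mask. Two hypothetical accessors make the packing explicit (illustrative only, not part of this changeset):

    /* Sketch: split count_info into its flag and count parts. */
    static inline int page_is_allocated(struct pfn_info *pg)
    {
        return (pg->count_info & PGC_allocated) != 0;
    }
    static inline u32 page_refcount(struct pfn_info *pg)
    {
        return pg->count_info & PGC_count_mask;
    }
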
   7.216 +    mpt_alloc = (vpt_start - dsi.v_start) + alloc_start;
   7.217 +
   7.218 +    SET_GDT_ENTRIES(ed, DEFAULT_GDT_ENTRIES);
   7.219 +    SET_GDT_ADDRESS(ed, DEFAULT_GDT_ADDRESS);
   7.220 +
   7.221 +    /*
   7.222 +     * We're basically forcing default RPLs to 1, so that our "what privilege
   7.223 +     * level are we returning to?" logic works.
   7.224 +     */
   7.225 +    ed->thread.failsafe_selector = FLAT_GUESTOS_CS;
   7.226 +    ed->thread.event_selector    = FLAT_GUESTOS_CS;
   7.227 +    ed->thread.guestos_ss = FLAT_GUESTOS_DS;
   7.228 +    for ( i = 0; i < 256; i++ ) 
   7.229 +        ed->thread.traps[i].cs = FLAT_GUESTOS_CS;
   7.230 +
   7.231 +    /* WARNING: The new domain must have its 'processor' field filled in! */
   7.232 +    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
   7.233 +    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
   7.234 +    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   7.235 +        mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
   7.236 +    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   7.237 +        mk_l2_pgentry(__pa(p->mm_perdomain_pt) | __PAGE_HYPERVISOR);
   7.238 +    ed->mm.pagetable = mk_pagetable((unsigned long)l2start);
   7.239 +
   7.240 +    l2tab += l2_table_offset(dsi.v_start);
   7.241 +    mfn = alloc_start >> PAGE_SHIFT;
   7.242 +    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
   7.243 +    {
   7.244 +        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
   7.245 +        {
   7.246 +            l1start = l1tab = (l1_pgentry_t *)mpt_alloc; 
   7.247 +            mpt_alloc += PAGE_SIZE;
   7.248 +            *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
   7.249 +            clear_page(l1tab);
   7.250 +            if ( count == 0 )
   7.251 +                l1tab += l1_table_offset(dsi.v_start);
   7.252 +        }
   7.253 +        *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
   7.254 +        
   7.255 +        page = &frame_table[mfn];
   7.256 +        if ( !get_page_and_type(page, p, PGT_writable_page) )
   7.257 +            BUG();
   7.258 +
   7.259 +        mfn++;
   7.260 +    }
   7.261 +
    7.262 +    /* Pages that are part of page tables must be mapped read-only. */
   7.263 +    l2tab = l2start + l2_table_offset(vpt_start);
   7.264 +    l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   7.265 +    l1tab += l1_table_offset(vpt_start);
   7.266 +    l2tab++;
   7.267 +    for ( count = 0; count < nr_pt_pages; count++ ) 
   7.268 +    {
   7.269 +        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
   7.270 +        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
   7.271 +        if ( count == 0 )
   7.272 +        {
   7.273 +            page->u.inuse.type_info &= ~PGT_type_mask;
   7.274 +            page->u.inuse.type_info |= PGT_l2_page_table;
   7.275 +
   7.276 +            /*
   7.277 +             * No longer writable: decrement the type_count.
   7.278 +             * Installed as CR3: increment both the ref_count and type_count.
   7.279 +             * Net: just increment the ref_count.
   7.280 +             */
   7.281 +            get_page(page, p); /* an extra ref because of readable mapping */
   7.282 +
   7.283 +            /* Get another ref to L2 page so that it can be pinned. */
   7.284 +            if ( !get_page_and_type(page, p, PGT_l2_page_table) )
   7.285 +                BUG();
   7.286 +            set_bit(_PGT_pinned, &page->u.inuse.type_info);
   7.287 +        }
   7.288 +        else
   7.289 +        {
   7.290 +            page->u.inuse.type_info &= ~PGT_type_mask;
   7.291 +            page->u.inuse.type_info |= PGT_l1_page_table;
    7.292 +            page->u.inuse.type_info |=
    7.293 +                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
   7.294 +
   7.295 +            /*
   7.296 +             * No longer writable: decrement the type_count.
   7.297 +             * This is an L1 page, installed in a validated L2 page:
   7.298 +             * increment both the ref_count and type_count.
   7.299 +             * Net: just increment the ref_count.
   7.300 +             */
   7.301 +            get_page(page, p); /* an extra ref because of readable mapping */
   7.302 +        }
   7.303 +        l1tab++;
    7.304 +        if ( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
   7.305 +            l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
   7.306 +    }
   7.307 +
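
The "net" comments above compress a two-counter argument. Spelled out per page kind, the tally the code relies on (a restatement of the comments, not new semantics):

    L2 page (count == 0):
      writable mapping becomes read-only ......... type_count - 1
      installed as CR3 ............................ ref_count + 1, type_count + 1
      net, taken as the single get_page() ........ ref_count + 1
      pinning via get_page_and_type() ............ ref_count + 1, type_count + 1

    L1 pages (count > 0):
      writable mapping becomes read-only ......... type_count - 1
      referenced from the validated L2 ........... ref_count + 1, type_count + 1
      net, taken as the single get_page() ........ ref_count + 1
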
   7.308 +    /* Set up shared-info area. */
   7.309 +    update_dom_time(p);
   7.310 +    p->shared_info->domain_time = 0;
   7.311 +    /* Mask all upcalls... */
   7.312 +    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
   7.313 +        p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
   7.314 +    p->shared_info->n_vcpu = smp_num_cpus;
   7.315 +
   7.316 +    /* Install the new page tables. */
   7.317 +    __cli();
   7.318 +    write_ptbase(&ed->mm);
   7.319 +
   7.320 +    /* Copy the OS image. */
   7.321 +    (void)loadelfimage(image_start);
   7.322 +
   7.323 +    /* Copy the initial ramdisk. */
   7.324 +    if ( initrd_len != 0 )
   7.325 +        memcpy((void *)vinitrd_start, initrd_start, initrd_len);
   7.326 +    
   7.327 +    /* Set up start info area. */
   7.328 +    si = (start_info_t *)vstartinfo_start;
   7.329 +    memset(si, 0, PAGE_SIZE);
   7.330 +    si->nr_pages     = p->tot_pages;
   7.331 +    si->shared_info  = virt_to_phys(p->shared_info);
   7.332 +    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
   7.333 +    si->pt_base      = vpt_start;
   7.334 +    si->nr_pt_frames = nr_pt_pages;
   7.335 +    si->mfn_list     = vphysmap_start;
   7.336 +
   7.337 +    /* Write the phys->machine and machine->phys table entries. */
   7.338 +    for ( pfn = 0; pfn < p->tot_pages; pfn++ )
   7.339 +    {
   7.340 +        mfn = pfn + (alloc_start>>PAGE_SHIFT);
   7.341 +#ifndef NDEBUG
   7.342 +#define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT)
   7.343 +        if ( pfn > REVERSE_START )
   7.344 +            mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START);
   7.345 +#endif
   7.346 +        ((unsigned long *)vphysmap_start)[pfn] = mfn;
   7.347 +        machine_to_phys_mapping[mfn] = pfn;
   7.348 +    }
   7.349 +
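
In debug builds the loop deliberately reverses part of the pfn-to-mfn mapping so that any guest assumption of pfn == mfn breaks early; the two arrays nonetheless stay mutually consistent. A hypothetical invariant check (illustrative only, not in this changeset):

    /* Sketch: the phys->machine array written at vphysmap_start and
     * machine_to_phys_mapping must remain inverse mappings over all
     * of dom0's pages, whatever ordering the debug reversal picks. */
    void check_p2m_m2p(unsigned long *p2m, unsigned long *m2p,
                       unsigned long tot_pages)
    {
        unsigned long pfn;
        for ( pfn = 0; pfn < tot_pages; pfn++ )
            if ( m2p[p2m[pfn]] != pfn )
                BUG();
    }
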
   7.350 +    if ( initrd_len != 0 )
   7.351 +    {
   7.352 +        si->mod_start = vinitrd_start;
   7.353 +        si->mod_len   = initrd_len;
   7.354 +        printk("Initrd len 0x%lx, start at 0x%08lx\n",
   7.355 +               si->mod_len, si->mod_start);
   7.356 +    }
   7.357 +
   7.358 +    dst = si->cmd_line;
   7.359 +    if ( cmdline != NULL )
   7.360 +    {
   7.361 +        for ( i = 0; i < 255; i++ )
   7.362 +        {
   7.363 +            if ( cmdline[i] == '\0' )
   7.364 +                break;
   7.365 +            *dst++ = cmdline[i];
   7.366 +        }
   7.367 +    }
   7.368 +    *dst = '\0';
   7.369 +
   7.370 +    /* Reinstate the caller's page tables. */
   7.371 +    write_ptbase(&current->mm);
   7.372 +    __sti();
   7.373 +
   7.374 +    /* Destroy low mappings - they were only for our convenience. */
   7.375 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   7.376 +        if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
   7.377 +            l2start[i] = mk_l2_pgentry(0);
   7.378 +    zap_low_mappings(); /* Do the same for the idle page tables. */
   7.379 +    
   7.380 +    /* DOM0 gets access to everything. */
   7.381 +    physdev_init_dom0(p);
   7.382 +
   7.383 +    set_bit(DF_CONSTRUCTED, &p->d_flags);
   7.384 +
   7.385 +    new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
   7.386 +
   7.387 +#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
   7.388 +    shadow_lock(&p->mm);
   7.389 +    shadow_mode_enable(p, SHM_test); 
   7.390 +    shadow_unlock(&p->mm);
   7.391 +#endif
   7.392 +
   7.393 +    return 0;
   7.394 +}
     8.1 --- a/xen/arch/x86/x86_64/mm.c	Thu Feb 03 13:07:34 2005 +0000
     8.2 +++ b/xen/arch/x86/x86_64/mm.c	Thu Feb 03 14:45:50 2005 +0000
     8.3 @@ -171,6 +171,21 @@ void subarch_init_memory(struct domain *
     8.4      l3_pgentry_t l3e;
     8.5      l2_pgentry_t l2e;
     8.6  
     8.7 +    /*
     8.8 +     * We are rather picky about the layout of 'struct pfn_info'. The
     8.9 +     * count_info and domain fields must be adjacent, as we perform atomic
    8.10 +     * 64-bit operations on them.
    8.11 +     */
    8.12 +    if ( (offsetof(struct pfn_info, u.inuse._domain) != 
    8.13 +          (offsetof(struct pfn_info, count_info) + sizeof(u32))) )
    8.14 +    {
     8.15 +        printk("Weird pfn_info layout (%ld,%ld,%ld)\n",
     8.16 +               offsetof(struct pfn_info, count_info),
     8.17 +               offsetof(struct pfn_info, u.inuse._domain),
     8.18 +               (long)sizeof(struct pfn_info));
    8.19 +        for ( ; ; ) ;
    8.20 +    }
    8.21 +
    8.22      /* M2P table is mappable read-only by privileged domains. */
    8.23      for ( v  = RDWR_MPT_VIRT_START; 
    8.24            v != RDWR_MPT_VIRT_END;
    8.25 @@ -187,11 +202,11 @@ void subarch_init_memory(struct domain *
    8.26  
    8.27          for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
    8.28          {
    8.29 -            frame_table[m2p_start_mfn+i].count_info        = PGC_allocated | 1;
    8.30 +            frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
    8.31              /* gdt to make sure it's only mapped read-only by non-privileged
    8.32                 domains. */
    8.33              frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1;
    8.34 -            frame_table[m2p_start_mfn+i].u.inuse.domain    = dom_xen;
    8.35 +            page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen);
    8.36          }
    8.37      }
    8.38  }
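
The layout check above backs the assumption stated in its comment: with the pickled 32-bit owner immediately after the 32-bit count_info, the pair occupies one naturally aligned 64-bit word and can be updated with a single compare-and-exchange. A sketch of the idiom the layout enables (illustrative; it uses a GCC builtin rather than Xen's own cmpxchg wrappers, and omits the PGC_count_mask overflow checks that the real get_page() performs):

    /* Sketch: read count_info and the pickled owner as one u64
     * (x86 is little-endian, so count_info is the low half),
     * validate both, then CAS the incremented pair back. */
    static int try_get_page(struct pfn_info *page, u32 pickled_owner)
    {
        u64 *pair = (u64 *)&page->count_info;
        u64 old, new;
        do {
            old = *pair;
            if ( (u32)old == 0 ||                     /* not allocated */
                 (u32)(old >> 32) != pickled_owner )  /* wrong owner   */
                return 0;
            new = old + 1;                            /* bump count_info */
        } while ( !__sync_bool_compare_and_swap(pair, old, new) );
        return 1;
    }
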
     9.1 --- a/xen/common/page_alloc.c	Thu Feb 03 13:07:34 2005 +0000
     9.2 +++ b/xen/common/page_alloc.c	Thu Feb 03 14:45:50 2005 +0000
     9.3 @@ -418,7 +418,7 @@ unsigned long alloc_xenheap_pages(unsign
     9.4      for ( i = 0; i < (1 << order); i++ )
     9.5      {
     9.6          pg[i].count_info        = 0;
     9.7 -        pg[i].u.inuse.domain    = NULL;
     9.8 +        pg[i].u.inuse._domain   = 0;
     9.9          pg[i].u.inuse.type_info = 0;
    9.10      }
    9.11  
    9.12 @@ -501,7 +501,7 @@ struct pfn_info *alloc_domheap_pages(str
    9.13          }
    9.14  
    9.15          pg[i].count_info        = 0;
    9.16 -        pg[i].u.inuse.domain    = NULL;
    9.17 +        pg[i].u.inuse._domain   = 0;
    9.18          pg[i].u.inuse.type_info = 0;
    9.19      }
    9.20  
    9.21 @@ -529,7 +529,7 @@ struct pfn_info *alloc_domheap_pages(str
    9.22  
    9.23      for ( i = 0; i < (1 << order); i++ )
    9.24      {
    9.25 -        pg[i].u.inuse.domain = d;
    9.26 +        page_set_owner(&pg[i], d);
    9.27          wmb(); /* Domain pointer must be visible before updating refcnt. */
    9.28          pg[i].count_info |= PGC_allocated | 1;
    9.29          list_add_tail(&pg[i].list, &d->page_list);
    9.30 @@ -544,7 +544,7 @@ struct pfn_info *alloc_domheap_pages(str
    9.31  void free_domheap_pages(struct pfn_info *pg, unsigned int order)
    9.32  {
    9.33      int            i, drop_dom_ref;
    9.34 -    struct domain *d = pg->u.inuse.domain;
    9.35 +    struct domain *d = page_get_owner(pg);
    9.36      struct exec_domain *ed;
    9.37      void          *p;
    9.38      int cpu_mask = 0;
    10.1 --- a/xen/include/asm-x86/mm.h	Thu Feb 03 13:07:34 2005 +0000
    10.2 +++ b/xen/include/asm-x86/mm.h	Thu Feb 03 14:45:50 2005 +0000
    10.3 @@ -30,6 +30,9 @@ struct pfn_info
    10.4      /* Each frame can be threaded onto a doubly-linked list. */
    10.5      struct list_head list;
    10.6  
    10.7 +    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
    10.8 +    u32 tlbflush_timestamp;
    10.9 +
   10.10      /* Reference count and various PGC_xxx flags and fields. */
   10.11      u32 count_info;
   10.12  
   10.13 @@ -39,24 +42,22 @@ struct pfn_info
   10.14          /* Page is in use: ((count_info & PGC_count_mask) != 0). */
   10.15          struct {
   10.16              /* Owner of this page (NULL if page is anonymous). */
   10.17 -            struct domain *domain;
   10.18 +            u32 _domain; /* pickled format */
   10.19              /* Type reference count and various PGT_xxx flags and fields. */
   10.20              u32 type_info;
   10.21 -        } inuse;
   10.22 +        } PACKED inuse;
   10.23  
   10.24          /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
   10.25          struct {
   10.26              /* Mask of possibly-tainted TLBs. */
   10.27 -            unsigned long cpu_mask;
   10.28 +            u32 cpu_mask;
   10.29              /* Order-size of the free chunk this page is the head of. */
   10.30              u8 order;
   10.31 -        } free;
   10.32 -
   10.33 -    } u;
   10.34 +        } PACKED free;
   10.35  
   10.36 -    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
   10.37 -    u32 tlbflush_timestamp;
   10.38 -};
   10.39 +    } PACKED u;
   10.40 +
   10.41 +} PACKED;
   10.42  
   10.43   /* The following page types are MUTUALLY EXCLUSIVE. */
   10.44  #define PGT_none            (0<<29) /* no special uses of this page */
   10.45 @@ -97,9 +98,25 @@ struct pfn_info
   10.46  
   10.47  #define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < xenheap_phys_end)
   10.48  
   10.49 +#if defined(__i386__)
   10.50 +
   10.51 +#define pickle_domptr(_d)   ((u32)(unsigned long)(_d))
   10.52 +#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
   10.53 +
   10.54 +#elif defined(__x86_64__)
   10.55 +static inline struct domain *unpickle_domptr(u32 _domain)
   10.56 +{ return (_domain == 0) ? NULL : __va(_domain); }
   10.57 +static inline u32 pickle_domptr(struct domain *domain)
   10.58 +{ return (domain == NULL) ? 0 : (u32)__pa(domain); }
   10.59 +
   10.60 +#endif
   10.61 +
   10.62 +#define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
   10.63 +#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
   10.64 +
   10.65  #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                   \
   10.66      do {                                                                    \
   10.67 -        (_pfn)->u.inuse.domain = (_dom);                                    \
   10.68 +        page_set_owner((_pfn), (_dom));                                     \
   10.69          /* The incremented type count is intended to pin to 'writable'. */  \
   10.70          (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;  \
   10.71          wmb(); /* install valid domain ptr before updating refcnt. */       \
   10.72 @@ -142,7 +159,8 @@ static inline int get_page(struct pfn_in
   10.73                             struct domain *domain)
   10.74  {
   10.75      u32 x, nx, y = page->count_info;
   10.76 -    struct domain *d, *nd = page->u.inuse.domain;
   10.77 +    u32 d, nd = page->u.inuse._domain;
   10.78 +    u32 _domain = pickle_domptr(domain);
   10.79  
   10.80      do {
   10.81          x  = y;
   10.82 @@ -150,10 +168,10 @@ static inline int get_page(struct pfn_in
   10.83          d  = nd;
   10.84          if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
   10.85               unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
   10.86 -             unlikely(d != domain) )                 /* Wrong owner? */
   10.87 +             unlikely(d != _domain) )                /* Wrong owner? */
   10.88          {
   10.89              DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
   10.90 -                    page_to_pfn(page), domain, d,
   10.91 +                    page_to_pfn(page), domain, unpickle_domptr(d),
   10.92                      x, page->u.inuse.type_info);
   10.93              return 0;
   10.94          }
   10.95 @@ -198,7 +216,7 @@ static inline int get_page_and_type(stru
   10.96      ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0)
   10.97  #define ASSERT_PAGE_IS_DOMAIN(_p, _d)                          \
   10.98      ASSERT(((_p)->count_info & PGC_count_mask) != 0);          \
   10.99 -    ASSERT((_p)->u.inuse.domain == (_d))
  10.100 +    ASSERT(page_get_owner(_p) == (_d))
  10.101  
  10.102  int check_descriptor(unsigned long *d);
  10.103  
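
On i386 the owner pointer already fits in 32 bits, so pickling is a plain cast; on x86_64 the domain structure lives in the Xen heap, whose physical addresses fit in 32 bits, so __pa()/__va() do the conversion. Either way the encoding must be lossless, including for anonymous pages. A hypothetical self-check illustrating the round-trip property (not part of the changeset):

    /* Sketch: pickling then unpickling any legitimate owner,
     * including NULL (pickled as 0), must give back the pointer. */
    static void pickle_roundtrip_check(struct domain *d)
    {
        ASSERT(unpickle_domptr(pickle_domptr(d)) == d);
    }
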
    11.1 --- a/xen/include/asm-x86/shadow.h	Thu Feb 03 13:07:34 2005 +0000
    11.2 +++ b/xen/include/asm-x86/shadow.h	Thu Feb 03 14:45:50 2005 +0000
    11.3 @@ -189,7 +189,7 @@ static inline int __mark_dirty( struct m
    11.4          SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (mm %p)",
    11.5                 mfn, pfn, m->shadow_dirty_bitmap_size, m );
    11.6          SH_LOG("dom=%p caf=%08x taf=%08x\n", 
    11.7 -               frame_table[mfn].u.inuse.domain,
    11.8 +               page_get_owner(&frame_table[mfn]),
    11.9                 frame_table[mfn].count_info, 
   11.10                 frame_table[mfn].u.inuse.type_info );
   11.11      }