ia64/xen-unstable

changeset 18578:ba543f51c6f1

[IA64] fix XENMEM_add_to_physmap with XENMAPSPACE_mfn.

This patch fixes HVM domain save/restore.
The tools stack relies on the guest memory map to know where memory
is populated in the guest domain, but XENMEM_add_to_physmap with
XENMAPSPACE_mfn does not update that information. So guest domain
save/dump-core fails to dump the pages which were added by the
hypercall.
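
For reference, a caller populates a guest page through this interface
roughly as follows. This is only a minimal sketch, assuming the
struct xen_add_to_physmap layout from Xen's public memory.h and a
HYPERVISOR_memory_op() wrapper as found in guest ports; it is not
code from this changeset.

    /* Definitions from Xen's public memory.h; the include path varies
     * by guest port, so no header is named here. */

    /* Map machine frame 'mfn' at guest pseudo-physical frame 'gpfn'
     * of domain 'domid'.  Returns 0 on success. */
    static int add_mfn_to_physmap(domid_t domid,
                                  unsigned long mfn, unsigned long gpfn)
    {
        struct xen_add_to_physmap xatp = {
            .domid = domid,
            .space = XENMAPSPACE_mfn,  /* interpret .idx as a raw MFN */
            .idx   = mfn,
            .gpfn  = gpfn,
        };

        /* Before this patch the p2m table was updated, but the guest
         * memmap info was not, so save/dump-core missed the page. */
        return HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
    }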

This patch makes the hypercall update the memory map information
of the given guest domain.
This introduces a race between writers and readers of that
information; a reader-side sketch of coping with it follows below.
Later a new hypercall will be introduced to get the memmap from the
guest under a lock which prevents this race. Even though the tools
stack can obtain the memmap by mapping foreign domain pages, it
should switch to that newly added hypercall once it is available.
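
Until that locked hypercall exists, a reader in the tools stack has
to cope with the race by re-checking the advertised size after
copying, roughly as sketched below. guest_memmap_info_num_pages(),
read_guest_memmap_pages(), struct guest_handle and PAGE_SHIFT are
placeholders for whatever mechanism maps the guest's shared_info and
memmap pages; none of them are part of this changeset.

    #include <stdlib.h>

    /* Fetch a self-consistent snapshot of the guest memmap info.
     * On success *buf holds the memmap pages and the caller frees it. */
    static int fetch_guest_memmap(struct guest_handle *g, void **buf)
    {
        unsigned long num_pages;
        int retry;

        do {
            num_pages = guest_memmap_info_num_pages(g); /* from shared_info */
            *buf = malloc(num_pages << PAGE_SHIFT);
            if (*buf == NULL)
                return -1;

            /* Copy the pages starting at memmap_info_pfn out of the guest. */
            if (read_guest_memmap_pages(g, *buf, num_pages) < 0) {
                free(*buf);
                return -1;
            }

            /* A writer may have grown the memmap while we copied it;
             * if the advertised size changed, discard the copy and retry. */
            retry = (guest_memmap_info_num_pages(g) != num_pages);
            if (retry)
                free(*buf);
        } while (retry);

        return 0;
    }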

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Isaku Yamahata <yamahata@valinux.co.jp>
date Fri Oct 03 12:49:04 2008 +0900 (2008-10-03)
parents 788ed94f8fe4
children 89ef37e0f4b8
files xen/arch/ia64/xen/mm.c xen/include/asm-ia64/mm.h
line diff
     1.1 --- a/xen/arch/ia64/xen/mm.c	Fri Oct 03 12:24:49 2008 +0900
     1.2 +++ b/xen/arch/ia64/xen/mm.c	Fri Oct 03 12:49:04 2008 +0900
     1.3 @@ -187,6 +187,9 @@ static void domain_page_flush_and_put(st
     1.4                                        volatile pte_t* ptep, pte_t old_pte, 
     1.5                                        struct page_info* page);
     1.6  
     1.7 +static void __xencomm_mark_dirty(struct domain *d,
     1.8 +                                 unsigned long addr, unsigned int len);
     1.9 +
    1.10  extern unsigned long ia64_iobase;
    1.11  
    1.12  struct domain *dom_xen, *dom_io;
    1.13 @@ -2141,6 +2144,329 @@ dom0vp_unexpose_foreign_p2m(struct domai
    1.14      rcu_unlock_domain(dest_dom);
    1.15      return ret;
    1.16  }
    1.17 +
     1.18 +/* This lock only needs to cover memmap_info; domain_lock() is abused for it here. */
    1.19 +static void
    1.20 +memmap_lock(struct domain *d)
    1.21 +{
    1.22 +    domain_lock(d);
    1.23 +}
    1.24 +
    1.25 +static void
    1.26 +memmap_unlock(struct domain *d)
    1.27 +{
    1.28 +    domain_unlock(d);
    1.29 +}
    1.30 +
    1.31 +/* copy memory range to domain pseudo physical address space */
    1.32 +static int
    1.33 +__memmap_copy_to(struct domain *d, unsigned long dest_gpfn,
    1.34 +               void *src, unsigned long num_pages)
    1.35 +{
    1.36 +    BUG_ON(((unsigned long)src & ~PAGE_MASK) != 0);
    1.37 +    
    1.38 +    while (num_pages > 0) {
    1.39 +        unsigned long mfn;
    1.40 +        struct page_info *page;
    1.41 +        void *virt;
    1.42 +
    1.43 +        mfn = gmfn_to_mfn_foreign(d, dest_gpfn);
    1.44 +        if (mfn == 0 || mfn == INVALID_MFN)
    1.45 +            return -EFAULT;
    1.46 +        page = mfn_to_page(mfn);
    1.47 +        if (get_page(page, d) == 0)
    1.48 +            return -EFAULT;
    1.49 +        virt = mfn_to_virt(mfn);
    1.50 +        copy_page(virt, src);
    1.51 +        __xencomm_mark_dirty(d, (unsigned long)virt, PAGE_SIZE);
    1.52 +        put_page(page);
    1.53 +
    1.54 +        src += PAGE_SIZE;
    1.55 +        dest_gpfn++;
    1.56 +        num_pages--;
    1.57 +    }
    1.58 +
    1.59 +    return 0;
    1.60 +}
    1.61 +
    1.62 +/* copy memory range from domain pseudo physical address space */
    1.63 +static int
    1.64 +__memmap_copy_from(void *dest, struct domain *d, unsigned long src_gpfn,
    1.65 +                   unsigned long num_pages)
    1.66 +{
    1.67 +    BUG_ON(((unsigned long)dest & ~PAGE_MASK) != 0);
    1.68 +
    1.69 +    while (num_pages > 0) {
    1.70 +        unsigned long mfn;
    1.71 +        struct page_info *page;
    1.72 +
    1.73 +        mfn = gmfn_to_mfn_foreign(d, src_gpfn);
    1.74 +        if (mfn == 0 || mfn == INVALID_MFN)
    1.75 +            return -EFAULT;
    1.76 +        page = mfn_to_page(mfn);
    1.77 +        if (get_page(page, d) == 0)
    1.78 +            return -EFAULT;
    1.79 +        copy_page(dest, mfn_to_virt(mfn));
    1.80 +        put_page(page);
    1.81 +
    1.82 +        dest += PAGE_SIZE;
    1.83 +        src_gpfn++;
    1.84 +        num_pages--;
    1.85 +    }
    1.86 +
    1.87 +    return 0;
    1.88 +}
    1.89 +
     1.90 +/* This function unlocks/locks memmap_lock.
     1.91 + * The caller must free (*page, *order) even in the error case,
     1.92 + * by checking whether *page is NULL.
     1.93 + */
    1.94 +static int
    1.95 +memmap_copy_from(struct domain *d,
    1.96 +                 struct page_info **page, unsigned long *order)
    1.97 +{
    1.98 +    unsigned long num_pages;
    1.99 +    struct xen_ia64_memmap_info *memmap_info;
   1.100 +    unsigned long memmap_info_pfn;
   1.101 +
   1.102 +    num_pages = d->shared_info->arch.memmap_info_num_pages;
   1.103 +    memmap_unlock(d);
   1.104 +
   1.105 + again:
   1.106 +    *order = get_order(num_pages << PAGE_SHIFT);
   1.107 +    *page = alloc_domheap_pages(NULL, *order, 0);
   1.108 +    if (*page == NULL)
   1.109 +        return -ENOMEM;
   1.110 +    memmap_info = page_to_virt(*page);
   1.111 +
   1.112 +    memmap_lock(d);
   1.113 +    if (d->shared_info->arch.memmap_info_num_pages != num_pages) {
   1.114 +        num_pages = d->shared_info->arch.memmap_info_num_pages;
   1.115 +        memmap_unlock(d);
   1.116 +        free_domheap_pages(*page, *order);
   1.117 +        goto again;
   1.118 +    }
   1.119 +    memmap_info_pfn = d->shared_info->arch.memmap_info_pfn;
   1.120 +
    1.121 +    /* copy into a local buffer so the pages become virtually contiguous */
   1.122 +    return __memmap_copy_from(memmap_info, d, memmap_info_pfn, num_pages);
   1.123 +}
   1.124 +
   1.125 +static int
   1.126 +memdesc_can_expand(const struct xen_ia64_memmap_info *memmap_info,
   1.127 +                   unsigned long num_pages)
   1.128 +{
   1.129 +    /* Is there room for one more md? */
   1.130 +    if ((num_pages << PAGE_SHIFT) <
   1.131 +        (sizeof(*memmap_info) + memmap_info->efi_memmap_size +
   1.132 +         memmap_info->efi_memdesc_size))
   1.133 +        return 0;
   1.134 +
   1.135 +    return 1;
   1.136 +}
   1.137 +
   1.138 +static int
   1.139 +memdesc_can_collapse(const efi_memory_desc_t *lhs,
   1.140 +                     const efi_memory_desc_t *rhs)
   1.141 +{
   1.142 +    return (lhs->type == rhs->type && lhs->attribute == rhs->attribute);
   1.143 +}
   1.144 +
   1.145 +static int
   1.146 +__dom0vp_add_memdesc_one(struct xen_ia64_memmap_info *memmap_info,
   1.147 +                         unsigned long num_pages,
   1.148 +                         const efi_memory_desc_t *md)
   1.149 +{
   1.150 +    void* const memmap_end = (void*)memmap_info->memdesc +
   1.151 +        memmap_info->efi_memmap_size;
   1.152 +    void *p;
   1.153 +    efi_memory_desc_t *tmp_md;
   1.154 +    efi_memory_desc_t *s_md;
   1.155 +    efi_memory_desc_t *e_md;
   1.156 +    u64 phys_addr;
   1.157 +    u64 phys_addr_end;
   1.158 +
   1.159 +    /* fast path. appending to the last entry */
   1.160 +    tmp_md = (efi_memory_desc_t*)(memmap_end - memmap_info->efi_memdesc_size);
   1.161 +    if (MD_END(tmp_md) < md->phys_addr) {
   1.162 +        /* append one */
   1.163 +        if (!memdesc_can_expand(memmap_info, num_pages))
   1.164 +            return -ENOMEM;
   1.165 +
   1.166 +        memcpy(memmap_end, md, memmap_info->efi_memdesc_size);
   1.167 +        memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
   1.168 +        return 0;
   1.169 +    }
   1.170 +    /* fast path. expand the last entry */
   1.171 +    if (tmp_md->phys_addr <= md->phys_addr) {
   1.172 +        if (!memdesc_can_collapse(tmp_md, md))
   1.173 +            return -EINVAL;
   1.174 +
   1.175 +        phys_addr_end = max(MD_END(tmp_md), MD_END(md));
   1.176 +        tmp_md->num_pages =
   1.177 +            (phys_addr_end - tmp_md->phys_addr) >> EFI_PAGE_SHIFT;
   1.178 +        return 0;
   1.179 +    }
   1.180 +
   1.181 +    /* slow path */
   1.182 +    s_md = NULL;
   1.183 +    e_md = NULL;
   1.184 +    for (p = memmap_info->memdesc;
   1.185 +         p < memmap_end;
   1.186 +         p += memmap_info->efi_memdesc_size) {
   1.187 +        tmp_md = p;
   1.188 +
   1.189 +        if (MD_END(tmp_md) < md->phys_addr)
   1.190 +            continue;
   1.191 +
   1.192 +        if (MD_END(md) < tmp_md->phys_addr) {
   1.193 +            if (s_md == NULL) {
   1.194 +                void *next_md = p + memmap_info->efi_memdesc_size;
   1.195 +                size_t left_size = memmap_end - (void*)tmp_md;
   1.196 +
    1.197 +                /* found a hole. just insert md here */
   1.198 +                if (!memdesc_can_expand(memmap_info, num_pages))
   1.199 +                    return -ENOMEM;
   1.200 +
   1.201 +                memmove(next_md, tmp_md, left_size);
   1.202 +                memcpy(tmp_md, md, memmap_info->efi_memdesc_size);
   1.203 +                memmap_info->efi_memmap_size += memmap_info->efi_memdesc_size;
   1.204 +                return 0;
   1.205 +            }
   1.206 +            break;
   1.207 +        }
   1.208 +
   1.209 +        if (s_md == NULL)
   1.210 +            s_md = tmp_md;
   1.211 +        e_md = tmp_md;
   1.212 +
   1.213 +        if (!memdesc_can_collapse(tmp_md, md))
   1.214 +            return -EINVAL;
   1.215 +    }
   1.216 +    BUG_ON(s_md == NULL || e_md == NULL);
   1.217 +
   1.218 +    /* collapse into one */
   1.219 +    phys_addr = min(md->phys_addr, s_md->phys_addr);
   1.220 +    phys_addr_end = max(MD_END(md), MD_END(e_md));
   1.221 +    s_md->phys_addr = phys_addr;
   1.222 +    s_md->num_pages = (phys_addr_end - phys_addr) >> EFI_PAGE_SHIFT;
   1.223 +    if (s_md != e_md) {
   1.224 +        void *next_s_md = (void*)s_md + memmap_info->efi_memdesc_size;
   1.225 +        void *next_e_md = (void*)e_md + memmap_info->efi_memdesc_size;
   1.226 +        size_t left_size = memmap_end - (void*)next_e_md;
   1.227 +
   1.228 +        memmap_info->efi_memmap_size -= (void*)e_md - (void*)s_md;
   1.229 +        if (left_size > 0)
   1.230 +            memmove(next_s_md, next_e_md, left_size);
   1.231 +    }
   1.232 +
   1.233 +    return 0;
   1.234 +}
   1.235 +
   1.236 +/*
    1.237 + * d->arch.convmem_end is mostly read-only and sometimes increased.
    1.238 + * It is protected by memmap_lock.
    1.239 + *
    1.240 + * d->arch.convmem_end is also referenced by the guest (self p2m exposure).
    1.241 + * d->shared_info.arch.memmap_info_xxx and memmap_info are
    1.242 + * referenced by the tools stack (save/dump-core/foreign p2m exposure).
   1.243 + *
   1.244 + * reader side:
   1.245 + *  - get d->arch.convmem_end (via XENMEM_maximum_gpfn)
   1.246 + *  - issue get_memmap hypercall to get memmap
   1.247 + *    In VMM
   1.248 + *    - lock memmap_lock
   1.249 + *    - copy memmap from target guest
   1.250 + *    - unlock memmap_lock
   1.251 + *    - copy memmap into tools stack address space.
   1.252 + *  - check d->shared_info.memmap_info_num_pages. try again if necessary
   1.253 + *  - get d->arch.convmem_end. try again if changed.
   1.254 + *
   1.255 + * writer side:
   1.256 + *  - lock memmap_lock
   1.257 + *  - increase d->arch.convmem_end at first if necessary
   1.258 + *  - unlock memmap_lock
   1.259 + *  - allocate memory
   1.260 + *    In fact page allocation isn't blocking, so unlock/lock isn't necessary.
   1.261 + *  - lock memmap_lock
   1.262 + *  - update memmap_info
   1.263 + *  - unlock memmap_lock
   1.264 + */
   1.265 +static int
   1.266 +__dom0vp_add_memdesc(struct domain *targ_d,
   1.267 +                     const struct xen_ia64_memmap_info *u_memmap_info,
   1.268 +                     const char *u_memmap)
   1.269 +{
   1.270 +    int ret = 0;
   1.271 +    const void* const u_memmap_end = u_memmap + u_memmap_info->efi_memmap_size;
   1.272 +    const efi_memory_desc_t *md;
   1.273 +
   1.274 +    unsigned long md_end_max;
   1.275 +    unsigned long num_pages;
   1.276 +    unsigned long order;
   1.277 +    unsigned long memmap_info_pfn;
   1.278 +
   1.279 +    struct page_info *page = NULL;
   1.280 +    struct xen_ia64_memmap_info *memmap_info;
   1.281 +    size_t unused_size;
   1.282 +
   1.283 +    const void *p;
   1.284 +
   1.285 +    /* update d->arch.convmem_end */
   1.286 +    md_end_max = 0;
   1.287 +    for (p = u_memmap; p < u_memmap_end;
   1.288 +         p += u_memmap_info->efi_memdesc_size) {
   1.289 +        md = p;
   1.290 +        if (MD_END(md) > md_end_max)
   1.291 +            md_end_max = MD_END(md);
   1.292 +    }
   1.293 +    memmap_lock(targ_d);
    1.294 +    /* convmem_end is also protected by memmap_lock */
   1.295 +    if (md_end_max > targ_d->arch.convmem_end)
   1.296 +        targ_d->arch.convmem_end = md_end_max;
   1.297 +
    1.298 +    /* memmap_copy_from() unlocks/locks memmap_lock */
   1.299 +    ret = memmap_copy_from(targ_d, &page, &order);
   1.300 +    if (ret != 0)
   1.301 +        goto out;
   1.302 +    memmap_info = page_to_virt(page);
   1.303 +    num_pages = targ_d->shared_info->arch.memmap_info_num_pages;
   1.304 +    memmap_info_pfn = targ_d->shared_info->arch.memmap_info_pfn;
   1.305 +
   1.306 +    if (memmap_info->efi_memdesc_size != u_memmap_info->efi_memdesc_size ||
   1.307 +        memmap_info->efi_memdesc_version !=
   1.308 +        u_memmap_info->efi_memdesc_version) {
   1.309 +        ret = -EINVAL;
   1.310 +        goto out;
   1.311 +    }
   1.312 +
   1.313 +    /* update memdesc */
   1.314 +    for (p = u_memmap;
   1.315 +         p < u_memmap_end;
   1.316 +         p += u_memmap_info->efi_memdesc_size) {
   1.317 +        md = p;
   1.318 +        ret = __dom0vp_add_memdesc_one(memmap_info, num_pages, md);
   1.319 +        if (ret != 0)
   1.320 +            goto out;
   1.321 +    }
   1.322 +
   1.323 +    /* zero out the unused region to avoid hypervisor bit leak */
   1.324 +    unused_size = (num_pages << PAGE_SHIFT) -
   1.325 +        (sizeof(*memmap_info) + memmap_info->efi_memmap_size);
   1.326 +    if (unused_size > 0)
   1.327 +        memset((void*)memmap_info->memdesc + memmap_info->efi_memmap_size,
   1.328 +               0, unused_size);
   1.329 +
   1.330 +    /* copy back into domain. */
   1.331 +    ret = __memmap_copy_to(targ_d, memmap_info_pfn, memmap_info, num_pages);
   1.332 +
   1.333 + out:
   1.334 +    memmap_unlock(targ_d);
   1.335 +
   1.336 +    if (page != NULL)
   1.337 +        free_domheap_pages(page, order);
   1.338 +    return ret;
   1.339 +}
   1.340  #endif
   1.341  
   1.342  // grant table host mapping
   1.343 @@ -2857,8 +3183,35 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
   1.344          case XENMAPSPACE_mfn:
   1.345          {
   1.346              if ( get_page_from_pagenr(xatp.idx, d) ) {
   1.347 +                struct xen_ia64_memmap_info memmap_info;
   1.348 +                efi_memory_desc_t md;
   1.349 +                int ret;
   1.350 +
   1.351                  mfn = xatp.idx;
   1.352                  page = mfn_to_page(mfn);
   1.353 +
   1.354 +                memmap_info.efi_memmap_size = sizeof(md);
   1.355 +                memmap_info.efi_memdesc_size = sizeof(md);
   1.356 +                memmap_info.efi_memdesc_version =
   1.357 +                    EFI_MEMORY_DESCRIPTOR_VERSION;
   1.358 +
   1.359 +                md.type = EFI_CONVENTIONAL_MEMORY;
   1.360 +                md.pad = 0;
   1.361 +                md.phys_addr = xatp.gpfn << PAGE_SHIFT;
   1.362 +                md.virt_addr = 0;
   1.363 +                md.num_pages = 1UL << (PAGE_SHIFT - EFI_PAGE_SHIFT);
   1.364 +                md.attribute = EFI_MEMORY_WB;
   1.365 +
   1.366 +                ret = __dom0vp_add_memdesc(d, &memmap_info, (char*)&md);
   1.367 +                if (ret != 0) {
   1.368 +                    put_page(page);
   1.369 +                    rcu_unlock_domain(d);
   1.370 +                    gdprintk(XENLOG_DEBUG,
   1.371 +                             "%s:%d td %d gpfn 0x%lx mfn 0x%lx ret %d\n",
   1.372 +                             __func__, __LINE__,
   1.373 +                             d->domain_id, xatp.gpfn, xatp.idx, ret);
   1.374 +                    return ret;
   1.375 +                }
   1.376              }
   1.377              break;
   1.378          }
   1.379 @@ -2982,9 +3335,9 @@ int is_iomem_page(unsigned long mfn)
   1.380      return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
   1.381  }
   1.382  
   1.383 -void xencomm_mark_dirty(unsigned long addr, unsigned int len)
   1.384 +static void __xencomm_mark_dirty(struct domain *d,
   1.385 +                                 unsigned long addr, unsigned int len)
   1.386  {
   1.387 -    struct domain *d = current->domain;
   1.388      unsigned long gpfn;
   1.389      unsigned long end_addr = addr + len;
   1.390  
   1.391 @@ -2996,6 +3349,11 @@ void xencomm_mark_dirty(unsigned long ad
   1.392      }
   1.393  }
   1.394  
   1.395 +void xencomm_mark_dirty(unsigned long addr, unsigned int len)
   1.396 +{
   1.397 +    __xencomm_mark_dirty(current->domain, addr, len);
   1.398 +}
   1.399 +
   1.400  int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
   1.401  {
   1.402      /* STUB to compile */
     2.1 --- a/xen/include/asm-ia64/mm.h	Fri Oct 03 12:24:49 2008 +0900
     2.2 +++ b/xen/include/asm-ia64/mm.h	Fri Oct 03 12:49:04 2008 +0900
     2.3 @@ -455,6 +455,7 @@ extern unsigned long dom0vp_unexpose_for
     2.4  #define foreign_p2m_destroy(d)	do { } while (0)
     2.5  #define dom0vp_expose_foreign_p2m(dest_dom, dest_gpfn, domid, buffer, flags)	(-ENOSYS)
     2.6  #define dom0vp_unexpose_foreign_p2m(dest_dom, dest_gpfn, domid)	(-ENOSYS)
     2.7 +#define __dom0vp_add_memdesc(d, memmap_info, memdesc)	(-ENOSYS)
     2.8  #endif
     2.9  
    2.10  extern volatile unsigned long *mpt_table;