ia64/xen-unstable

changeset 19691:f44438bc79ac

libxc: Exchange a page for PV guest

This patch adds support for exchanging a page of a suspended PV guest
from user space.

The basic idea to offline a page is:
1) Mark the page as offline pending.
2) If the page is owned by an HVM domain, the user has to live migrate it.
In the future, with stub-domain support, we can also exchange the page
without migration.
3) If the page is owned by a PV domain, we will try to exchange the
offline pending page for a new one and free the old page.

This patch achieves item 3.

The method to exchange the offline pending page for a PV domain is:

1) Suspend the guest.
2) If the page is being granted out, return with offline pending.
3) Take a copy of the page content.
4) Scan all page table pages for references to the offending page;
if any are found, make those entries non-present to reduce the
reference count.
5) After updating all page tables, the user space tools try to exchange
the old page. If the old mfn has no other reference left (i.e.
count_info & count_mask == 1), the exchange will allocate a new page,
update the m2p and return success; otherwise it will fail.
6) If step 5 succeeds, the user space tools write the saved content to
the new page, update the p2m table, and change all entries scanned in
step 4 to point to the new page.
If step 5 fails, they try to undo step 4 to restore the page tables.
7) Resume the guest.

Please refer to thread in
http://www.mailinglistarchive.com/xen-devel@lists.xensource.com/msg63084.html
for more information.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jun 01 14:15:48 2009 +0100 (2009-06-01)
parents 6d6ec6f966cf
children 72ff07f65e16
files tools/libxc/xc_offline_page.c tools/libxc/xc_suspend.c tools/libxc/xenguest.h
line diff
     1.1 --- a/tools/libxc/xc_offline_page.c	Mon Jun 01 14:13:53 2009 +0100
     1.2 +++ b/tools/libxc/xc_offline_page.c	Mon Jun 01 14:15:48 2009 +0100
     1.3 @@ -12,12 +12,42 @@
     1.4  #include <stdlib.h>
     1.5  #include <unistd.h>
     1.6  #include <sys/time.h>
     1.7 +#include <xs.h>
     1.8 +#include <xc_core.h>
     1.9  
    1.10  #include "xc_private.h"
    1.11  #include "xc_dom.h"
    1.12  #include "xg_private.h"
    1.13  #include "xg_save_restore.h"
    1.14  
    1.15 +struct domain_mem_info{        /* per-domain state built by init_mem_info() */
    1.16 +    int domid;                  /* NOTE(review): never assigned in the code shown here */
    1.17 +    unsigned int pt_level;      /* page-table levels (2, 3 or 4) from get_pt_level() */
    1.18 +    unsigned int guest_width;   /* address_size.size / 8, from get_pt_level() */
    1.19 +    uint32_t *pfn_type;         /* p2m_size entries filled by xc_get_pfn_type_batch() */
    1.20 +    xen_pfn_t *p2m_table;       /* mapping from xc_core_arch_map_p2m_writable() */
    1.21 +    unsigned long p2m_size;     /* number of p2m entries */
    1.22 +    xen_pfn_t *m2p_table;       /* PROT_READ mapping from xc_map_m2p() */
    1.23 +    int max_mfn;                /* result of XENMEM_maximum_ram_page */
    1.24 +};
    1.25 +
    1.26 +struct pte_backup_entry        /* location of one PTE recorded by backup_ptes() */
    1.27 +{
    1.28 +    xen_pfn_t table_mfn;       /* MFN of the page-table page holding the entry */
    1.29 +    int offset;                /* index of the entry within that page */
    1.30 +};
    1.31 +
    1.32 +#define DEFAULT_BACKUP_COUNT 1024 /* initial capacity of pte_backup.entries */
    1.33 +struct pte_backup               /* growable list of recorded PTE locations */
    1.34 +{
    1.35 +    struct pte_backup_entry *entries; /* heap array, doubled by backup_ptes() when full */
    1.36 +    int max;                    /* allocated capacity of entries[] */
    1.37 +    int cur;                    /* number of entries in use */
    1.38 +};
    1.39 +
    1.40 +/* Set by init_mem_info(); presumably read by libxc macros - TODO confirm */
    1.41 +int guest_width, p2m_size;
    1.42 +
    1.43  int xc_mark_page_online(int xc, unsigned long start,
    1.44                          unsigned long end, uint32_t *status)
    1.45  {
    1.46 @@ -98,3 +128,637 @@ int xc_query_page_offline_status(int xc,
    1.47  
    1.48      return ret;
    1.49  }
    1.50 +
    1.51 +/*
    1.52 + * Non-zero if a valid grant entry names gpfn (no grant updates while paused)
    1.53 + */
    1.54 +static int xc_is_page_granted(int xc_handle, xen_pfn_t gpfn,
    1.55 +                              struct grant_entry *gnttab, int gnt_num)
    1.56 +{
    1.57 +    int idx;
    1.58 +
    1.59 +    if (gnttab == NULL)
    1.60 +        return 0;
    1.61 +
    1.62 +    for (idx = 0; idx < gnt_num; idx++)
    1.63 +        if ( (gnttab[idx].frame == gpfn) &&
    1.64 +             ((gnttab[idx].flags & GTF_type_mask) != GTF_invalid) )
    1.65 +            break;
    1.66 +
    1.67 +    return (idx != gnt_num);
    1.68 +}
    1.69 +
    1.70 +static xen_pfn_t pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m, int gwidth)
    1.71 +{
    1.72 +    uint32_t narrow;    /* entry read from a p2m with 4-byte entries */
    1.73 +    if ( gwidth == 8 )  /* 8-byte entries are returned as they are */
    1.74 +        return (xen_pfn_t)(((uint64_t *)p2m)[pfn]);
    1.75 +    narrow = ((uint32_t *)p2m)[pfn];
    1.76 +    return (xen_pfn_t)((narrow == 0xffffffffU) ? (-1UL) : narrow);
    1.77 +}
    1.78 +
    1.79 +static int get_pt_level(int xc_handle, uint32_t domid,
    1.80 +                        unsigned int *pt_level,
    1.81 +                        unsigned int *gwidth)
    1.82 +{   /* Fills *pt_level and *gwidth for domid; returns 0, or -1 on failure */
    1.83 +    DECLARE_DOMCTL;
    1.84 +    xen_capabilities_info_t xen_caps = "";
    1.85 +
    1.86 +    if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
    1.87 +        return -1;
    1.88 +
    1.89 +    memset(&domctl, 0, sizeof(domctl));
    1.90 +    domctl.domain = domid;
    1.91 +    domctl.cmd = XEN_DOMCTL_get_address_size;
    1.92 +
    1.93 +    if ( do_domctl(xc_handle, &domctl) != 0 )
    1.94 +        return -1;
    1.95 +
    1.96 +    *gwidth = domctl.u.address_size.size / 8; /* presumably bits -> bytes */
    1.97 +
    1.98 +    if (strstr(xen_caps, "xen-3.0-x86_64"))
    1.99 +        /* Depends on whether it's a compat 32-on-64 guest */
   1.100 +        *pt_level = ( (*gwidth == 8) ? 4 : 3 );
   1.101 +    else if (strstr(xen_caps, "xen-3.0-x86_32p")) /* test before x86_32, its prefix */
   1.102 +        *pt_level = 3;
   1.103 +    else if (strstr(xen_caps, "xen-3.0-x86_32"))
   1.104 +        *pt_level = 2;
   1.105 +    else
   1.106 +        return -1;  /* none of the three capability strings matched */
   1.107 +
   1.108 +    return 0;
   1.109 +}
   1.110 +
   1.111 +static int close_mem_info(int xc_handle, struct domain_mem_info *minfo)
   1.112 +{   /* Release what init_mem_info() set up in *minfo; always returns 0 */
   1.113 +    free(minfo->pfn_type);      /* free(NULL) is a no-op */
   1.114 +    minfo->pfn_type = NULL;     /* leave no dangling pointer behind */
   1.115 +    munmap(minfo->m2p_table, M2P_SIZE(minfo->max_mfn));
   1.116 +    munmap(minfo->p2m_table, P2M_FLL_ENTRIES * PAGE_SIZE);
   1.117 +    minfo->p2m_table = minfo->m2p_table = NULL;
   1.118 +
   1.119 +    return 0;
   1.120 +}
   1.121 +
   1.122 +static int init_mem_info(int xc_handle, int domid,
   1.123 +                 struct domain_mem_info *minfo,
   1.124 +                 xc_dominfo_t *info)
   1.125 +{   /* Map p2m/m2p and fetch per-pfn types for domid; 0 on success, <0 on error */
   1.126 +    uint64_aligned_t shared_info_frame;
   1.127 +    shared_info_any_t *live_shinfo = NULL;
   1.128 +    int i, rc;
   1.129 +
   1.130 +    /* Only be initialized once */
   1.131 +    if (minfo->pfn_type || minfo->m2p_table || minfo->p2m_table)
   1.132 +        return -EINVAL;
   1.133 +
   1.134 +    if ( get_pt_level(xc_handle, domid, &minfo->pt_level,
   1.135 +                      &minfo->guest_width) )
   1.136 +    {
   1.137 +        ERROR("Unable to get PT level info.");
   1.138 +        return -EFAULT;
   1.139 +    }
   1.140 +    guest_width = minfo->guest_width;   /* mirror into the file-scope global */
   1.141 +
   1.142 +    shared_info_frame = info->shared_info_frame;
   1.143 +
   1.144 +    live_shinfo = xc_map_foreign_range(xc_handle, domid,
   1.145 +                     PAGE_SIZE, PROT_READ, shared_info_frame);
   1.146 +    if ( !live_shinfo )
   1.147 +    {
   1.148 +        ERROR("Couldn't map live_shinfo");
   1.149 +        return -EFAULT;
   1.150 +    }
   1.151 +
   1.152 +    if ( (rc = xc_core_arch_map_p2m_writable(xc_handle, minfo->guest_width,
   1.153 +              info, live_shinfo, &minfo->p2m_table,  &minfo->p2m_size)) )
   1.154 +    {
   1.155 +        ERROR("Couldn't map p2m table %x\n", rc);
   1.156 +        goto failed;
   1.157 +    }
   1.158 +    munmap(live_shinfo, PAGE_SIZE);     /* only needed to locate the p2m */
   1.159 +    live_shinfo = NULL;
   1.160 +
   1.161 +    p2m_size = minfo->p2m_size;         /* mirror into the file-scope global */
   1.162 +
   1.163 +    minfo->max_mfn = xc_memory_op(xc_handle, XENMEM_maximum_ram_page, NULL);
   1.164 +    if ( !(minfo->m2p_table =
   1.165 +        xc_map_m2p(xc_handle, minfo->max_mfn, PROT_READ, NULL)) )
   1.166 +    {
   1.167 +        ERROR("Failed to map live M2P table");
   1.168 +        goto failed;
   1.169 +    }
   1.170 +
   1.171 +    /* Get pfn type */
   1.172 +    minfo->pfn_type = malloc(sizeof(uint32_t) * minfo->p2m_size);
   1.173 +    if (!minfo->pfn_type)
   1.174 +    {
   1.175 +        ERROR("Failed to malloc pfn_type\n");
   1.176 +        goto failed;
   1.177 +    }
   1.178 +    memset(minfo->pfn_type, 0, sizeof(uint32_t) * minfo->p2m_size);
   1.179 +
   1.180 +    for (i = 0; i < minfo->p2m_size; i++)   /* seed each slot with the pfn's mfn */
   1.181 +        minfo->pfn_type[i] = pfn_to_mfn(i, minfo->p2m_table,
   1.182 +                                        minfo->guest_width);
   1.183 +
   1.184 +    if ( lock_pages(minfo->pfn_type, minfo->p2m_size * sizeof(uint32_t)) )
   1.185 +    {
   1.186 +        ERROR("Unable to lock pfn_type array");
   1.187 +        goto failed;
   1.188 +    }
   1.189 +
   1.190 +    for (i = 0; i < minfo->p2m_size ; i+=1024)  /* query types in batches of 1024 */
   1.191 +    {
   1.192 +        int count = ((p2m_size - i ) > 1024 ) ? 1024: (p2m_size - i);
   1.193 +        if ( ( rc = xc_get_pfn_type_batch(xc_handle, domid, count,
   1.194 +                  minfo->pfn_type + i)) )
   1.195 +        {
   1.196 +            ERROR("Failed to get pfn_type %x\n", rc);
   1.197 +            goto unlock;
   1.198 +        }
   1.199 +    }
   1.200 +    return 0;
   1.201 +
   1.202 +unlock:
   1.203 +    unlock_pages(minfo->pfn_type, minfo->p2m_size * sizeof(uint32_t));
   1.204 +failed:
   1.205 +    if (minfo->pfn_type)
   1.206 +    {
   1.207 +        free(minfo->pfn_type);  /* free before clearing: the reverse order leaked it */
   1.208 +        minfo->pfn_type = NULL;
   1.209 +    }
   1.210 +    if (live_shinfo)
   1.211 +        munmap(live_shinfo, PAGE_SIZE);
   1.212 +    munmap(minfo->m2p_table, M2P_SIZE(minfo->max_mfn));
   1.213 +    munmap(minfo->p2m_table, P2M_FLL_ENTRIES * PAGE_SIZE);
   1.214 +    minfo->p2m_table = minfo->m2p_table = NULL;
   1.215 +
   1.216 +    return -1;
   1.217 +}
   1.218 +
   1.219 +static int backup_ptes(xen_pfn_t table_mfn, int offset,
   1.220 +                       struct pte_backup *backup)
   1.221 +{   /* Append (table_mfn, offset) to backup; 0 on success, <0 on error */
   1.222 +    if (!backup)
   1.223 +        return -EINVAL;
   1.224 +
   1.225 +    if (backup->max == backup->cur)     /* array full: double its capacity */
   1.226 +    {
   1.227 +        void *grown = realloc(backup->entries,
   1.228 +                            backup->max * 2 * sizeof(struct pte_backup_entry));
   1.229 +        if (grown == NULL)
   1.230 +            return -1;                  /* old array and its entries stay valid */
   1.231 +        backup->entries = grown;
   1.232 +        backup->max *= 2;
   1.233 +    }
   1.234 +
   1.235 +    backup->entries[backup->cur].table_mfn = table_mfn;
   1.236 +    backup->entries[backup->cur++].offset = offset;
   1.237 +
   1.238 +    return 0;
   1.239 +}
   1.240 +
   1.241 +/*
   1.242 + * Per-PTE callback invoked by change_pte() for every entry it scans;
   1.243 + * table_mfn/table_offset locate the entry, the last argument is opaque data.
   1.244 + * Returns 1 when *new_pte must be written back through an MMU update,
   1.245 + *         0 when no change is needed, <0 on error (the scan is aborted).
   1.246 + */
   1.247 +typedef int (*pte_func)(uint64_t pte, uint64_t *new_pte,
   1.248 +                       unsigned long table_mfn, int table_offset,
   1.249 +                       struct pte_backup *backup,
   1.250 +                       unsigned long no_use);
   1.251 +
   1.252 +static int __clear_pte(uint64_t pte, uint64_t *new_pte,
   1.253 +                       unsigned long table_mfn, int table_offset,
   1.254 +                       struct pte_backup *backup,
   1.255 +                       unsigned long mfn)
   1.256 +{
   1.257 +    /* Both new_pte and backup are mandatory; a missing one is an error */
   1.258 +    if (!new_pte || !backup)
   1.259 +        return -EINVAL;
   1.260 +
   1.261 +    if ( !(pte & _PAGE_PRESENT))    /* non-present entries hold no reference */
   1.262 +        return 0;
   1.263 +
   1.264 +    /* XXX Check for PSE bit here */
   1.265 +    /* Entry maps mfn: ask for a non-present copy, but only if it was recorded */
   1.266 +    if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
   1.267 +    {
   1.268 +        *new_pte = pte & ~_PAGE_PRESENT;
   1.269 +        if (!backup_ptes(table_mfn, table_offset, backup))
   1.270 +            return 1;
   1.271 +    }
   1.272 +
   1.273 +    return 0;   /* also reached when backup_ptes() failed: entry is left as is */
   1.274 +}
   1.275 +
   1.276 +static int __update_pte(uint64_t pte, uint64_t *new_pte,
   1.277 +                      unsigned long table_mfn, int table_offset,
   1.278 +                      struct pte_backup *backup,
   1.279 +                      unsigned long new_mfn)
   1.280 +{   /* pte_func: re-point a PTE recorded in backup at new_mfn; returns 1 or 0 */
   1.281 +    int index;
   1.282 +
   1.283 +    if (!new_pte)
   1.284 +        return 0;
   1.285 +
   1.286 +    for (index = 0; index < backup->cur; index ++)  /* was this entry recorded? */
   1.287 +        if ( (backup->entries[index].table_mfn == table_mfn) &&
   1.288 +             (backup->entries[index].offset == table_offset) )
   1.289 +            break;
   1.290 +
   1.291 +    if (index != backup->cur)
   1.292 +    {
   1.293 +        if (pte & _PAGE_PRESENT)
   1.294 +            ERROR("Page present while in backup ptes\n");
   1.295 +        pte &= ~((uint64_t)MFN_MASK_X86 << PAGE_SHIFT_X86); /* drop old frame, keep flags */
   1.296 +        pte |= ((uint64_t)new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
   1.297 +        *new_pte = pte;
   1.298 +        return 1;
   1.299 +    }
   1.300 +
   1.301 +    return 0;
   1.302 +}
   1.303 +
   1.304 +static int change_pte(int xc_handle, int domid,
   1.305 +                     struct domain_mem_info *minfo,
   1.306 +                     struct pte_backup *backup,
   1.307 +                     struct xc_mmu *mmu,
   1.308 +                     pte_func func,
   1.309 +                     unsigned long data)
   1.310 +{   /* Run func on every PTE of every page-table page of domid; 0 or -1 */
   1.311 +    int pte_num, rc;
   1.312 +    uint64_t i;
   1.313 +    void *content = NULL;
   1.314 +
   1.315 +    pte_num = PAGE_SIZE / ((minfo->pt_level == 2) ? 4 : 8); /* 4-byte PTEs on 2-level */
   1.316 +
   1.317 +    for (i = 0; i < minfo->p2m_size; i++)
   1.318 +    {
   1.319 +        xen_pfn_t table_mfn = pfn_to_mfn(i, minfo->p2m_table,
   1.320 +                                         minfo->guest_width);
   1.321 +        uint64_t pte, new_pte;
   1.322 +        int j;
   1.323 +
   1.324 +        if ( table_mfn == INVALID_P2M_ENTRY )
   1.325 +            continue;
   1.326 +
   1.327 +        if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
   1.328 +        {
   1.329 +            content = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
   1.330 +                                            PROT_READ, table_mfn);
   1.331 +            if (!content)
   1.332 +                goto failed;
   1.333 +
   1.334 +            for (j = 0; j < pte_num; j++)
   1.335 +            {
   1.336 +                if ( minfo->pt_level == 2 )
   1.337 +                    pte = ((const uint32_t*)content)[j];
   1.338 +                else
   1.339 +                    pte = ((const uint64_t*)content)[j];
   1.340 +
   1.341 +                rc = func(pte, &new_pte, table_mfn, j, backup, data);
   1.342 +
   1.343 +                switch (rc)
   1.344 +                {
   1.345 +                    case 1:     /* queue the rewritten entry; widen before shifting */
   1.346 +                    if ( xc_add_mmu_update(xc_handle, mmu,
   1.347 +                          (uint64_t)table_mfn << PAGE_SHIFT |
   1.348 +                          j * ( (minfo->pt_level == 2) ?
   1.349 +                              sizeof(uint32_t): sizeof(uint64_t)) |
   1.350 +                          MMU_PT_UPDATE_PRESERVE_AD,
   1.351 +                          new_pte) )
   1.352 +                        goto failed;
   1.353 +                    break;
   1.354 +
   1.355 +                    case 0:
   1.356 +                    break;
   1.357 +
   1.358 +                    default:
   1.359 +                    goto failed;
   1.360 +                }
   1.361 +            }
   1.362 +
   1.363 +            munmap(content, PAGE_SIZE); /* only unmap what was actually mapped */
   1.364 +            content = NULL;
   1.365 +        }
   1.366 +    }
   1.367 +
   1.368 +    if ( xc_flush_mmu_updates(xc_handle, mmu) )
   1.369 +        goto failed;
   1.370 +
   1.371 +    return 0;
   1.372 +failed:
   1.373 +    /* XXX Shall we take action if we have fail to swap? */
   1.374 +    if (content)
   1.375 +        munmap(content, PAGE_SIZE);
   1.376 +
   1.377 +    return -1;
   1.378 +}
   1.379 +
   1.380 +static int update_pte(int xc_handle, int domid,
   1.381 +                     struct domain_mem_info *minfo,
   1.382 +                     struct pte_backup *backup,
   1.383 +                     struct xc_mmu *mmu,
   1.384 +                     unsigned long new_mfn)
   1.385 +{   /* Re-point every PTE recorded in backup at new_mfn; change_pte()'s result */
   1.386 +    return change_pte(xc_handle, domid,  minfo, backup, mmu,
   1.387 +                      __update_pte, new_mfn);
   1.388 +}
   1.389 +
   1.390 +static int clear_pte(int xc_handle, int domid,
   1.391 +                     struct domain_mem_info *minfo,
   1.392 +                     struct pte_backup *backup,
   1.393 +                     struct xc_mmu *mmu,
   1.394 +                     xen_pfn_t mfn)
   1.395 +{   /* Make PTEs mapping mfn non-present, recording them; change_pte()'s result */
   1.396 +    return change_pte(xc_handle, domid, minfo, backup, mmu,
   1.397 +                      __clear_pte, mfn);
   1.398 +}
   1.399 +
   1.400 +static int exchange_page(int xc_handle, xen_pfn_t mfn,
   1.401 +                     xen_pfn_t *new_mfn, int domid)
   1.402 +{   /* XENMEM_exchange of one order-0 extent (mfn) of domid for another */
   1.403 +    int rc;
   1.404 +    xen_pfn_t out_mfn;  /* receives the frame handed back by the exchange */
   1.405 +
   1.406 +	struct xen_memory_exchange exchange = {
   1.407 +		.in = {
   1.408 +			.nr_extents   = 1,
   1.409 +			.extent_order = 0,
   1.410 +			.domid        = domid
   1.411 +		},
   1.412 +		.out = {
   1.413 +			.nr_extents   = 1,
   1.414 +			.extent_order = 0,
   1.415 +			.domid        = domid
   1.416 +		}
   1.417 +    };
   1.418 +    set_xen_guest_handle(exchange.in.extent_start, &mfn);
   1.419 +    set_xen_guest_handle(exchange.out.extent_start, &out_mfn);
   1.420 +
   1.421 +    rc = xc_memory_op(xc_handle, XENMEM_exchange, &exchange);
   1.422 +
   1.423 +    if (!rc)    /* *new_mfn is written only when the call returned 0 */
   1.424 +        *new_mfn = out_mfn;
   1.425 +
   1.426 +    return rc;
   1.427 +}
   1.428 +
   1.429 +/*
   1.430 + * Check if a page can be exchanged: returns 1 if it may be tried, else 0
   1.431 + */
   1.432 +
   1.433 +static int is_page_exchangable(int xc_handle, int domid, xen_pfn_t mfn,
   1.434 +                               xc_dominfo_t *info)
   1.435 +{
   1.436 +    uint32_t status = 0;    /* printed below even when the query itself failed */
   1.437 +    int rc;
   1.438 +
   1.439 +    /* domain checking: dom0 and every reserved id are refused */
   1.440 +    if ( !domid || (domid >= DOMID_FIRST_RESERVED) )
   1.441 +    {
   1.442 +        DPRINTF("Dom0's page can't be LM");
   1.443 +        return 0;
   1.444 +    }
   1.445 +    if (info->hvm)
   1.446 +    {
   1.447 +        DPRINTF("Currently we can only live change PV guest's page\n");
   1.448 +        return 0;
   1.449 +    }
   1.450 +
   1.451 +    /* Check if pages are offline pending or not */
   1.452 +    rc = xc_query_page_offline_status(xc_handle, mfn, mfn, &status);
   1.453 +
   1.454 +    if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) )
   1.455 +    {
   1.456 +        ERROR("Page %lx is not offline pending %x\n",
   1.457 +          mfn, status);
   1.458 +        return 0;
   1.459 +    }
   1.460 +
   1.461 +    return 1;
   1.462 +}
   1.463 +
   1.464 +/* Domain must already be suspended. 0 on success, <0 on failure (-2: guest may be broken) */
   1.465 +int xc_exchange_page(int xc_handle, int domid, xen_pfn_t mfn)
   1.466 +{
   1.467 +    xc_dominfo_t info;
   1.468 +    struct domain_mem_info minfo;
   1.469 +    struct xc_mmu *mmu = NULL;
   1.470 +    struct pte_backup old_ptes = {NULL, 0, 0};
   1.471 +    struct grant_entry *gnttab = NULL;
   1.472 +    struct mmuext_op mops;
   1.473 +    int gnt_num, unpined = 0;
   1.474 +    void *old_p, *backup = NULL;
   1.475 +    int rc, result = -1;
   1.476 +    uint32_t status;
   1.477 +    xen_pfn_t new_mfn, gpfn;
   1.478 +
   1.479 +    if ( xc_domain_getinfo(xc_handle, domid, 1, &info) != 1 )
   1.480 +    {
   1.481 +        ERROR("Could not get domain info");
   1.482 +        return -EFAULT;
   1.483 +    }
   1.484 +
   1.485 +    if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend)
   1.486 +    {
   1.487 +        ERROR("Can't exchange page unless domain is suspended\n");
   1.488 +        return -EINVAL;
   1.489 +    }
   1.490 +
   1.491 +    if (!is_page_exchangable(xc_handle, domid, mfn, &info))
   1.492 +    {
   1.493 +        ERROR("Could not exchange page\n");
   1.494 +        return -EINVAL;
   1.495 +    }
   1.496 +
   1.497 +    memset(&minfo, 0, sizeof(minfo));   /* Get domain's memory information */
   1.498 +    if ( init_mem_info(xc_handle, domid, &minfo, &info) )
   1.499 +        return -EFAULT;                 /* tables are not mapped: nothing to undo */
   1.500 +    if ( (gpfn = minfo.m2p_table[mfn]) >= minfo.p2m_size )
   1.501 +        goto failed;                    /* pfn_type[] only has p2m_size entries */
   1.502 +    /* Don't exchange CR3 for PAE guest in PAE host environment */
   1.503 +    if (minfo.guest_width > sizeof(long))
   1.504 +    {
   1.505 +        if ( (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
   1.506 +                    XEN_DOMCTL_PFINFO_L3TAB )
   1.507 +            goto failed;
   1.508 +    }
   1.509 +
   1.510 +    gnttab = xc_gnttab_map_table(xc_handle, domid, &gnt_num);
   1.511 +    if (!gnttab)
   1.512 +    {
   1.513 +        ERROR("Failed to map grant table\n");
   1.514 +        goto failed;
   1.515 +    }
   1.516 +
   1.517 +    if (xc_is_page_granted(xc_handle, mfn, gnttab, gnt_num))
   1.518 +    {
   1.519 +        ERROR("Page %lx is granted now\n", mfn);
   1.520 +        goto failed;
   1.521 +    }
   1.522 +
   1.523 +    /* allocate required data structure */
   1.524 +    backup = malloc(PAGE_SIZE);
   1.525 +    if (!backup)
   1.526 +    {
   1.527 +        ERROR("Failed to allocate backup pages pointer\n");
   1.528 +        goto failed;
   1.529 +    }
   1.530 +
   1.531 +    old_ptes.max = DEFAULT_BACKUP_COUNT;
   1.532 +    old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
   1.533 +                              DEFAULT_BACKUP_COUNT);
   1.534 +
   1.535 +    if (!old_ptes.entries)
   1.536 +    {
   1.537 +        ERROR("Faield to allocate backup\n");
   1.538 +        goto failed;
   1.539 +    }
   1.540 +    old_ptes.cur = 0;
   1.541 +
   1.542 +    /* Unpin the page if it is pined (type info is indexed by pfn, not mfn) */
   1.543 +    if (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB)
   1.544 +    {
   1.545 +        mops.cmd = MMUEXT_UNPIN_TABLE;
   1.546 +        mops.arg1.mfn = mfn;
   1.547 +
   1.548 +        if ( xc_mmuext_op(xc_handle, &mops, 1, domid) < 0 )
   1.549 +        {
   1.550 +            ERROR("Failed to unpin page %lx", mfn);
   1.551 +            goto failed;
   1.552 +        }
   1.553 +        mops.arg1.mfn = mfn;
   1.554 +        unpined = 1;
   1.555 +    }
   1.556 +
   1.557 +    /* backup the content */
   1.558 +    old_p = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
   1.559 +      PROT_READ, mfn);
   1.560 +    if (!old_p)
   1.561 +    {
   1.562 +        ERROR("Failed to map foreign page %lx\n", mfn);
   1.563 +        goto failed;
   1.564 +    }
   1.565 +
   1.566 +    memcpy(backup, old_p, PAGE_SIZE);
   1.567 +    munmap(old_p, PAGE_SIZE);
   1.568 +
   1.569 +    mmu = xc_alloc_mmu_updates(xc_handle, domid);
   1.570 +    if ( mmu == NULL )
   1.571 +    {
   1.572 +        ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
   1.573 +        goto failed;
   1.574 +    }
   1.575 +
   1.576 +    /* Firstly update all pte to be invalid to remove the reference */
   1.577 +    rc = clear_pte(xc_handle, domid,  &minfo, &old_ptes, mmu, mfn);
   1.578 +
   1.579 +    if (rc)
   1.580 +    {
   1.581 +        ERROR("clear pte failed\n");
   1.582 +        goto failed;
   1.583 +    }
   1.584 +
   1.585 +    rc = exchange_page(xc_handle, mfn, &new_mfn, domid);
   1.586 +
   1.587 +    if (rc)
   1.588 +    {
   1.589 +        ERROR("Exchange the page failed\n");
   1.590 +        /* Exchange fail means there are refere to the page still */
   1.591 +        rc = update_pte(xc_handle, domid, &minfo, &old_ptes, mmu, mfn);
   1.592 +        if (rc)
   1.593 +            result = -2;
   1.594 +        goto failed;
   1.595 +    }
   1.596 +
   1.597 +    rc = update_pte(xc_handle, domid, &minfo, &old_ptes, mmu, new_mfn);
   1.598 +
   1.599 +    if (rc)
   1.600 +    {
   1.601 +        ERROR("update pte failed guest may be broken now\n");
   1.602 +        /* No recover action now for swap fail */
   1.603 +        result = -2;
   1.604 +        goto failed;
   1.605 +    }
   1.606 +
   1.607 +    /* Check if pages are offlined already */
   1.608 +    rc = xc_query_page_offline_status(xc_handle, mfn, mfn,
   1.609 +                            &status);
   1.610 +
   1.611 +    if (rc)
   1.612 +    {
   1.613 +        ERROR("Fail to query offline status\n");
   1.614 +    }else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
   1.615 +    {
   1.616 +        ERROR("page is still online or pending\n");
   1.617 +        goto failed;
   1.618 +    }
   1.619 +    else
   1.620 +    {
   1.621 +        void *new_p;
   1.622 +        IPRINTF("Now page is offlined %lx\n", mfn);
   1.623 +        /* Update the p2m table */
   1.624 +        minfo.p2m_table[gpfn] = new_mfn;
   1.625 +
   1.626 +        new_p = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
   1.627 +                                     PROT_READ|PROT_WRITE, new_mfn);
   1.628 +        memcpy(new_p, backup, PAGE_SIZE);   /* NOTE(review): new_p is not checked */
   1.629 +        munmap(new_p, PAGE_SIZE);
   1.630 +        mops.arg1.mfn = new_mfn;            /* re-pin the replacement frame below */
   1.631 +        result = 0;
   1.632 +    }
   1.633 +
   1.634 +failed:
   1.635 +
   1.636 +    if (unpined && (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB))
   1.637 +    {
   1.638 +        switch ( minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
   1.639 +        {
   1.640 +            case XEN_DOMCTL_PFINFO_L1TAB:
   1.641 +                mops.cmd = MMUEXT_PIN_L1_TABLE;
   1.642 +                break;
   1.643 +
   1.644 +            case XEN_DOMCTL_PFINFO_L2TAB:
   1.645 +                mops.cmd = MMUEXT_PIN_L2_TABLE;
   1.646 +                break;
   1.647 +
   1.648 +            case XEN_DOMCTL_PFINFO_L3TAB:
   1.649 +                mops.cmd = MMUEXT_PIN_L3_TABLE;
   1.650 +                break;
   1.651 +
   1.652 +            case XEN_DOMCTL_PFINFO_L4TAB:
   1.653 +                mops.cmd = MMUEXT_PIN_L4_TABLE;
   1.654 +                break;
   1.655 +
   1.656 +            default:
   1.657 +                ERROR("Unpined for non pate table page\n");
   1.658 +                break;
   1.659 +        }
   1.660 +
   1.661 +        if ( xc_mmuext_op(xc_handle, &mops, 1, domid) < 0 )
   1.662 +        {
   1.663 +            ERROR("failed to pin the mfn again\n");
   1.664 +            result = -2;
   1.665 +        }
   1.666 +    }
   1.667 +
   1.668 +    if (mmu)
   1.669 +        free(mmu);
   1.670 +
   1.671 +    if (old_ptes.entries)
   1.672 +        free(old_ptes.entries);
   1.673 +
   1.674 +    if (backup)
   1.675 +        free(backup);
   1.676 +
   1.677 +    if (gnttab)     /* gnt_num counts entries; munmap() wants a length in bytes */
   1.678 +        munmap(gnttab, gnt_num * sizeof(struct grant_entry));
   1.679 +
   1.680 +    close_mem_info(xc_handle, &minfo);
   1.681 +
   1.682 +    return result;
   1.683 +}
     2.1 --- a/tools/libxc/xc_suspend.c	Mon Jun 01 14:13:53 2009 +0100
     2.2 +++ b/tools/libxc/xc_suspend.c	Mon Jun 01 14:15:48 2009 +0100
     2.3 @@ -110,7 +110,7 @@ int xc_suspend_evtchn_init(int xc, int x
     2.4      return suspend_evtchn;
     2.5  
     2.6  cleanup:
     2.7 -    if (suspend_evtchn > 0)
     2.8 +    if (suspend_evtchn != -1)
     2.9          xc_suspend_evtchn_release(xce, suspend_evtchn);
    2.10  
    2.11      return -1;
     3.1 --- a/tools/libxc/xenguest.h	Mon Jun 01 14:13:53 2009 +0100
     3.2 +++ b/tools/libxc/xenguest.h	Mon Jun 01 14:15:48 2009 +0100
     3.3 @@ -163,6 +163,8 @@ int xc_mark_page_offline(int xc, unsigne
     3.4  int xc_query_page_offline_status(int xc, unsigned long start,
     3.5                                   unsigned long end, uint32_t *status);
     3.6  
     3.7 +int xc_exchange_page(int xc_handle, int domid, xen_pfn_t mfn);
     3.8 +
     3.9  
    3.10  /**
    3.11   * This function map m2p table