+++ /dev/null
-x86: enforce preemption in HVM_set_mem_access / p2m_set_mem_access()
-
-Processing up to 4G PFNs may take almost arbitrarily long, so
-preemption is needed here.
-
-This is XSA-89.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Tim Deegan <tim@xen.org>
-
---- a/xen/arch/x86/hvm/hvm.c
-+++ b/xen/arch/x86/hvm/hvm.c
-@@ -4593,6 +4593,15 @@ long do_hvm_op(unsigned long op, XEN_GUE
- goto param_fail5;
-
- rc = p2m_set_mem_access(d, a.first_pfn, a.nr, a.hvmmem_access);
-+ if ( rc > 0 )
-+ {
-+ a.first_pfn += a.nr - rc;
-+ a.nr = rc;
-+ if ( __copy_to_guest(arg, &a, 1) )
-+ rc = -EFAULT;
-+ else
-+ rc = -EAGAIN;
-+ }
-
- param_fail5:
- rcu_unlock_domain(d);
---- a/xen/arch/x86/mm/p2m.c
-+++ b/xen/arch/x86/mm/p2m.c
-@@ -1333,15 +1333,14 @@ void p2m_mem_access_resume(struct domain
-
- /* Set access type for a region of pfns.
- * If start_pfn == -1ul, sets the default access type */
--int p2m_set_mem_access(struct domain *d, unsigned long start_pfn,
-- uint32_t nr, hvmmem_access_t access)
-+long p2m_set_mem_access(struct domain *d, unsigned long pfn, uint32_t nr,
-+ hvmmem_access_t access)
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-- unsigned long pfn;
- p2m_access_t a, _a;
- p2m_type_t t;
- mfn_t mfn;
-- int rc = 0;
-+ long rc;
-
- /* N.B. _not_ static: initializer depends on p2m->default_access */
- p2m_access_t memaccess[] = {
-@@ -1364,14 +1363,17 @@ int p2m_set_mem_access(struct domain *d,
- a = memaccess[access];
-
- /* If request to set default access */
-- if ( start_pfn == ~0ull )
-+ if ( pfn == ~0ul )
- {
- p2m->default_access = a;
- return 0;
- }
-
-+ if ( !nr )
-+ return 0;
-+
- p2m_lock(p2m);
-- for ( pfn = start_pfn; pfn < start_pfn + nr; pfn++ )
-+ for ( ; ; ++pfn )
- {
- mfn = p2m->get_entry(p2m, pfn, &t, &_a, 0, NULL);
- if ( p2m->set_entry(p2m, pfn, mfn, PAGE_ORDER_4K, t, a) == 0 )
-@@ -1379,6 +1381,13 @@ int p2m_set_mem_access(struct domain *d,
- rc = -ENOMEM;
- break;
- }
-+
-+ /* Check for continuation if it's not the last interation. */
-+ if ( !--nr || hypercall_preempt_check() )
-+ {
-+ rc = nr;
-+ break;
-+ }
- }
- p2m_unlock(p2m);
- return rc;
---- a/xen/include/asm-x86/p2m.h
-+++ b/xen/include/asm-x86/p2m.h
-@@ -576,8 +576,8 @@ void p2m_mem_access_resume(struct domain
-
- /* Set access type for a region of pfns.
- * If start_pfn == -1ul, sets the default access type */
--int p2m_set_mem_access(struct domain *d, unsigned long start_pfn,
-- uint32_t nr, hvmmem_access_t access);
-+long p2m_set_mem_access(struct domain *d, unsigned long start_pfn,
-+ uint32_t nr, hvmmem_access_t access);
-
- /* Get access type for a pfn
- * If pfn == -1ul, gets the default access type */
--- /dev/null
+x86/paging: make log-dirty operations preemptible
+
+Both the freeing and the inspection of the bitmap are done in (nested)
+loops which, besides having a rather high iteration count in general
+(albeit that would be covered by XSA-77), have the number of non-trivial
+iterations they need to perform (indirectly) controllable by both the
+guest they are for and any domain controlling the guest (including the
+one running qemu for it). Without preemption, such a loop can therefore
+be made to consume an effectively unbounded amount of hypervisor CPU
+time.
+
+This is XSA-97.
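+
+For illustration only (this sketch is not part of the patch and uses
+made-up names: walk_state, walk, need_preempt, NODE_ENTRIES), the
+continuation pattern applied below can be reduced to the following: a
+long-running walk records its loop indices and progress when it decides
+to yield, returns -EAGAIN, and a later invocation resumes from the
+recorded state:
+
+    #include <errno.h>
+    #include <stdio.h>
+
+    #define NODE_ENTRIES 512            /* stand-in for LOGDIRTY_NODE_ENTRIES */
+
+    struct walk_state {
+        unsigned int i4, i3;            /* saved loop indices */
+        unsigned long done;             /* entries processed so far */
+    };
+
+    /* Pretend we are asked to yield every 1000 entries. */
+    static int need_preempt(unsigned long done)
+    {
+        return done % 1000 == 0;
+    }
+
+    /* Walk a two-level structure; return 0 when finished or -EAGAIN when
+     * interrupted, leaving *st set up so that the next call resumes. */
+    static int walk(struct walk_state *st)
+    {
+        for ( ; st->i4 < NODE_ENTRIES; st->i4++, st->i3 = 0 )
+            for ( ; st->i3 < NODE_ENTRIES; st->i3++ )
+            {
+                st->done++;                          /* one unit of work */
+                if ( st->i3 < NODE_ENTRIES - 1 && need_preempt(st->done) )
+                {
+                    st->i3++;                        /* resume after this entry */
+                    return -EAGAIN;
+                }
+            }
+        return 0;
+    }
+
+    int main(void)
+    {
+        struct walk_state st = { 0, 0, 0 };
+        int rc, calls = 0;
+
+        do {
+            rc = walk(&st);                          /* caller retries on -EAGAIN */
+            calls++;
+        } while ( rc == -EAGAIN );
+        printf("processed %lu entries in %d calls\n", st.done, calls);
+        return 0;
+    }
+
+In the actual patch, the saved indices, progress count and the identity
+of the preempted vcpu live in the new d->arch.paging.preempt structure,
+and arch_do_domctl() turns the -EAGAIN from paging_domctl() into a
+hypercall continuation.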
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -2136,7 +2136,9 @@ int domain_relinquish_resources(struct d
+ pci_release_devices(d);
+
+ /* Tear down paging-assistance stuff. */
+- paging_teardown(d);
++ ret = paging_teardown(d);
++ if ( ret )
++ return ret;
+
+ /* Drop the in-use references to page-table bases. */
+ for_each_vcpu ( d, v )
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -66,6 +66,9 @@ long arch_do_domctl(
+ &domctl->u.shadow_op,
+ guest_handle_cast(u_domctl, void));
+ rcu_unlock_domain(d);
++ if ( ret == -EAGAIN )
++ return hypercall_create_continuation(__HYPERVISOR_domctl,
++ "h", u_domctl);
+ copy_to_guest(u_domctl, domctl, 1);
+ }
+ }
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -678,8 +678,7 @@ int hap_domctl(struct domain *d, xen_dom
+ paging_unlock(d);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+- rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+- u_domctl);
++ rc = -EAGAIN;
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = hap_get_allocation(d);
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -26,6 +26,7 @@
+ #include <asm/shadow.h>
+ #include <asm/p2m.h>
+ #include <asm/hap.h>
++#include <asm/event.h>
+ #include <asm/hvm/nestedhvm.h>
+ #include <xen/numa.h>
+ #include <xsm/xsm.h>
+@@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
+ d->arch.paging.free_page(d, mfn_to_page(mfn));
+ }
+
+-void paging_free_log_dirty_bitmap(struct domain *d)
++static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
+ {
+ mfn_t *l4, *l3, *l2;
+ int i4, i3, i2;
+
++ paging_lock(d);
++
+ if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+- return;
++ {
++ paging_unlock(d);
++ return 0;
++ }
+
+- paging_lock(d);
++ if ( !d->arch.paging.preempt.vcpu )
++ {
++ memset(&d->arch.paging.preempt.log_dirty, 0,
++ sizeof(d->arch.paging.preempt.log_dirty));
++ ASSERT(rc <= 0);
++ d->arch.paging.preempt.log_dirty.done = -rc;
++ }
++ else if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
++ {
++ paging_unlock(d);
++ return -EBUSY;
++ }
+
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
++ i4 = d->arch.paging.preempt.log_dirty.i4;
++ i3 = d->arch.paging.preempt.log_dirty.i3;
++ rc = 0;
+
+- for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
++ for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
+ {
+ if ( !mfn_valid(l4[i4]) )
+ continue;
+
+ l3 = map_domain_page(mfn_x(l4[i4]));
+
+- for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
++ for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
+ {
+ if ( !mfn_valid(l3[i3]) )
+ continue;
+@@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
+
+ unmap_domain_page(l2);
+ paging_free_log_dirty_page(d, l3[i3]);
++ l3[i3] = _mfn(INVALID_MFN);
++
++ if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++ d->arch.paging.preempt.log_dirty.i4 = i4;
++ rc = -EAGAIN;
++ break;
++ }
+ }
+
+ unmap_domain_page(l3);
++ if ( rc )
++ break;
+ paging_free_log_dirty_page(d, l4[i4]);
++ l4[i4] = _mfn(INVALID_MFN);
++
++ if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i3 = 0;
++ d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++ rc = -EAGAIN;
++ break;
++ }
+ }
+
+ unmap_domain_page(l4);
+- paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+- d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+
+- ASSERT(d->arch.paging.log_dirty.allocs == 0);
+- d->arch.paging.log_dirty.failed_allocs = 0;
++ if ( !rc )
++ {
++ paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
++ d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
++
++ ASSERT(d->arch.paging.log_dirty.allocs == 0);
++ d->arch.paging.log_dirty.failed_allocs = 0;
++
++ rc = -d->arch.paging.preempt.log_dirty.done;
++ d->arch.paging.preempt.vcpu = NULL;
++ }
++ else
++ {
++ d->arch.paging.preempt.vcpu = current;
++ d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
++ }
+
+ paging_unlock(d);
++
++ return rc;
+ }
+
+ int paging_log_dirty_enable(struct domain *d)
+@@ -178,15 +233,25 @@ int paging_log_dirty_enable(struct domai
+ return ret;
+ }
+
+-int paging_log_dirty_disable(struct domain *d)
++static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
+ {
+- int ret;
++ int ret = 1;
++
++ if ( !resuming )
++ {
++ domain_pause(d);
++ /* Safe because the domain is paused. */
++ ret = d->arch.paging.log_dirty.disable_log_dirty(d);
++ ASSERT(ret <= 0);
++ }
+
+- domain_pause(d);
+- /* Safe because the domain is paused. */
+- ret = d->arch.paging.log_dirty.disable_log_dirty(d);
+ if ( !paging_mode_log_dirty(d) )
+- paging_free_log_dirty_bitmap(d);
++ {
++ ret = paging_free_log_dirty_bitmap(d, ret);
++ if ( ret == -EAGAIN )
++ return ret;
++ }
++
+ domain_unpause(d);
+
+ return ret;
+@@ -326,7 +391,9 @@ int paging_mfn_is_dirty(struct domain *d
+
+ /* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN,
+ * clear the bitmap and stats as well. */
+-int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
++static int paging_log_dirty_op(struct domain *d,
++ struct xen_domctl_shadow_op *sc,
++ bool_t resuming)
+ {
+ int rv = 0, clean = 0, peek = 1;
+ unsigned long pages = 0;
+@@ -334,9 +401,22 @@ int paging_log_dirty_op(struct domain *d
+ unsigned long *l1 = NULL;
+ int i4, i3, i2;
+
+- domain_pause(d);
++ if ( !resuming )
++ domain_pause(d);
+ paging_lock(d);
+
++ if ( !d->arch.paging.preempt.vcpu )
++ memset(&d->arch.paging.preempt.log_dirty, 0,
++ sizeof(d->arch.paging.preempt.log_dirty));
++ else if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != sc->op )
++ {
++ paging_unlock(d);
++ ASSERT(!resuming);
++ domain_unpause(d);
++ return -EBUSY;
++ }
++
+ clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+
+ PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+@@ -365,17 +445,15 @@ int paging_log_dirty_op(struct domain *d
+ goto out;
+ }
+
+- pages = 0;
+ l4 = paging_map_log_dirty_bitmap(d);
++ i4 = d->arch.paging.preempt.log_dirty.i4;
++ i3 = d->arch.paging.preempt.log_dirty.i3;
++ pages = d->arch.paging.preempt.log_dirty.done;
+
+- for ( i4 = 0;
+- (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
+- i4++ )
++ for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
+ {
+ l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+- for ( i3 = 0;
+- (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
+- i3++ )
++ for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
+ {
+ l2 = ((l3 && mfn_valid(l3[i3])) ?
+ map_domain_page(mfn_x(l3[i3])) : NULL);
+@@ -410,18 +488,51 @@ int paging_log_dirty_op(struct domain *d
+ }
+ if ( l2 )
+ unmap_domain_page(l2);
++
++ if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i4 = i4;
++ d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++ rv = -EAGAIN;
++ break;
++ }
+ }
+ if ( l3 )
+ unmap_domain_page(l3);
++
++ if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
++ hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++ d->arch.paging.preempt.log_dirty.i3 = 0;
++ rv = -EAGAIN;
++ }
++ if ( rv )
++ break;
+ }
+ if ( l4 )
+ unmap_domain_page(l4);
+
+- if ( pages < sc->pages )
+- sc->pages = pages;
++ if ( !rv )
++ d->arch.paging.preempt.vcpu = NULL;
++ else
++ {
++ d->arch.paging.preempt.vcpu = current;
++ d->arch.paging.preempt.op = sc->op;
++ d->arch.paging.preempt.log_dirty.done = pages;
++ }
+
+ paging_unlock(d);
+
++ if ( rv )
++ {
++ /* Never leave the domain paused for other errors. */
++ ASSERT(rv == -EAGAIN);
++ return rv;
++ }
++
++ if ( pages < sc->pages )
++ sc->pages = pages;
+ if ( clean )
+ {
+ /* We need to further call clean_dirty_bitmap() functions of specific
+@@ -432,6 +543,7 @@ int paging_log_dirty_op(struct domain *d
+ return rv;
+
+ out:
++ d->arch.paging.preempt.vcpu = NULL;
+ paging_unlock(d);
+ domain_unpause(d);
+
+@@ -498,12 +610,6 @@ void paging_log_dirty_init(struct domain
+ d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+ }
+
+-/* This function fress log dirty bitmap resources. */
+-static void paging_log_dirty_teardown(struct domain*d)
+-{
+- paging_free_log_dirty_bitmap(d);
+-}
+-
+ /************************************************/
+ /* CODE FOR PAGING SUPPORT */
+ /************************************************/
+@@ -547,6 +653,7 @@ void paging_vcpu_init(struct vcpu *v)
+ int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE(void) u_domctl)
+ {
++ bool_t resuming = 0;
+ int rc;
+
+ if ( unlikely(d == current->domain) )
+@@ -569,6 +676,20 @@ int paging_domctl(struct domain *d, xen_
+ return -EINVAL;
+ }
+
++ if ( d->arch.paging.preempt.vcpu )
++ {
++ if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != sc->op )
++ {
++ printk(XENLOG_G_DEBUG
++ "d%d:v%d: Paging op %#x on Dom%u with unfinished prior op %#x\n",
++ current->domain->domain_id, current->vcpu_id,
++ sc->op, d->domain_id, d->arch.paging.preempt.op);
++ return -EBUSY;
++ }
++ resuming = 1;
++ }
++
+ rc = xsm_shadow_control(d, sc->op);
+ if ( rc )
+ return rc;
+@@ -594,13 +714,13 @@ int paging_domctl(struct domain *d, xen_
+
+ case XEN_DOMCTL_SHADOW_OP_OFF:
+ if ( paging_mode_log_dirty(d) )
+- if ( (rc = paging_log_dirty_disable(d)) != 0 )
++ if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
+ return rc;
+ break;
+
+ case XEN_DOMCTL_SHADOW_OP_CLEAN:
+ case XEN_DOMCTL_SHADOW_OP_PEEK:
+- return paging_log_dirty_op(d, sc);
++ return paging_log_dirty_op(d, sc, resuming);
+ }
+
+ /* Here, dispatch domctl to the appropriate paging code */
+@@ -611,18 +731,24 @@ int paging_domctl(struct domain *d, xen_
+ }
+
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d)
++int paging_teardown(struct domain *d)
+ {
++ int rc;
++
+ if ( hap_enabled(d) )
+ hap_teardown(d);
+ else
+ shadow_teardown(d);
+
+ /* clean up log dirty resources. */
+- paging_log_dirty_teardown(d);
++ rc = paging_free_log_dirty_bitmap(d, 0);
++ if ( rc == -EAGAIN )
++ return rc;
+
+ /* Move populate-on-demand cache back to domain_list for destruction */
+ p2m_pod_empty_cache(d);
++
++ return rc;
+ }
+
+ /* Call once all of the references to the domain have gone away */
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3829,8 +3829,7 @@ int shadow_domctl(struct domain *d,
+ paging_unlock(d);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+- rc = hypercall_create_continuation(
+- __HYPERVISOR_domctl, "h", u_domctl);
++ rc = -EAGAIN;
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = shadow_get_allocation(d);
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -479,7 +479,6 @@ int domain_kill(struct domain *d)
+ rc = domain_relinquish_resources(d);
+ if ( rc != 0 )
+ {
+- BUG_ON(rc != -EAGAIN);
+ break;
+ }
+ if ( sched_move_domain(d, cpupool0) )
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -193,6 +193,20 @@ struct paging_domain {
+ struct hap_domain hap;
+ /* log dirty support */
+ struct log_dirty_domain log_dirty;
++
++ /* preemption handling */
++ struct {
++ struct vcpu *vcpu;
++ unsigned int op;
++ union {
++ struct {
++ unsigned long done:PADDR_BITS - PAGE_SHIFT;
++ unsigned long i4:PAGETABLE_ORDER;
++ unsigned long i3:PAGETABLE_ORDER;
++ } log_dirty;
++ };
++ } preempt;
++
+ /* alloc/free pages from the pool for paging-assistance structures
+ * (used by p2m and log-dirty code for their tries) */
+ struct page_info * (*alloc_page)(struct domain *d);
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -141,9 +141,6 @@ struct paging_mode {
+ /*****************************************************************************
+ * Log dirty code */
+
+-/* free log dirty bitmap resource */
+-void paging_free_log_dirty_bitmap(struct domain *d);
+-
+ /* get the dirty bitmap for a specific range of pfns */
+ void paging_log_dirty_range(struct domain *d,
+ unsigned long begin_pfn,
+@@ -153,9 +150,6 @@ void paging_log_dirty_range(struct domai
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d);
+
+-/* disable log dirty */
+-int paging_log_dirty_disable(struct domain *d);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d,
+ int (*enable_log_dirty)(struct domain *d),
+@@ -218,7 +212,7 @@ int paging_domctl(struct domain *d, xen_
+ XEN_GUEST_HANDLE(void) u_domctl);
+
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d);
++int paging_teardown(struct domain *d);
+
+ /* Call once all of the references to the domain have gone away */
+ void paging_final_teardown(struct domain *d);