int rc;
if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
{
int rc;
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
{
int rc;
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return 1;
+ return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
{
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
+ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
+ {
+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
+ if ( ret )
+ goto out;
+ }
+
if ( is_guest_l1_slot(i) )
switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
{
fail:
MEM_LOG("Failure in alloc_l1_table: entry %d", i);
+ out:
while ( i-- > 0 )
if ( is_guest_l1_slot(i) )
put_page_from_l1e(pl1e[i], d);
rc = -EBUSY;
}
}
+ else if ( pv_l1tf_check_l1e(pt_dom, nl1e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
preserve_ad)) )
{
rc = -EBUSY;
}
}
+ else if ( pv_l1tf_check_l2e(d, nl2e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
preserve_ad)) )
{
rc = -EFAULT;
}
}
+ else if ( pv_l1tf_check_l3e(d, nl3e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
preserve_ad)) )
{
rc = -EFAULT;
}
}
+ else if ( pv_l1tf_check_l4e(d, nl4e) )
+ return -ERESTART;
else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
preserve_ad)) )
{
/* Check the new PTE. */
nl1e = l1e_from_intpte(val);
+
+ if ( !(l1e_get_flags(nl1e) & _PAGE_PRESENT) && pv_l1tf_check_l1e(d, nl1e) )
+ return X86EMUL_RETRY;
+
switch ( ret = get_page_from_l1e(nl1e, d, d) )
{
default:
* What we can do is force a PV guest which writes a vulnerable PTE into
* shadow mode, so Xen controls the pagetables which are reachable by the CPU
* pagewalk.
+ *
+ * The core of the L1TF vulnerability is that the address bits of the PTE
+ * (accounting for PSE and factoring in the level-relevant part of the linear
+ * access) are sent for an L1D lookup (to retrieve the next-level PTE, or
+ * eventual memory address) before the Present or reserved bits (which would
+ * cause a terminal fault) are accounted for. If an L1D hit occurs, the
+ * resulting data is available for potentially dependent instructions.
+ *
+ * For Present PTEs, the PV type-count safety logic ensures that the address
+ * bits always point at a guest-accessible frame, which is safe WRT L1TF from
+ * Xen's point of view. In practice, a PV guest should be unable to set any
+ * reserved bits, so should be unable to create any present L1TF-vulnerable
+ * PTEs at all.
+ *
+ * Therefore, these safety checks apply to Not-Present PTEs only, where
+ * traditionally, Xen would have let the guest write any value it chose.
+ *
+ * The all-zero PTE potentially leaks mfn 0. All software on the system is
+ * expected to cooperate and not put any secrets there. In a Xen system,
+ * neither Xen nor dom0 is expected to touch mfn 0, as it typically contains
+ * the real-mode IVT and BIOS Data Area. Therefore, mfn 0 is considered safe.
+ *
+ * Any PTE whose address is higher than the maximum cacheable address is safe,
+ * as it won't get an L1D hit.
+ *
+ * Speculative superpages also need accounting for, as PSE is considered
+ * irrespective of Present. We disallow PSE being set, as it allows an
+ * attacker to leak 2M or 1G of data starting from mfn 0. Also, because of
+ * recursive/linear pagetables, we must consider PSE even at L4, as hardware
+ * will interpret an L4e as an L3e during a recursive walk.
*/
+static inline bool_t is_l1tf_safe_maddr(intpte_t pte)
+{
+ paddr_t maddr = pte & l1tf_addr_mask;
+
+ return maddr == 0 || maddr >= l1tf_safe_maddr;
+}
+
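/*
 * Editor's illustrative sketch, not part of the patch: one plausible way the
 * l1tf_addr_mask and l1tf_safe_maddr globals consumed by is_l1tf_safe_maddr()
 * could be derived at boot.  It assumes paddr_bits holds the CPUID-reported
 * physical address width and l1d_maxphysaddr the number of address bits the
 * L1D lookup actually honours; the real calculation lives elsewhere in this
 * series and is per-CPU-model.
 */
extern paddr_t l1tf_addr_mask, l1tf_safe_maddr;

static void __init l1tf_calculations_sketch(unsigned int paddr_bits,
                                            unsigned int l1d_maxphysaddr)
{
    /* Mask selecting the address bits of a PTE: bit 12 up to paddr_bits. */
    l1tf_addr_mask = ((1UL << paddr_bits) - 1) & PAGE_MASK;

    /*
     * Addresses at or above the highest address the L1D can tag cannot
     * produce an L1D hit, so are safe to leave visible in a not-present PTE.
     */
    l1tf_safe_maddr = (paddr_t)1 << l1d_maxphysaddr;
}
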
+static inline bool_t pv_l1tf_check_pte(struct domain *d, unsigned int level,
+ intpte_t pte)
+{
+ ASSERT(is_pv_domain(d));
+ ASSERT(!(pte & _PAGE_PRESENT));
+
+ if ( d->arch.pv_domain.check_l1tf && !paging_mode_sh_forced(d) &&
+ (((level > 1) && (pte & _PAGE_PSE)) || !is_l1tf_safe_maddr(pte)) )
+ {
+#ifdef CONFIG_SHADOW_PAGING
+ struct tasklet *t = &d->arch.paging.shadow.pv_l1tf_tasklet;
+
+ printk(XENLOG_G_WARNING
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Shadowing\n",
+ d->domain_id, level, pte);
+ /*
+ * Safety consideration for accessing tasklet.scheduled_on without the
+ * tasklet lock. This is a singleshot tasklet with the side effect of
+ * setting PG_SH_forced (checked just above). Multiple vcpus can race
+ * to schedule the tasklet, but if we observe it scheduled anywhere,
+ * that is good enough.
+ */
+ smp_rmb();
+ if ( !tasklet_is_scheduled(t) )
+ tasklet_schedule(t);
+#else
+ printk(XENLOG_G_ERR
+ "d%d L1TF-vulnerable L%ue %016"PRIx64" - Crashing\n",
+ d->domain_id, level, pte);
+ domain_crash(d);
+#endif
+ return 1;
+ }
+
+ return 0;
+}
+
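/*
 * Editor's worked examples, not part of the patch.  Assuming, purely for
 * illustration, that l1tf_addr_mask covers PTE bits 12-45 and that
 * l1tf_safe_maddr is 1ULL << 44, pv_l1tf_check_pte() classifies these
 * not-present PTE values as follows:
 *
 *   0x0000000000000000  address 0                        -> safe (mfn 0)
 *   0x0000000012345000  low, cacheable address           -> vulnerable
 *   0x0000100000000000  address >= l1tf_safe_maddr       -> safe
 *   0x0000000000000080  _PAGE_PSE set, at L2 or higher   -> vulnerable
 *
 * A vulnerable write is what triggers the shadowing (or crash) path above.
 */
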
+static inline bool_t pv_l1tf_check_l1e(struct domain *d, l1_pgentry_t l1e)
+{
+ return pv_l1tf_check_pte(d, 1, l1e.l1);
+}
+
+static inline bool_t pv_l1tf_check_l2e(struct domain *d, l2_pgentry_t l2e)
+{
+ return pv_l1tf_check_pte(d, 2, l2e.l2);
+}
+
+static inline bool_t pv_l1tf_check_l3e(struct domain *d, l3_pgentry_t l3e)
+{
+ return pv_l1tf_check_pte(d, 3, l3e.l3);
+}
+
+static inline bool_t pv_l1tf_check_l4e(struct domain *d, l4_pgentry_t l4e)
+{
+ return pv_l1tf_check_pte(d, 4, l4e.l4);
+}
+
void pv_l1tf_tasklet(unsigned long data);
static inline void pv_l1tf_domain_init(struct domain *d)