This patch introduces a new XENMEM_access_r_pw permission.
Functionally, it is similar to XENMEM_access_r, but for processors
with TERTIARY_EXEC_EPT_PAGING_WRITE support (Intel 12th Gen/Alder Lake
and later, Xeon 4th Gen/Sapphire Rapids and later), it also permits the
CPU to write to the page during guest page-table walks (e.g., updating
A/D bits) without triggering an EPT violation.
This is achieved by both enabling the EPT paging-write feature and
setting the EPT paging-write flag in the EPT leaf entry.
This feature provides a significant performance boost for
introspection tools that monitor guest page-table updates. Previously,
every page-table modification by the guest—including routine updates
like setting A/D bits—triggered an EPT violation, adding unnecessary
overhead. The new XENMEM_access_r_pw permission allows these
"uninteresting" updates to occur without EPT violations, improving
efficiency.
Additionally, this feature simplifies the handling of race conditions
in scenarios where an introspection tool:
- Sets an "invisible breakpoint" in the altp2m view for a function F.
- Monitors guest page-table updates to track whether the page
containing F is paged out.
- Encounters a cleared Access (A) bit on the page containing F while
the guest is about to execute the breakpoint.
In the current implementation:
- If xc_monitor_inguest_pagefault() is enabled, the introspection tool
must emulate both the breakpoint and the setting of the Access bit.
- If xc_monitor_inguest_pagefault() is disabled, Xen handles the EPT
violation without notifying the introspection tool, setting the
Access bit and emulating the instruction. However, Xen fetches the
instruction from the default view instead of the altp2m view,
potentially causing the breakpoint to be missed.
With this patch, setting XENMEM_access_r_pw for monitored guest
page-tables prevents EPT violations in these cases. This change
enhances performance and reduces complexity for introspection tools,
ensuring seamless breakpoint handling while tracking guest page-table
updates.
Signed-off-by: Petr Beneš <w1benny@gmail.com>
Acked-by: Tamas K Lengyel <tamas@tklengyel.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
ACCESS(rwx),
ACCESS(rx2rw),
ACCESS(n2rwx),
+ ACCESS(r_pw),
#undef ACCESS
};
break;
else
goto err;
+ case XENMEM_access_r_pw:
case XENMEM_access_rx2rw:
case XENMEM_access_rx:
case XENMEM_access_r:
violation = npfec.read_access || npfec.insn_fetch;
break;
case XENMEM_access_r:
+ case XENMEM_access_r_pw:
violation = npfec.write_access || npfec.insn_fetch;
break;
default:
ACCESS(rwx),
ACCESS(rx2rw),
ACCESS(n2rwx),
+ ACCESS(r_pw),
#undef ACCESS
};
e->p2m.read = 0;
break;
case p2m_access_r:
+ case p2m_access_r_pw:
e->p2m.write = 0;
e->p2m.xn = 1;
break;
violation = npfec.read_access || npfec.write_access || npfec.insn_fetch;
break;
case p2m_access_r:
+ case p2m_access_r_pw:
violation = npfec.write_access || npfec.insn_fetch;
break;
case p2m_access_w:
case XENMEM_access_r:
case XENMEM_access_n:
+ case XENMEM_access_r_pw:
if ( pfec & PFEC_write_access )
req.u.mem_access.flags |= MEM_ACCESS_R | MEM_ACCESS_W;
if ( pfec & PFEC_insn_fetch )
P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit");
P(cpu_has_vmx_virt_spec_ctrl, "Virtualize SPEC_CTRL");
+ P(cpu_has_vmx_ept_paging_write, "EPT Paging-Write");
#undef P
if ( !printed )
if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS )
{
- uint64_t opt = TERTIARY_EXEC_VIRT_SPEC_CTRL;
+ uint64_t opt = (TERTIARY_EXEC_VIRT_SPEC_CTRL |
+ TERTIARY_EXEC_EPT_PAGING_WRITE);
_vmx_tertiary_exec_control = adjust_vmx_controls2(
"Tertiary Exec Control", 0, opt,
#define cpu_has_vmx_virt_spec_ctrl \
(vmx_tertiary_exec_control & TERTIARY_EXEC_VIRT_SPEC_CTRL)
+#define cpu_has_vmx_ept_paging_write \
+ (vmx_tertiary_exec_control & TERTIARY_EXEC_EPT_PAGING_WRITE)
+
#define VMX_EPT_EXEC_ONLY_SUPPORTED 0x00000001
#define VMX_EPT_WALK_LENGTH_4_SUPPORTED 0x00000040
#define VMX_EPT_MEMORY_TYPE_UC 0x00000100
case p2m_access_r:
case p2m_access_rx:
case p2m_access_rx2rw:
+ case p2m_access_r_pw:
return IOMMUF_readable;
case p2m_access_w:
case p2m_access_n2rwx:
p2ma_10 = p2m_access_n;
break;
+ case p2m_access_r_pw:
+ p2ma_10 = p2m_access_r;
+ break;
default:
p2ma_10 = p2m_access_n;
/* For safety, remove all permissions. */
ACCESS(rwx),
ACCESS(rx2rw),
ACCESS(n2rwx),
+ ACCESS(r_pw),
#undef ACCESS
};
break;
case XENMEM_access_r:
+ case XENMEM_access_r_pw:
violation = data->flags & MEM_ACCESS_WX;
break;
ACCESS(rwx),
ACCESS(rx2rw),
ACCESS(n2rwx),
+ ACCESS(r_pw),
#undef ACCESS
};
}
/* Then restrict with access permissions */
+ entry->pw = 0;
switch ( entry->access )
{
case p2m_access_n:
case p2m_access_n2rwx:
entry->r = entry->w = entry->x = 0;
break;
+ case p2m_access_r_pw:
+ entry->pw = !!cpu_has_vmx_ept_paging_write;
+ fallthrough;
case p2m_access_r:
entry->w = entry->x = 0;
break;
* pausing the vcpu
*/
XENMEM_access_n2rwx,
+
+ /*
+ * Same as XENMEM_access_r, but on processors with
+ * the TERTIARY_EXEC_EPT_PAGING_WRITE support,
+ * CPU-initiated page-table walks can still
+ * write to it (e.g., update A/D bits)
+ */
+ XENMEM_access_r_pw,
+
/* Take the domain default */
XENMEM_access_default
} xenmem_access_t;
* generates an event but does not pause the
* vcpu */
+ p2m_access_r_pw = 10, /* Special: same as R, but on processors with
+ * the TERTIARY_EXEC_EPT_PAGING_WRITE support,
+ * CPU-initiated page-table walks can still
+ * write to it (e.g., update A/D bits)
+ */
+
/* NOTE: Assumed to be only 4 bits right now on x86. */
} p2m_access_t;