$(patsubst $(srcdir)/%,%,$(wildcard $(srcdir)/internals/*.html.in))
internals_html = $(internals_html_in:%.html.in=%.html)
+kbase_html_in = \
+ $(patsubst $(srcdir)/%,%,$(wildcard $(srcdir)/kbase/*.html.in))
+kbase_html = $(kbase_html_in:%.html.in=%.html)
+
# Since we ship pre-built html in the tarball, we must also
# ship the sources, even when those sources are themselves
# generated.
$(xml) $(qemu_xml) $(lxc_xml) $(admin_xml) $(fig) $(png) $(css) \
$(javascript) $(logofiles) \
$(internals_html_in) $(internals_html) $(fonts) \
+ $(kbase_html_in) $(kbase_html) \
aclperms.htmlinc \
hvsupport.pl \
$(schema_DATA)
$(addprefix $(srcdir)/,$(apihtml)) \
$(addprefix $(srcdir)/,$(devhelphtml)) \
$(addprefix $(srcdir)/,$(internals_html)) \
+ $(addprefix $(srcdir)/,$(kbase_html)) \
$(srcdir)/hvsupport.html.in $(srcdir)/aclperms.htmlinc
timestamp="$(shell if test -n "$$SOURCE_DATE_EPOCH"; \
lxc_api: $(srcdir)/libvirt-lxc-api.xml $(srcdir)/libvirt-lxc-refs.xml
admin_api: $(srcdir)/libvirt-admin-api.xml $(srcdir)/libvirt-admin-refs.xml
-web: $(dot_html) $(internals_html) html/index.html devhelp/index.html
+web: $(dot_html) $(internals_html) $(kbase_html) \
+ html/index.html devhelp/index.html
hvsupport.html: $(srcdir)/hvsupport.html.in
$(mkinstalldirs) $(DESTDIR)$(HTML_DIR)/internals
for f in $(internals_html); do \
$(INSTALL) -m 0644 $(srcdir)/$$f $(DESTDIR)$(HTML_DIR)/internals; done
+ $(mkinstalldirs) $(DESTDIR)$(HTML_DIR)/kbase
+ for f in $(kbase_html); do \
+ $(INSTALL) -m 0644 $(srcdir)/$$f $(DESTDIR)$(HTML_DIR)/kbase; done
$(mkinstalldirs) $(DESTDIR)$(DEVHELP_DIR)
for file in $(devhelphtml) $(devhelppng) $(devhelpcss); do \
$(INSTALL) -m 0644 $(srcdir)/$${file} $(DESTDIR)$(DEVHELP_DIR) ; \
for f in $(internals_html); do \
rm -f $(DESTDIR)$(HTML_DIR)/$$f; \
done
+ for f in $(kbase_html); do \
+ rm -f $(DESTDIR)$(HTML_DIR)/$$f; \
+ done
for f in $(devhelphtml) $(devhelppng) $(devhelpcss); do \
rm -f $(DESTDIR)$(DEVHELP_DIR)/$$(basename $$f); \
done
<dt><a href="uri.html">URI format</a></dt>
<dd>The URI formats used for connecting to libvirt</dd>
- <dt><a href="locking.html">Disk locking</a></dt>
- <dd>Ensuring exclusive guest access to disks with
- <a href="locking-lockd.html">virtlockd</a> or
- <a href="locking-sanlock.html">Sanlock</a></dd>
-
<dt><a href="cgroups.html">CGroups</a></dt>
<dd>Control groups integration</dd>
<dt><a href="hvsupport.html">Driver support</a></dt>
<dd>matrix of API support per hypervisor per release</dd>
- <dt><a href="secureusage.html">Secure usage</a></dt>
- <dd>Secure usage of the libvirt APIs</dd>
+ <dt><a href="kbase.html">Knowledge Base</a></dt>
+ <dd>Task oriented guides to key features</dd>
</dl>
</div>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body class="docs">
+ <h2>Knowledge base</h2>
+
+ <div class="panel">
+ <dl>
+ <dt><a href="kbase/locking.html">Disk locking</a></dt>
+ <dd>Ensuring exclusive guest access to disks with
+ <a href="kbase/locking-lockd.html">virtlockd</a> or
+ <a href="kbase/locking-sanlock.html">Sanlock</a></dd>
+
+ <dt><a href="kbase/secureusage.html">Secure usage</a></dt>
+ <dd>Secure usage of the libvirt APIs</dd>
+
+ <dt><a href="kbase/launch_security_sev.html">Launch security</a></dt>
+ <dd>Securely launching VMs with AMD SEV</dd>
+ </dl>
+ </div>
+
+ <br class="clear"/>
+
+ </body>
+</html>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+ <h1>Launch security with AMD SEV</h1>
+
+ <ul id="toc"></ul>
+
+ <p>
+      Storage encryption in modern public cloud computing is common practice.
+      However, from the point of view of a user of these cloud workloads, a
+      significant amount of trust needs to be put in the cloud platform's
+      security as well as its integrity (has the hypervisor been tampered
+      with?). For this reason there is an ever-rising demand for securing data
+      in use, i.e. memory encryption. One of the solutions addressing this is
+      AMD SEV.
+ </p>
+
+ <h2>AMD SEV</h2>
+ <p>
+      SEV (Secure Encrypted Virtualization) is a feature extension of AMD's
+      SME (Secure Memory Encryption) intended for KVM virtual machines and
+      supported primarily on AMD's EPYC CPU line. In contrast to SME, SEV uses
+      a unique memory encryption key for each VM. The encryption of memory
+      pages is completely transparent to the hypervisor and happens inside
+      dedicated hardware in the on-die memory controller. Each controller
+      includes a high-performance Advanced Encryption Standard (AES) engine
+      that encrypts data when it is written to DRAM and decrypts it when read.
+
+ For more details about the technology itself, you can visit
+ <a href="https://developer.amd.com/sev/">AMD's developer portal</a>.
+ </p>
+
+ <h2><a id="Host">Enabling SEV on the host</a></h2>
+ <p>
+ Before VMs can make use of the SEV feature you need to make sure your
+ AMD CPU does support SEV. You can check whether SEV is among the CPU
+ flags with:
+ </p>
+
+ <pre>
+$ cat /proc/cpuinfo | grep sev
+...
+sme ssbd sev ibpb</pre>
+
+ <p>
+      The next step is to enable SEV in the kernel, since it is disabled by default.
+ This is done by putting the following onto the kernel command line:
+ </p>
+
+ <pre>
+mem_encrypt=on kvm_amd.sev=1
+ </pre>
+
+ <p>
+      To make the changes persistent across reboots, append the above to the
+      variable holding the kernel command line parameters in
+      <code>/etc/default/grub</code> and regenerate the GRUB configuration:
+ </p>
+
+ <pre>
+$ cat /etc/default/grub
+...
+GRUB_CMDLINE_LINUX="... mem_encrypt=on kvm_amd.sev=1"
+$ grub2-mkconfig -o /boot/efi/EFI/<distro>/grub.cfg</pre>
+
+ <p>
+ <code>mem_encrypt=on</code> turns on the SME memory encryption feature on
+      the host, which protects against physical attacks on the hypervisor
+      memory. The <code>kvm_amd.sev</code> parameter actually enables SEV in
+      the kvm module. It can be set on the command line alongside
+      <code>mem_encrypt</code> as shown above, or it can be put into a
+      module config under <code>/etc/modprobe.d/</code>:
+ </p>
+
+ <pre>
+$ cat /etc/modprobe.d/sev.conf
+options kvm_amd sev=1
+ </pre>
+
+ <p>
+ After rebooting the host, you should see SEV being enabled in the kernel:
+ </p>
+
+ <pre>
+$ cat /sys/module/kvm_amd/parameters/sev
+1
+ </pre>
+
+ <h2><a id="Virt">Checking SEV support in the virt stack</a></h2>
+ <p>
+      <b>Note: All of the commands below need to be run with root privileges.</b>
+ </p>
+
+ <p>
+ First make sure you have the following packages in the specified versions:
+ </p>
+
+ <ul>
+ <li>
+ libvirt >= 4.5.0 (>5.1.0 recommended due to additional SEV bugfixes)
+ </li>
+ <li>
+ QEMU >= 2.12.0
+ </li>
+ </ul>
+ <p>
+ To confirm that the virtualization stack supports SEV, run the following:
+ </p>
+
+ <pre>
+# virsh domcapabilities
+<domainCapabilities>
+...
+ <features>
+ ...
+ <sev supported='yes'>
+ <cbitpos>47</cbitpos>
+ <reducedPhysBits>1</reducedPhysBits>
+ </sev>
+ ...
+ </features>
+</domainCapabilities></pre>
+ <p>
+      Note that if libvirt was already installed and libvirtd was running
+      before you enabled SEV in the kernel and rebooted the host, you need to
+      force libvirtd to re-probe both the host and QEMU capabilities. First
+      stop libvirtd:
+ </p>
+
+ <pre>
+# systemctl stop libvirtd.service
+ </pre>
+
+ <p>
+ Now you need to clean the capabilities cache:
+ </p>
+
+ <pre>
+# rm -f /var/cache/libvirt/qemu/capabilities/*
+ </pre>
+
+ <p>
+      If you now restart libvirtd, it will re-probe the capabilities. If you
+      then run:
+ </p>
+
+ <pre>
+# virsh domcapabilities
+ </pre>
+
+ <p>
+ SEV should be listed as supported. If you still see:
+ </p>
+
+ <pre>
+<sev supported='no'/>
+ </pre>
+
+ <p>
+ it means one of two things:
+ <ol>
+ <li>
+ libvirt does support SEV, but either QEMU or the host does not
+ </li>
+ <li>
+          you have libvirt <=5.1.0, which suffered from a
+          <code>'Permission denied'</code> error on <code>/dev/sev</code>
+          because the default permissions on the character device prevented
+          QEMU from opening it during capabilities probing. You can either
+          manually tweak the permissions so that QEMU has access to it or,
+          preferably, install libvirt 5.1.0 or higher.
+ </li>
+ </ol>
+ </p>
+
+ <h2><a id="Configuration">VM Configuration</a></h2>
+ <p>
+      SEV is enabled in the domain XML by specifying the
+      <a href="https://libvirt.org/formatdomain.html#launchSecurity"><launchSecurity></a>
+      element. However, specifying <code>launchSecurity</code> isn't
+      enough to boot an SEV VM. Further configuration requirements are
+      discussed below.
+ </p>
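+
+    <p>
+      For illustration, a minimal <code>launchSecurity</code> element might
+      look like the snippet below (taken from the full domain XML examples at
+      the end of this page); the exact <code>cbitpos</code> and
+      <code>reducedPhysBits</code> values should match what
+      <code>virsh domcapabilities</code> reports on your host, and the
+      <code>policy</code> value shown is just an example.
+    </p>
+
+    <pre>
+<launchSecurity type='sev'>
+  <cbitpos>47</cbitpos>
+  <reducedPhysBits>1</reducedPhysBits>
+  <policy>0x0003</policy>
+</launchSecurity></pre>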
+
+ <h3><a id="Machine">Machine type</a></h3>
+ <p>
+      Even though both Q35 and legacy PC machine types (for PC, see also the
+      "Virtio" section below) can be used with SEV, usage of the legacy PC
+      machine type is strongly discouraged, since depending on how your OVMF
+      package was built (e.g. including features like SecureBoot or SMM), Q35
+      may even be required.
+ </p>
+
+ <h5>Q35</h5>
+<pre>
+...
+<os>
+ <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
+ ...
+</os>
+...</pre>
+
+ <h5>i440fx (discouraged)</h5>
+ <pre>
+...
+<os>
+ <type arch='x86_64' machine='pc-i440fx-3.0'>hvm</type>
+ ...
+</os>
+...
+ </pre>
+
+ <h3><a id="Boot">Boot loader</a></h3>
+ <p>
+ SEV is only going to work with OVMF (UEFI), so you'll need to point libvirt to
+ the correct OVMF binary.
+ </p>
+ <pre>
+...
+<os>
+ <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
+ <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
+</os>
+...</pre>
+
+ <h3><a id="Memory">Memory</a></h3>
+ <p>
+      Internally, SEV expects that the encrypted memory pages won't be swapped
+      out or moved around, so the VM memory needs to be pinned in physical
+      RAM, which is handled by QEMU. Apart from that, certain memory regions
+      allocated by QEMU itself (UEFI pflash, device ROMs, video RAM, etc.)
+      have to be encrypted as well. This causes a conflict with how libvirt
+      tries to protect the host. By default, libvirt enforces a memory hard
+      limit on each VM's cgroup in order to protect the host from a malicious
+      QEMU process allocating and locking all the available memory. This limit
+      corresponds to the total memory allocation for the VM given by the
+      <code><currentMemory></code> element. However, trying to account for the
+      additional memory regions QEMU allocates when calculating the limit in
+      an automated manner is non-deterministic. One way to resolve this is to
+      set the hard limit manually.
+    </p>
+
+    <p>
+      Note: figuring out the right number so that your guest boots and isn't
+      killed is challenging, but 256MiB of extra memory over the total guest
+      RAM should suffice for most workloads and may serve as a good starting
+      point. For example, a domain with 4GB of memory and a 256MiB extra hard
+      limit would look like this:
+    </p>
+
+ <pre>
+# virsh edit <domain>
+<domain>
+ ...
+ <currentMemory unit='KiB'>4194304</currentMemory>
+ <memtune>
+ <hard_limit unit='KiB'>4456448</hard_limit>
+ </memtune>
+ ...
+</domain></pre>
+ <p>
+      There is another, preferred method of taking care of the limits:
+      using the <code><memoryBacking></code> element along with the
+      <code><locked/></code> subelement:
+ </p>
+
+ <pre>
+<domain>
+ ...
+ <memoryBacking>
+ <locked/>
+ </memoryBacking>
+ ...
+</domain></pre>
+
+ <p>
+      This tells libvirt not to enforce any hard limit on the VM's cgroup at
+      all (i.e. it is effectively unlimited). The obvious advantage is that
+      one doesn't need to determine the hard limit for every single
+      SEV-enabled VM. However, there is a significant security-related
+      drawback to this approach. Since no hard limit is applied, a malicious
+      QEMU process could perform a DoS attack by locking all of the host's
+      available memory. The way to avoid this issue and to protect the host
+      is to enforce a bigger hard limit on the parent cgroup containing all
+      of the VMs - on systemd this is <code>machine.slice</code>.
+ </p>
+
+ <pre>
+# systemctl set-property machine.slice MemoryHigh=<value></pre>
+
+ <p>
+ To put even stricter measures in place which would involve the OOM killer, use
+ <pre>
+# systemctl set-property machine.slice MemoryMax=<value></pre>
+      instead. Alternatively, you can create a systemd configuration drop-in
+      (don't forget to reload the systemd configuration in this case):
+ <pre>
+# cat << EOF > /etc/systemd/system.control/machine.slice.d/90-MemoryMax.conf
+MemoryMax=<value>
+EOF</pre>
+ The trade-off to keep in mind with the second approach is that the VMs
+ can still perform DoS on each other.
+ </p>
+
+ <h3><a id="Virtio">Virtio</a></h3>
+ <p>
+ In order to make virtio devices work, we need to enable emulated IOMMU
+ on the devices so that virtual DMA can work.
+ </p>
+
+ <pre>
+# virsh edit <domain>
+<domain>
+ ...
+ <controller type='virtio-serial' index='0'>
+ <driver iommu='on'/>
+ </controller>
+ <controller type='scsi' index='0' model='virtio-scsi'>
+ <driver iommu='on'/>
+ </controller>
+ ...
+ <memballoon model='virtio'>
+ <driver iommu='on'/>
+ </memballoon>
+ <rng model='virtio'>
+ <backend model='random'>/dev/urandom</backend>
+ <driver iommu='on'/>
+ </rng>
+ ...
+<domain></pre>
+
+ <p>
+      If you for some reason want to use the legacy PC machine type, further
+      changes to the virtio configuration are required, because SEV will not
+      work with Virtio <1.0. In libvirt, this is handled by using the
+      virtio-non-transitional device model (libvirt >= 5.2.0 required).
+    </p>
+
+    <p>
+      Note: some devices, like video devices, don't support the
+      non-transitional model, which means that a virtio GPU cannot be used.
+    </p>
+
+ <pre>
+<domain>
+ ...
+ <devices>
+ ...
+ <memballoon model='virtio-non-transitional'>
+ <driver iommu='on'/>
+ </memballoon>
+ </devices>
+ ...
+</domain></pre>
+
+ <h2><a id="Limitations">Limitations</a></h2>
+ <p>
+      Currently, the boot disk cannot be of type virtio-blk; instead,
+      virtio-scsi needs to be used if virtio is desired. This limitation is
+      expected to be lifted with future releases of the kernel (the kernel
+      used at the time of writing this article is 5.0.14).
+    </p>
+
+    <p>
+      If you still cannot start an SEV VM, it could be because of a wrong
+      SELinux label on the <code>/dev/sev</code> device with selinux-policy
+      <3.14.2.40, which prevents QEMU from touching the device. This can be
+      resolved by upgrading the package, tuning the SELinux policy rules
+      manually to allow svirt_t to access the device (see
+      <code>audit2allow</code> on how to do that), or putting SELinux into
+      permissive mode (discouraged).
+ </p>
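+
+    <p>
+      As a rough sketch of the <code>audit2allow</code> route (the module name
+      <code>sev_access</code> below is arbitrary), one possible workflow is to
+      try starting the VM once so the denial gets logged, then generate and
+      install a local policy module from the recorded AVC denials:
+    </p>
+
+    <pre>
+# ausearch -m avc -ts recent | audit2allow -M sev_access
+# semodule -i sev_access.pp</pre>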
+
+ <h2><a id="Examples">Full domain XML examples</a></h2>
+
+ <h5>Q35 machine</h5>
+ <pre>
+<domain type='kvm'>
+ <name>sev-dummy</name>
+ <memory unit='KiB'>4194304</memory>
+ <currentMemory unit='KiB'>4194304</currentMemory>
+ <memoryBacking>
+ <locked/>
+ </memoryBacking>
+ <vcpu placement='static'>4</vcpu>
+ <os>
+ <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
+ <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
+ <nvram>/var/lib/libvirt/qemu/nvram/sev-dummy_VARS.fd</nvram>
+ </os>
+ <features>
+ <acpi/>
+ <apic/>
+ <vmport state='off'/>
+ </features>
+ <cpu mode='host-model' check='partial'>
+ <model fallback='allow'/>
+ </cpu>
+ <clock offset='utc'>
+ <timer name='rtc' tickpolicy='catchup'/>
+ <timer name='pit' tickpolicy='delay'/>
+ <timer name='hpet' present='no'/>
+ </clock>
+ <on_poweroff>destroy</on_poweroff>
+ <on_reboot>restart</on_reboot>
+ <on_crash>destroy</on_crash>
+ <pm>
+ <suspend-to-mem enabled='no'/>
+ <suspend-to-disk enabled='no'/>
+ </pm>
+ <devices>
+ <emulator>/usr/bin/qemu-kvm</emulator>
+ <disk type='file' device='disk'>
+ <driver name='qemu' type='qcow2'/>
+ <source file='/var/lib/libvirt/images/sev-dummy.qcow2'/>
+ <target dev='sda' bus='scsi'/>
+ <boot order='1'/>
+ </disk>
+ <controller type='virtio-serial' index='0'>
+ <driver iommu='on'/>
+ </controller>
+ <controller type='scsi' index='0' model='virtio-scsi'>
+ <driver iommu='on'/>
+ </controller>
+ <interface type='network'>
+ <mac address='52:54:00:cc:56:90'/>
+ <source network='default'/>
+ <model type='virtio'/>
+ <driver iommu='on'/>
+ </interface>
+ <graphics type='spice' autoport='yes'>
+ <listen type='address'/>
+ <gl enable='no'/>
+ </graphics>
+ <video>
+ <model type='qxl'/>
+ </video>
+ <memballoon model='virtio'>
+ <driver iommu='on'/>
+ </memballoon>
+ <rng model='virtio'>
+ <driver iommu='on'/>
+ </rng>
+ </devices>
+ <launchSecurity type='sev'>
+ <cbitpos>47</cbitpos>
+ <reducedPhysBits>1</reducedPhysBits>
+ <policy>0x0003</policy>
+ </launchSecurity>
+</domain></pre>
+
+ <h5>PC-i440fx machine:</h5>
+ <pre>
+<domain type='kvm'>
+ <name>sev-dummy-legacy</name>
+ <memory unit='KiB'>4194304</memory>
+ <currentMemory unit='KiB'>4194304</currentMemory>
+ <memtune>
+ <hard_limit unit='KiB'>5242880</hard_limit>
+ </memtune>
+ <vcpu placement='static'>4</vcpu>
+ <os>
+ <type arch='x86_64' machine='pc-i440fx-3.0'>hvm</type>
+ <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
+ <nvram>/var/lib/libvirt/qemu/nvram/sev-dummy_VARS.fd</nvram>
+ <boot dev='hd'/>
+ </os>
+ <features>
+ <acpi/>
+ <apic/>
+ <vmport state='off'/>
+ </features>
+ <cpu mode='host-model' check='partial'>
+ <model fallback='allow'/>
+ </cpu>
+ <clock offset='utc'>
+ <timer name='rtc' tickpolicy='catchup'/>
+ <timer name='pit' tickpolicy='delay'/>
+ <timer name='hpet' present='no'/>
+ </clock>
+ <on_poweroff>destroy</on_poweroff>
+ <on_reboot>restart</on_reboot>
+ <on_crash>destroy</on_crash>
+ <pm>
+ <suspend-to-mem enabled='no'/>
+ <suspend-to-disk enabled='no'/>
+ </pm>
+ <devices>
+ <emulator>/usr/bin/qemu-kvm</emulator>
+ <disk type='file' device='disk'>
+ <driver name='qemu' type='qcow2'/>
+ <source file='/var/lib/libvirt/images/sev-dummy-seabios.qcow2'/>
+ <target dev='sda' bus='sata'/>
+ </disk>
+ <interface type='network'>
+ <mac address='52:54:00:d8:96:c8'/>
+ <source network='default'/>
+ <model type='virtio-non-transitional'/>
+ </interface>
+ <serial type='pty'>
+ <target type='isa-serial' port='0'>
+ <model name='isa-serial'/>
+ </target>
+ </serial>
+ <console type='pty'>
+ <target type='serial' port='0'/>
+ </console>
+ <input type='tablet' bus='usb'>
+ <address type='usb' bus='0' port='1'/>
+ </input>
+ <input type='mouse' bus='ps2'/>
+ <input type='keyboard' bus='ps2'/>
+ <graphics type='spice' autoport='yes'>
+ <listen type='address'/>
+ <gl enable='no'/>
+ </graphics>
+ <video>
+ <model type='qxl' ram='65536' vram='65536' vgamem='16384' heads='1' primary='yes'/>
+ </video>
+ <memballoon model='virtio-non-transitional'>
+ <driver iommu='on'/>
+ </memballoon>
+ <rng model='virtio-non-transitional'>
+ <driver iommu='on'/>
+ </rng>
+ </devices>
+ <launchSecurity type='sev'>
+ <cbitpos>47</cbitpos>
+ <reducedPhysBits>1</reducedPhysBits>
+ <policy>0x0003</policy>
+ </launchSecurity>
+</domain></pre>
+ </body>
+</html>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+ <h1>Virtual machine lock manager, virtlockd plugin</h1>
+
+ <ul id="toc"></ul>
+
+ <p>
+ This page describes use of the <code>virtlockd</code>
+ service as a <a href="locking.html">lock driver</a>
+ plugin for virtual machine disk mutual exclusion.
+ </p>
+
+ <h2><a id="background">virtlockd background</a></h2>
+
+ <p>
+ The virtlockd daemon is a single purpose binary which
+ focuses exclusively on the task of acquiring and holding
+ locks on behalf of running virtual machines. It is
+ designed to offer a low overhead, portable locking
+      scheme that can be used out of the box on virtualization
+      hosts with minimal configuration overhead. It makes
+ use of the POSIX fcntl advisory locking capability
+ to hold locks, which is supported by the majority of
+ commonly used filesystems.
+ </p>
+
+    <h2><a id="daemonsetup">virtlockd daemon setup</a></h2>
+
+ <p>
+      On most operating systems, the virtlockd daemon itself will not require
+      any upfront configuration work. It is installed by default
+      when libvirtd is present, and a systemd socket unit is
+      registered such that the daemon will be automatically
+      started when first required. On operating systems that predate systemd,
+      though, it will be necessary to start it at boot time,
+      prior to libvirtd being started. On RHEL/Fedora distros,
+      this can be achieved as follows:
+ </p>
+
+ <pre>
+# chkconfig virtlockd on
+# service virtlockd start
+ </pre>
+
+ <p>
+ The above instructions apply to the instance of virtlockd
+ that runs privileged, and is used by the libvirtd daemon
+      that runs privileged. If libvirtd is run as an unprivileged
+      user, it will always automatically spawn an unprivileged instance of
+      the virtlockd daemon too. This requires no
+ setup at all.
+ </p>
+
+ <h2><a id="lockdplugin">libvirt lockd plugin configuration</a></h2>
+
+ <p>
+      Once the virtlockd daemon is running, or set up to autostart,
+ the next step is to configure the libvirt lockd plugin.
+ There is a separate configuration file for each libvirt
+ driver that is using virtlockd. For QEMU, we will edit
+      <code>/etc/libvirt/qemu-lockd.conf</code>.
+ </p>
+
+ <p>
+ The default behaviour of the lockd plugin is to acquire locks
+ directly on the virtual disk images associated with the guest
+ <disk> elements. This ensures it can run out of the box
+ with no configuration, providing locking for disk images on
+ shared filesystems such as NFS. It does not provide any cross
+ host protection for storage that is backed by block devices,
+ since locks acquired on device nodes in /dev only apply within
+ the host. It may also be the case that the filesystem holding
+ the disk images is not capable of supporting fcntl locks.
+ </p>
+
+ <p>
+ To address these problems it is possible to tell lockd to
+ acquire locks on an indirect file. Essentially lockd will
+ calculate the SHA256 checksum of the fully qualified path,
+ and create a zero length file in a given directory whose
+ filename is the checksum. It will then acquire a lock on
+ that file. Assuming the block devices assigned to the guest
+ are using stable paths (eg /dev/disk/by-path/XXXXXXX) then
+ this will allow for locks to apply across hosts. This
+ feature can be enabled by setting a configuration setting
+ that specifies the directory in which to create the lock
+ files. The directory referred to should of course be
+ placed on a shared filesystem (eg NFS) that is accessible
+ to all hosts which can see the shared block devices.
+ </p>
+
+ <pre>
+$ su - root
+# augtool -s set \
+ /files/etc/libvirt/qemu-lockd.conf/file_lockspace_dir \
+ "/var/lib/libvirt/lockd/files"
+ </pre>
+
+ <p>
+      If the guests are using either LVM or SCSI block devices
+ for their virtual disks, there is a unique identifier
+ associated with each device. It is possible to tell lockd
+ to use this UUID as the basis for acquiring locks, rather
+ than the SHA256 sum of the filename. The benefit of this
+ is that the locking protection will work even if the file
+ paths to the given block device are different on each
+ host.
+ </p>
+
+ <pre>
+$ su - root
+# augtool -s set \
+ /files/etc/libvirt/qemu-lockd.conf/scsi_lockspace_dir \
+ "/var/lib/libvirt/lockd/scsi"
+# augtool -s set \
+ /files/etc/libvirt/qemu-lockd.conf/lvm_lockspace_dir \
+ "/var/lib/libvirt/lockd/lvm"
+ </pre>
+
+ <p>
+ It is important to remember that the changes made to the
+ <code>/etc/libvirt/qemu-lockd.conf</code> file must be
+ propagated to all hosts before any virtual machines are
+ launched on them. This ensures that all hosts are using
+      the same locking mechanism.
+ </p>
+
+ <h2><a id="qemuconfig">QEMU/KVM driver configuration</a></h2>
+
+ <p>
+      The QEMU driver has been capable of using the virtlockd plugin
+      since release <span>1.0.2</span>.
+ The out of the box configuration, however, currently
+ uses the <strong>nop</strong> lock manager plugin.
+ To get protection for disks, it is thus necessary
+ to reconfigure QEMU to activate the <strong>lockd</strong>
+ driver. This is achieved by editing the QEMU driver
+ configuration file (<code>/etc/libvirt/qemu.conf</code>)
+ and changing the <code>lock_manager</code> configuration
+ tunable.
+ </p>
+
+ <pre>
+$ su - root
+# augtool -s set /files/etc/libvirt/qemu.conf/lock_manager lockd
+# service libvirtd restart
+ </pre>
+
+ <p>
+ Every time you start a guest, the virtlockd daemon will acquire
+ locks on the disk files directly, or in one of the configured
+ lookaside directories based on SHA256 sum. To check that locks
+ are being acquired as expected, the <code>lslocks</code> tool
+ can be run.
+ </p>
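+
+    <p>
+      For example (a sketch only; the exact output columns depend on your
+      util-linux version), the locks held on behalf of running guests can be
+      listed with:
+    </p>
+
+    <pre>
+# lslocks | grep virtlockd
+    </pre>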
+
+ </body>
+</html>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+ <h1>Virtual machine lock manager, sanlock plugin</h1>
+
+ <ul id="toc"></ul>
+
+ <p>
+ This page describes use of the
+ <a href="https://fedorahosted.org/sanlock/">sanlock</a>
+ service as a <a href="locking.html">lock driver</a>
+ plugin for virtual machine disk mutual exclusion.
+ </p>
+
+ <h2><a id="sanlock">Sanlock daemon setup</a></h2>
+
+ <p>
+ On many operating systems, the <strong>sanlock</strong> plugin
+ is distributed in a sub-package which needs to be installed
+ separately from the main libvirt RPM. On a Fedora/RHEL host
+ this can be done with the <code>yum</code> command
+ </p>
+
+ <pre>
+$ su - root
+# yum install libvirt-lock-sanlock
+ </pre>
+
+ <p>
+ The next step is to start the sanlock daemon. For maximum
+ safety sanlock prefers to have a connection to a watchdog
+ daemon. This will cause the entire host to be rebooted in
+ the event that sanlock crashes / terminates abnormally.
+ To start the watchdog daemon on a Fedora/RHEL host
+ the following commands can be run:
+ </p>
+
+ <pre>
+$ su - root
+# chkconfig wdmd on
+# service wdmd start
+ </pre>
+
+ <p>
+ Once the watchdog is running, sanlock can be started
+ as follows
+ </p>
+
+ <pre>
+# chkconfig sanlock on
+# service sanlock start
+ </pre>
+
+ <p>
+ <em>Note:</em> if you wish to avoid the use of the
+ watchdog, add the following line to <code>/etc/sysconfig/sanlock</code>
+ before starting it
+ </p>
+
+ <pre>
+SANLOCKOPTS="-w 0"
+ </pre>
+
+ <p>
+ The sanlock daemon must be started on every single host
+ that will be running virtual machines. So repeat these
+ steps as necessary.
+ </p>
+
+ <h2><a id="sanlockplugin">libvirt sanlock plugin configuration</a></h2>
+
+ <p>
+ Once the sanlock daemon is running, the next step is to
+ configure the libvirt sanlock plugin. There is a separate
+ configuration file for each libvirt driver that is using
+      sanlock. For QEMU, we will edit <code>/etc/libvirt/qemu-sanlock.conf</code>.
+ There is one mandatory parameter that needs to be set,
+ the <code>host_id</code>. This is an integer between
+ 1 and 2000, which must be set to a <strong>unique</strong>
+ value on each host running virtual machines.
+ </p>
+
+ <pre>
+$ su - root
+# augtool -s set /files/etc/libvirt/qemu-sanlock.conf/host_id 1
+ </pre>
+
+ <p>
+ Repeat this on every host, changing <strong>1</strong> to a
+ unique value for the host.
+ </p>
+
+ <h2><a id="sanlockstorage">libvirt sanlock storage configuration</a></h2>
+
+ <p>
+ The sanlock plugin needs to create leases in a directory
+ that is on a filesystem shared between all hosts running
+ virtual machines. Obvious choices for this include NFS
+ or GFS2. The libvirt sanlock plugin expects its lease
+      directory to be at <code>/var/lib/libvirt/sanlock</code>,
+      so update the host's <code>/etc/fstab</code> to mount
+      a suitable shared/cluster filesystem at that location:
+ </p>
+
+ <pre>
+$ su - root
+# echo "some.nfs.server:/export/sanlock /var/lib/libvirt/sanlock nfs hard,nointr 0 0" >> /etc/fstab
+# mount /var/lib/libvirt/sanlock
+ </pre>
+
+ <p>
+      If your sanlock daemon happens to run with non-root
+      privileges, you need to tell this to libvirt so it
+      chowns created files correctly. This can be done by
+      setting the <code>user</code> and/or <code>group</code>
+      variables in the configuration file. The accepted value
+      ranges are described for the same
+ variables in <code>/etc/libvirt/qemu.conf</code>. For
+ example:
+ </p>
+
+ <pre>
+augtool -s set /files/etc/libvirt/qemu-sanlock.conf/user sanlock
+augtool -s set /files/etc/libvirt/qemu-sanlock.conf/group sanlock
+ </pre>
+
+ <p>
+      But remember that if this is an NFS share, it needs to be
+      exported with no_root_squash for chown (and possibly chmod)
+      to succeed.
+ </p>
+
+ <p>
+ In terms of storage requirements, if the filesystem
+ uses 512 byte sectors, you need to allow for <code>1MB</code>
+ of storage for each guest disk. So if you have a network
+ with 20 virtualization hosts, each running 50 virtual
+ machines and an average of 2 disks per guest, you will
+ need <code>20*50*2 == 2000 MB</code> of storage for
+ sanlock.
+ </p>
+
+
+ <p>
+      On one of the hosts on the network it is wise to set up
+      a cron job which runs the <code>virt-sanlock-cleanup</code>
+      script periodically. This script deletes any lease
+      files which are not currently in use by running virtual
+      machines, freeing up disk space on the shared filesystem.
+      Unless VM disks are very frequently created and deleted,
+ it should be sufficient to run the cleanup once a week.
+ </p>
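+
+    <p>
+      A weekly cron entry could look like the following sketch (the install
+      path of <code>virt-sanlock-cleanup</code> may differ between distros,
+      so check where your package puts it):
+    </p>
+
+    <pre>
+# cat /etc/cron.weekly/virt-sanlock-cleanup
+#!/bin/sh
+exec /usr/sbin/virt-sanlock-cleanup
+    </pre>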
+
+ <h2><a id="qemuconfig">QEMU/KVM driver configuration</a></h2>
+
+ <p>
+ The QEMU/KVM driver is fully integrated with the lock
+ manager framework as of release <span>0.9.3</span>.
+ The out of the box configuration, however, currently
+ uses the <strong>nop</strong> lock manager plugin.
+ To get protection for disks, it is thus necessary
+ to reconfigure QEMU to activate the <strong>sanlock</strong>
+ driver. This is achieved by editing the QEMU driver
+ configuration file (<code>/etc/libvirt/qemu.conf</code>)
+ and changing the <code>lock_manager</code> configuration
+ tunable.
+ </p>
+
+ <pre>
+$ su - root
+# augtool -s set /files/etc/libvirt/qemu.conf/lock_manager sanlock
+# service libvirtd restart
+ </pre>
+
+ <p>
+ If all went well, libvirtd will have talked to sanlock
+ and created the basic lockspace. This can be checked
+      by looking for the existence of the following file:
+ </p>
+
+ <pre>
+# ls /var/lib/libvirt/sanlock/
+__LIBVIRT__DISKS__
+ </pre>
+
+ <p>
+ Every time you start a guest, additional lease files will appear
+ in this directory, one for each virtual disk. The lease
+ files are named based on the MD5 checksum of the fully qualified
+ path of the virtual disk backing file. So if the guest is given
+ a disk backed by <code>/var/lib/libvirt/images/demo.img</code>
+ expect to see a lease <code>/var/lib/libvirt/sanlock/bfa0240911bc17753e0b473688822159</code>
+ </p>
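+
+    <p>
+      Assuming the MD5-of-path scheme described above, the expected lease
+      file name for a given disk path can be computed by hand, which is handy
+      when checking which lease belongs to which disk:
+    </p>
+
+    <pre>
+$ echo -n /var/lib/libvirt/images/demo.img | md5sum
+    </pre>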
+
+ <p>
+ It should be obvious that for locking to work correctly, every
+ host running virtual machines should have storage configured
+ in the same way. The easiest way to do this is to use the libvirt
+ storage pool capability to configure any NFS volumes, iSCSI targets,
+ or SCSI HBAs used for guest storage. Simply replicate the same
+ storage pool XML across every host. It is important that any
+ storage pools exposing block devices are configured to create
+      volume paths under <code>/dev/disk/by-path</code> to ensure
+ stable paths across hosts. An example iSCSI configuration
+ which ensures this is:
+ </p>
+
+ <pre>
+<pool type='iscsi'>
+ <name>myiscsipool</name>
+ <source>
+ <host name='192.168.254.8'/>
+ <device path='your-iscsi-target-iqn'/>
+ </source>
+ <target>
+ <path>/dev/disk/by-path</path>
+ </target>
+</pool>
+ </pre>
+
+ <h2><a id="domainconfig">Domain configuration</a></h2>
+
+ <p>
+ In case sanlock loses access to disk locks for some reason, it will
+ kill all domains that lost their locks. This default behavior may
+ be changed using
+ <a href="formatdomain.html#elementsEvents">on_lockfailure
+ element</a> in domain XML. When this element is present, sanlock
+ will call <code>sanlock_helper</code> (provided by libvirt) with
+ the specified action. This helper binary will connect to libvirtd
+ and thus it may need to authenticate if libvirtd was configured to
+ require that on the read-write UNIX socket. To provide the
+ appropriate credentials to sanlock_helper, a
+ <a href="auth.html#Auth_client_config">client authentication
+ file</a> needs to contain something like the following:
+ </p>
+ <pre>
+[auth-libvirt-localhost]
+credentials=sanlock
+
+[credentials-sanlock]
+authname=login
+password=password
+ </pre>
+ </body>
+</html>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+ <h1>Virtual machine lock manager</h1>
+
+ <ul id="toc"></ul>
+
+ <p>
+ Libvirt includes a framework for ensuring mutual exclusion
+ between virtual machines using host resources. Typically
+ this is used to prevent two VM processes from having concurrent
+ write access to the same disk image, as this would result in
+ data corruption if the guest was not using a cluster
+ aware filesystem.
+ </p>
+
+ <h2><a id="plugins">Lock manager plugins</a></h2>
+
+ <p>
+ The lock manager framework has a pluggable architecture,
+ to allow different locking technologies to be used.
+ </p>
+
+ <dl>
+ <dt><code>nop</code></dt>
+ <dd>This is a "no op" implementation which does absolutely
+ nothing. This can be used if mutual exclusion between
+ virtual machines is not required, or if it is being
+ solved at another level in the management stack.</dd>
+ <dt><code><a href="locking-lockd.html">lockd</a></code></dt>
+ <dd>This is the current preferred implementation shipped
+ with libvirt. It uses the <code>virtlockd</code> daemon
+ to manage locks using the POSIX fcntl() advisory locking
+ capability. As such it requires a shared filesystem of
+        some kind to be accessible to all hosts which share the
+ same image storage.</dd>
+ <dt><code><a href="locking-sanlock.html">sanlock</a></code></dt>
+ <dd>This is an alternative implementation preferred by
+ the oVirt project. It uses a disk paxos algorithm for
+ maintaining continuously renewed leases. In the default
+ setup it requires some shared filesystem, but it is
+ possible to use it in a manual mode where the management
+ application creates leases in SAN storage volumes.
+ </dd>
+ </dl>
+ </body>
+</html>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <body>
+
+ <h1>Secure Usage of Libvirt</h1>
+
+ <ul id="toc"></ul>
+
+ <p>
+ This page details information that application developers and
+ administrators of libvirt should be aware of when working with
+      libvirt that may have a bearing on the security of the system.
+ </p>
+
+
+ <h2><a id="diskimage">Disk image handling</a></h2>
+
+ <h3><a id="diskimageformat">Disk image format probing</a></h3>
+
+ <p>
+ Historically there have been multiple flaws in QEMU and most
+ projects using QEMU, related to handling of disk formats.
+ The problems occur when a guest is given a virtual disk backed
+ by raw disk format on the host. If the management application
+ on the host tries to auto-detect / probe the disk format, it
+ is vulnerable to a malicious guest which can write a qcow2
+ file header into its raw disk. If the management application
+ subsequently probes the disk, it will see it as a 'qcow2' disk
+ instead of a 'raw' disk. Since 'qcow2' disks can have a copy
+      on write backing file, such a flaw can be leveraged to read
+ arbitrary files on the host. The same type of flaw may occur
+ if the management application allows users to upload pre-created
+ raw images.
+ </p>
+
+ <p>
+ <strong>Recommendation:</strong> never attempt to automatically
+ detect the format of a disk image based on file contents which
+ are accessible to / originate from an untrusted source.
+ </p>
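+
+    <p>
+      In practice this means always stating the image format explicitly
+      rather than leaving it to be guessed; for instance, a raw image passed
+      to a guest would be declared along these lines (a sketch using the
+      <code><driver></code> element, with a hypothetical file name):
+    </p>
+
+    <pre>
+<disk type='file' device='disk'>
+  <driver name='qemu' type='raw'/>
+  <source file='/var/lib/libvirt/images/guest.img'/>
+  <target dev='vda' bus='virtio'/>
+</disk></pre>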
+
+ <h3><a id="diskimagebacking">Disk image backing files</a></h3>
+
+ <p>
+ If a management application allows users to upload pre-created
+ disk images in non-raw formats, it can be tricked into giving
+ the user access to arbitrary host files via the copy-on-write
+ backing file feature. This is because the qcow2 disk format
+ header contains a filename field which can point to any location.
+ It can also point to network protocols such as NBD, HTTP, GlusterFS,
+ RBD and more. This could allow for compromise of almost arbitrary
+ data accessible on the LAN/WAN.
+ </p>
+
+ <p>
+ <strong>Recommendation:</strong> always validate that a disk
+ image originating from an untrusted source has no backing
+ file set. If a backing file is seen, reject the image.
+ </p>
+
+ <h3><a id="diskimagesize">Disk image size validation</a></h3>
+
+ <p>
+ If an application allows users to upload pre-created disk
+ images in non-raw formats, it is essential to validate the
+ logical disk image size, rather than the physical disk
+ image size. Non-raw disk images have a grow-on-demand
+ capability, so a user can provide a qcow2 image that may
+ be only 1 MB in size, but is configured to grow to many
+ TB in size.
+ </p>
+
+ <p>
+ <strong>Recommendation:</strong> if receiving a non-raw disk
+ image from an untrusted source, validate the logical image
+ size stored in the disk image metadata against some finite
+ limit.
+ </p>
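+
+    <p>
+      Both this check and the backing file check above can be performed on
+      the image metadata without booting it, for example with
+      <code>qemu-img info</code> (a sketch; the format is passed explicitly
+      to avoid probing, and the file name is hypothetical):
+    </p>
+
+    <pre>
+$ qemu-img info -f qcow2 /path/to/untrusted.qcow2</pre>
+
+    <p>
+      Reject the image if the output reports a backing file, or if the
+      virtual size exceeds the limit you are willing to provision.
+    </p>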
+
+ <h3><a id="diskimageaccess">Disk image data access</a></h3>
+
+ <p>
+ If an untrusted disk image is ever mounted on the host OS by
+ a management application or administrator, this opens an
+ avenue of attack with which to potentially compromise the
+ host kernel. Filesystem drivers in OS kernels are often very
+ complex code and thus may have bugs lurking in them. With
+ Linux, there are a large number of filesystem drivers, many
+ of which attract little security analysis attention. Linux
+ will helpfully probe filesystem formats if not told to use an
+ explicit format, allowing an attacker the ability to target
+ specific weak filesystem drivers. Even commonly used and
+ widely audited filesystems such as <code>ext4</code> have had
+ <a href="https://lwn.net/Articles/538898/">bugs lurking in them</a>
+ undetected for years at a time.
+ </p>
+
+ <p>
+ <strong>Recommendation:</strong> if there is a need to access
+ the content of a disk image, use a single-use throwaway virtual
+ machine to access the data. Never mount disk images on the host
+ OS. Ideally make use of the <a href="http://libguestfs.org">libguestfs</a>
+      tools and APIs for accessing disks.
+ </p>
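+
+    <p>
+      As an illustration (assuming the libguestfs tools are installed; the
+      image path is hypothetical), files can be inspected read-only without
+      ever mounting the image on the host:
+    </p>
+
+    <pre>
+$ virt-ls -a /path/to/untrusted.img /etc
+$ virt-cat -a /path/to/untrusted.img /etc/hostname</pre>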
+
+ <h2><a id="migration">Guest migration network</a></h2>
+
+ <p>
+ Most hypervisors with support for guest migration between hosts
+ make use of one (or more) network connections. Typically the source
+ host will connect to some port on the target host to initiate the
+ migration. There may be separate connections for co-ordinating the
+ migration, transferring memory state and transferring storage.
+      If the network over which migration takes place is accessible to the
+      guest or to client applications, there is potential for data leakage
+ via packet snooping/capture. It is also possible for a malicious
+ guest or client to make attempts to connect to the target host
+ to trigger bogus migration operations, or at least inflict a denial
+ of service attack.
+ </p>
+
+ <p>
+ <strong>Recommendations:</strong> there are several things to consider
+ when performing migration
+ </p>
+
+ <ul>
+ <li>Use a specific address for establishing the migration
+ connection which is accessible only to the virtualization
+ hosts themselves, not libvirt clients or virtual guests.
+ Most hypervisors allow the management application to provide
+ the IP address of the target host as a way to
+ determine which network migration takes place on. This is
+ effectively the connect() socket address for the source host.</li>
+ <li>Use a specific address for listening for incoming migration
+ connections which is accessible only to the virtualization
+ hosts themselves, not libvirt clients or virtual guests.
+ Most hypervisors allow the management application to configure
+ the IP address on which the target host listens. This is
+ the bind() socket address for the target host.</li>
+ <li>Use an encrypted migration protocol. Some hypervisors
+ have support for encrypting the migration memory/storage
+ data. In other cases it can be tunnelled over the libvirtd
+ RPC protocol connections.</li>
+ </ul>
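+
+    <p>
+      With the QEMU driver, the first two recommendations roughly translate
+      into a <code>virsh migrate</code> invocation along these lines (a
+      sketch; the host name and the 192.168.100.0/24 migration network are
+      hypothetical):
+    </p>
+
+    <pre>
+# virsh migrate --live demo qemu+ssh://target.mgmt.example.com/system \
+      tcp://192.168.100.12 --listen-address 192.168.100.12</pre>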
+
+ <h2><a id="storage">Storage encryption</a></h2>
+
+ <p>
+ Virtual disk images will typically contain confidential data
+ belonging to the owner of the virtual machine. It is desirable
+ to protect this against data center administrators as much as
+ possible. For example, a rogue storage administrator may attempt
+ to access disk contents directly from a storage host, or a network
+      administrator/attacker may attempt to snoop on data packets relating
+ to storage access. Use of disk encryption on the virtualization
+ host can ensure that only the virtualization host administrator
+ can see the plain text contents of disk images.
+ </p>
+
+ <p>
+ <strong>Recommendation:</strong> make use of storage encryption
+ to protect non-local storage from attack by rogue network /
+ storage administrators or external attackers. This is particularly
+ important if the storage protocol itself does not offer any kind
+ of encryption capabilities.
+ </p>
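+
+    <p>
+      As one concrete option, libvirt can attach LUKS-encrypted images to
+      guests by referencing a passphrase stored as a libvirt secret. A sketch
+      of the disk XML (the file path and secret UUID are placeholders for
+      whatever your deployment uses):
+    </p>
+
+    <pre>
+<disk type='file' device='disk'>
+  <driver name='qemu' type='raw'/>
+  <source file='/var/lib/libvirt/images/confidential.img'>
+    <encryption format='luks'>
+      <secret type='passphrase' uuid='00000000-0000-0000-0000-000000000000'/>
+    </encryption>
+  </source>
+  <target dev='vda' bus='virtio'/>
+</disk></pre>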
+
+ </body>
+</html>
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
- <body>
- <h1>Launch security with AMD SEV</h1>
-
- <ul id="toc"></ul>
-
- <p>
- Storage encryption in modern public cloud computing is a common practice.
- However, from the point of view of a user of these cloud workloads, a
- significant amount of trust needs to be put in the cloud platform security as
- well as integrity (was the hypervisor tampered?). For this reason there's ever
- rising demand for securing data in use, i.e. memory encryption.
- One of the solutions addressing this matter is AMD SEV.
- </p>
-
- <h2>AMD SEV</h2>
- <p>
- SEV (Secure Encrypted Virtualization) is a feature extension of AMD's SME (Secure
- Memory Encryption) intended for KVM virtual machines which is supported
- primarily on AMD's EPYC CPU line. In contrast to SME, SEV uses a unique memory encryption
- key for each VM. The whole encryption of memory pages is completely transparent
- to the hypervisor and happens inside dedicated hardware in the on-die memory controller.
- Each controller includes a high-performance Advanced Encryption Standard
- (AES) engine that encrypts data when it is written to DRAM and decrypts it
- when read.
-
- For more details about the technology itself, you can visit
- <a href="https://developer.amd.com/sev/">AMD's developer portal</a>.
- </p>
-
- <h2><a id="Host">Enabling SEV on the host</a></h2>
- <p>
- Before VMs can make use of the SEV feature you need to make sure your
- AMD CPU does support SEV. You can check whether SEV is among the CPU
- flags with:
- </p>
-
- <pre>
-$ cat /proc/cpuinfo | grep sev
-...
-sme ssbd sev ibpb</pre>
-
- <p>
- Next step is to enable SEV in the kernel, because it is disabled by default.
- This is done by putting the following onto the kernel command line:
- </p>
-
- <pre>
-mem_encrypt=on kvm_amd.sev=1
- </pre>
-
- <p>
- To make the changes persistent, append the above to the variable holding
- parameters of the kernel command line in
- <code>/etc/default/grub</code> to preserve SEV settings across reboots
- </p>
-
- <pre>
-$ cat /etc/default/grub
-...
-GRUB_CMDLINE_LINUX="... mem_encrypt=on kvm_amd.sev=1"
-$ grub2-mkconfig -o /boot/efi/EFI/<distro>/grub.cfg</pre>
-
- <p>
- <code>mem_encrypt=on</code> turns on the SME memory encryption feature on
- the host which protects against the physical attack on the hypervisor
- memory. The <code>kvm_amd.sev</code> parameter actually enables SEV in
- the kvm module. It can be set on the command line alongside
- <code>mem_encrypt</code> like shown above, or it can be put into a
- module config under <code>/etc/modprobe.d/</code>
- </p>
-
- <pre>
-$ cat /etc/modprobe.d/sev.conf
-options kvm_amd sev=1
- </pre>
-
- <p>
- After rebooting the host, you should see SEV being enabled in the kernel:
- </p>
-
- <pre>
-$ cat /sys/module/kvm_amd/parameters/sev
-1
- </pre>
-
- <h2><a id="Virt">Checking SEV support in the virt stack</a></h2>
- <p>
- <b>Note: All of the commands bellow need to be run with root privileges.</b>
- </p>
-
- <p>
- First make sure you have the following packages in the specified versions:
- </p>
-
- <ul>
- <li>
- libvirt >= 4.5.0 (>5.1.0 recommended due to additional SEV bugfixes)
- </li>
- <li>
- QEMU >= 2.12.0
- </li>
- </ul>
- <p>
- To confirm that the virtualization stack supports SEV, run the following:
- </p>
-
- <pre>
-# virsh domcapabilities
-<domainCapabilities>
-...
- <features>
- ...
- <sev supported='yes'>
- <cbitpos>47</cbitpos>
- <reducedPhysBits>1</reducedPhysBits>
- </sev>
- ...
- </features>
-</domainCapabilities></pre>
- <p>
- Note that if libvirt was already installed and libvirtd running before enabling SEV in the kernel followed by the host reboot you need to force libvirtd
- to re-probe both the host and QEMU capabilities. First stop libvirtd:
- </p>
-
- <pre>
-# systemctl stop libvirtd.service
- </pre>
-
- <p>
- Now you need to clean the capabilities cache:
- </p>
-
- <pre>
-# rm -f /var/cache/libvirt/qemu/capabilities/*
- </pre>
-
- <p>
- If you now restart libvirtd, it will re-probe the capabilities and if
- you now run:
- </p>
-
- <pre>
-# virsh domcapabilities
- </pre>
-
- <p>
- SEV should be listed as supported. If you still see:
- </p>
-
- <pre>
-<sev supported='no'/>
- </pre>
-
- <p>
- it means one of two things:
- <ol>
- <li>
- libvirt does support SEV, but either QEMU or the host does not
- </li>
- <li>
- you have libvirt <=5.1.0 which suffered from getting a
- <code>'Permission denied'</code> on <code>/dev/sev</code> because
- of the default permissions on the character device which prevented
- QEMU from opening it during capabilities probing - you can either
- manually tweak the permissions so that QEMU has access to it or
- preferably install libvirt 5.1.0 or higher
- </li>
- </ol>
- </p>
-
- <h2><a id="Configuration">VM Configuration</a></h2>
- <p>
- SEV is enabled in the XML by specifying the
- <a href="https://libvirt.org/formatdomain.html#launchSecurity"><launchSecurity> </a> element. However, specifying <code>launchSecurity</code> isn't
- enough to boot an SEV VM. Further configuration requirements are discussed
- below.
- </p>
-
- <h3><a id="Machine">Machine type</a></h3>
- <p>
- Even though both Q35 and legacy PC machine types (for PC see also
- "virtio") can be used with SEV, usage of the legacy PC machine type is
- strongly discouraged, since depending on how your OVMF package was
- built (e.g. including features like SecureBoot or SMM) Q35 may even be
- required.
- </p>
-
- <h5>Q35</h5>
-<pre>
-...
-<os>
- <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
- ...
-</os>
-...</pre>
-
- <h5>i440fx (discouraged)</h5>
- <pre>
-...
-<os>
- <type arch='x86_64' machine='pc-i440fx-3.0'>hvm</type>
- ...
-</os>
-...
- </pre>
-
- <h3><a id="Boot">Boot loader</a></h3>
- <p>
- SEV is only going to work with OVMF (UEFI), so you'll need to point libvirt to
- the correct OVMF binary.
- </p>
- <pre>
-...
-<os>
- <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
- <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
-</os>
-...</pre>
-
- <h3><a id="Memory">Memory</a></h3>
- <p>
- Internally, SEV expects that the encrypted memory pages won't be swapped out or move
- around so the VM memory needs to be pinned in physical RAM which will be
- handled by QEMU. Apart from that, certain memory regions allocated by QEMU
- itself (UEFI pflash, device ROMs, video RAM, etc.) have to be encrypted as
- well. This causes a conflict in how libvirt tries to protect the host.
- By default, libvirt enforces a memory hard limit on each VM's cgroup in order
- to protect the host from malicious QEMU to allocate and lock all the available
- memory. This limit corresponds to the total memory allocation for the VM given
- by <code><currentMemory></code> element. However, trying to account for the additional
- memory regions QEMU allocates when calculating the limit in an automated manner
- is non-deterministic. One way to resolve this is to set the hard limit manually.
-
- <p>
- Note: Figuring out the right number so that your guest boots and isn't killed is
- challenging, but 256MiB extra memory over the total guest RAM should suffice for
- most workloads and may serve as a good starting point.
-
- For example, a domain with 4GB memory with a 256MiB extra hard limit would look
- like this:
- </p>
- </p>
-
- <pre>
-# virsh edit <domain>
-<domain>
- ...
- <currentMemory unit='KiB'>4194304</currentMemory>
- <memtune>
- <hard_limit unit='KiB'>4456448</hard_limit>
- </memtune>
- ...
-</domain></pre>
- <p>
- There's another, preferred method of taking care of the limits by
- using the<code><memoryBacking></code> element along with the
- <code><locked/></code> subelement:
- </p>
-
- <pre>
-<domain>
- ...
- <memoryBacking>
- <locked/>
- </memoryBacking>
- ...
-</domain></pre>
-
- <p>
- What that does is that it tells libvirt not to force any hard limit (well,
- unlimited) upon the VM cgroup. The obvious advantage is that one doesn't need
- to determine the hard limit for every single SEV-enabled VM. However, there is
- a significant security-related drawback to this approach. Since no hard limit
- is applied, a malicious QEMU could perform a DoS attack by locking all of the
- host's available memory. The way to avoid this issue and to protect the host is
- to enforce a bigger hard limit on the master cgroup containing all of the VMs
- - on systemd this is <code>machine.slice</code>.
- </p>
-
- <pre>
-# systemctl set-property machine.slice MemoryHigh=<value></pre>
-
- <p>
- To put even stricter measures in place which would involve the OOM killer, use
- <pre>
-# systemctl set-property machine.slice MemoryMax=<value></pre>
- instead. Alternatively, you can create a systemd config (don't forget
- to reload systemd configuration in this case):
- <pre>
-# cat << EOF > /etc/systemd/system.control/machine.slice.d/90-MemoryMax.conf
-MemoryMax=<value>
-EOF</pre>
- The trade-off to keep in mind with the second approach is that the VMs
- can still perform DoS on each other.
- </p>
-
- <h3><a id="Virtio">Virtio</a></h3>
- <p>
- In order to make virtio devices work, we need to enable emulated IOMMU
- on the devices so that virtual DMA can work.
- </p>
-
- <pre>
-# virsh edit <domain>
-<domain>
- ...
- <controller type='virtio-serial' index='0'>
- <driver iommu='on'/>
- </controller>
- <controller type='scsi' index='0' model='virtio-scsi'>
- <driver iommu='on'/>
- </controller>
- ...
- <memballoon model='virtio'>
- <driver iommu='on'/>
- </memballoon>
- <rng model='virtio'>
- <backend model='random'>/dev/urandom</backend>
- <driver iommu='on'/>
- </rng>
- ...
-<domain></pre>
-
- <p>
- If you for some reason want to use the legacy PC machine type, further changes
- to the virtio
- configuration is required, because SEV will not work with Virtio <1.0. In
- libvirt, this is handled by using the virtio-non-transitional device model
- (libvirt >= 5.2.0 required).
-
- <p>
- Note: some devices like video devices don't
- support non-transitional model, which means that virtio GPU cannot be used.
- </p>
- </p>
-
- <pre>
-<domain>
- ...
- <devices>
- ...
- <memballoon model='virtio-non-transitional'>
- <driver iommu='on'/>
- </memballoon>
- </devices>
- ...
-</domain></pre>
-
- <h2><a id="Limitations">Limitations</a></h2>
- <p>
- Currently, the boot disk cannot be of type virtio-blk, instead, virtio-scsi
- needs to be used if virtio is desired. This limitation is expected to be lifted
- with future releases of kernel (the kernel used at the time of writing the
- article is 5.0.14).
- If you still cannot start an SEV VM, it could be because of wrong SELinux label on the <code>/dev/sev</code> device with selinux-policy <3.14.2.40 which prevents QEMU from touching the device. This can be resolved by upgrading the package, tuning the selinux policy rules manually to allow svirt_t to access the device (see <code>audit2allow</code> on how to do that) or putting SELinux into permissive mode (discouraged).
- </p>
-
- <h2><a id="Examples">Full domain XML examples</a></h2>
-
- <h5>Q35 machine</h5>
- <pre>
-<domain type='kvm'>
- <name>sev-dummy</name>
- <memory unit='KiB'>4194304</memory>
- <currentMemory unit='KiB'>4194304</currentMemory>
- <memoryBacking>
- <locked/>
- </memoryBacking>
- <vcpu placement='static'>4</vcpu>
- <os>
- <type arch='x86_64' machine='pc-q35-3.0'>hvm</type>
- <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
- <nvram>/var/lib/libvirt/qemu/nvram/sev-dummy_VARS.fd</nvram>
- </os>
- <features>
- <acpi/>
- <apic/>
- <vmport state='off'/>
- </features>
- <cpu mode='host-model' check='partial'>
- <model fallback='allow'/>
- </cpu>
- <clock offset='utc'>
- <timer name='rtc' tickpolicy='catchup'/>
- <timer name='pit' tickpolicy='delay'/>
- <timer name='hpet' present='no'/>
- </clock>
- <on_poweroff>destroy</on_poweroff>
- <on_reboot>restart</on_reboot>
- <on_crash>destroy</on_crash>
- <pm>
- <suspend-to-mem enabled='no'/>
- <suspend-to-disk enabled='no'/>
- </pm>
- <devices>
- <emulator>/usr/bin/qemu-kvm</emulator>
- <disk type='file' device='disk'>
- <driver name='qemu' type='qcow2'/>
- <source file='/var/lib/libvirt/images/sev-dummy.qcow2'/>
- <target dev='sda' bus='scsi'/>
- <boot order='1'/>
- </disk>
- <controller type='virtio-serial' index='0'>
- <driver iommu='on'/>
- </controller>
- <controller type='scsi' index='0' model='virtio-scsi'>
- <driver iommu='on'/>
- </controller>
- <interface type='network'>
- <mac address='52:54:00:cc:56:90'/>
- <source network='default'/>
- <model type='virtio'/>
- <driver iommu='on'/>
- </interface>
- <graphics type='spice' autoport='yes'>
- <listen type='address'/>
- <gl enable='no'/>
- </graphics>
- <video>
- <model type='qxl'/>
- </video>
- <memballoon model='virtio'>
- <driver iommu='on'/>
- </memballoon>
- <rng model='virtio'>
- <driver iommu='on'/>
- </rng>
- </devices>
- <launchSecurity type='sev'>
- <cbitpos>47</cbitpos>
- <reducedPhysBits>1</reducedPhysBits>
- <policy>0x0003</policy>
- </launchSecurity>
-</domain></pre>
-
- <h5>PC-i440fx machine:</h5>
- <pre>
-<domain type='kvm'>
- <name>sev-dummy-legacy</name>
- <memory unit='KiB'>4194304</memory>
- <currentMemory unit='KiB'>4194304</currentMemory>
- <memtune>
- <hard_limit unit='KiB'>5242880</hard_limit>
- </memtune>
- <vcpu placement='static'>4</vcpu>
- <os>
- <type arch='x86_64' machine='pc-i440fx-3.0'>hvm</type>
- <loader readonly='yes' type='pflash'>/usr/share/edk2/ovmf/OVMF_CODE.fd</loader>
- <nvram>/var/lib/libvirt/qemu/nvram/sev-dummy_VARS.fd</nvram>
- <boot dev='hd'/>
- </os>
- <features>
- <acpi/>
- <apic/>
- <vmport state='off'/>
- </features>
- <cpu mode='host-model' check='partial'>
- <model fallback='allow'/>
- </cpu>
- <clock offset='utc'>
- <timer name='rtc' tickpolicy='catchup'/>
- <timer name='pit' tickpolicy='delay'/>
- <timer name='hpet' present='no'/>
- </clock>
- <on_poweroff>destroy</on_poweroff>
- <on_reboot>restart</on_reboot>
- <on_crash>destroy</on_crash>
- <pm>
- <suspend-to-mem enabled='no'/>
- <suspend-to-disk enabled='no'/>
- </pm>
- <devices>
- <emulator>/usr/bin/qemu-kvm</emulator>
- <disk type='file' device='disk'>
- <driver name='qemu' type='qcow2'/>
- <source file='/var/lib/libvirt/images/sev-dummy-seabios.qcow2'/>
- <target dev='sda' bus='sata'/>
- </disk>
- <interface type='network'>
- <mac address='52:54:00:d8:96:c8'/>
- <source network='default'/>
- <model type='virtio-non-transitional'/>
- </interface>
- <serial type='pty'>
- <target type='isa-serial' port='0'>
- <model name='isa-serial'/>
- </target>
- </serial>
- <console type='pty'>
- <target type='serial' port='0'/>
- </console>
- <input type='tablet' bus='usb'>
- <address type='usb' bus='0' port='1'/>
- </input>
- <input type='mouse' bus='ps2'/>
- <input type='keyboard' bus='ps2'/>
- <graphics type='spice' autoport='yes'>
- <listen type='address'/>
- <gl enable='no'/>
- </graphics>
- <video>
- <model type='qxl' ram='65536' vram='65536' vgamem='16384' heads='1' primary='yes'/>
- </video>
- <memballoon model='virtio-non-transitional'>
- <driver iommu='on'/>
- </memballoon>
- <rng model='virtio-non-transitional'>
- <driver iommu='on'/>
- </rng>
- </devices>
- <launchSecurity type='sev'>
- <cbitpos>47</cbitpos>
- <reducedPhysBits>1</reducedPhysBits>
- <policy>0x0003</policy>
- </launchSecurity>
-</domain></pre>
- </body>
-</html>
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
- <body>
- <h1>Virtual machine lock manager, virtlockd plugin</h1>
-
- <ul id="toc"></ul>
-
- <p>
- This page describes use of the <code>virtlockd</code>
- service as a <a href="locking.html">lock driver</a>
- plugin for virtual machine disk mutual exclusion.
- </p>
-
- <h2><a id="background">virtlockd background</a></h2>
-
- <p>
- The virtlockd daemon is a single purpose binary which
- focuses exclusively on the task of acquiring and holding
- locks on behalf of running virtual machines. It is
- designed to offer a low overhead, portable locking
- scheme that can be used out of the box on virtualization
- hosts with minimal configuration overhead. It makes
- use of the POSIX fcntl advisory locking capability
- to hold locks, which is supported by the majority of
- commonly used filesystems.
- </p>
-
- <h2><a id="sanlock">virtlockd daemon setup</a></h2>
-
- <p>
- On most operating systems, the virtlockd daemon itself will
- not require any upfront configuration work. It is installed
- by default when libvirtd is present, and a systemd socket
- unit is registered such that the daemon will be automatically
- started when first required. On operating systems that
- predate systemd, though, it will be necessary to start it at
- boot time, prior to libvirtd being started. On RHEL/Fedora
- distros, this can be achieved as follows:
- </p>
-
- <pre>
-# chkconfig virtlockd on
-# service virtlockd start
- </pre>
-
- <p>
- The above instructions apply to the privileged instance of
- virtlockd, which is used by the privileged libvirtd daemon.
- When libvirtd runs as an unprivileged user, it will
- automatically spawn an unprivileged instance of the
- virtlockd daemon too, which requires no setup at all.
-
- <h2><a id="lockdplugin">libvirt lockd plugin configuration</a></h2>
-
- <p>
- Once the virtlockd daemon is running, or set up to autostart,
- the next step is to configure the libvirt lockd plugin.
- There is a separate configuration file for each libvirt
- driver that is using virtlockd. For QEMU, we will edit
- <code>/etc/libvirt/qemu-lockd.conf</code>.
- </p>
-
- <p>
- The default behaviour of the lockd plugin is to acquire locks
- directly on the virtual disk images associated with the guest
- <disk> elements. This ensures it can run out of the box
- with no configuration, providing locking for disk images on
- shared filesystems such as NFS. It does not provide any cross
- host protection for storage that is backed by block devices,
- since locks acquired on device nodes in /dev only apply within
- the host. It may also be the case that the filesystem holding
- the disk images is not capable of supporting fcntl locks.
- </p>
-
- <p>
- To address these problems it is possible to tell lockd to
- acquire locks on an indirect file. Essentially lockd will
- calculate the SHA256 checksum of the fully qualified path,
- and create a zero length file in a given directory whose
- filename is the checksum. It will then acquire a lock on
- that file. Assuming the block devices assigned to the guest
- are using stable paths (e.g. /dev/disk/by-path/XXXXXXX),
- this will allow locks to apply across hosts. This
- feature can be enabled by setting a configuration parameter
- that specifies the directory in which to create the lock
- files. The directory referred to should of course be
- placed on a shared filesystem (e.g. NFS) that is accessible
- to all hosts which can see the shared block devices.
- </p>
-
- <pre>
-$ su - root
-# augtool -s set \
- /files/etc/libvirt/qemu-lockd.conf/file_lockspace_dir \
- "/var/lib/libvirt/lockd/files"
- </pre>
-
- <p>
- If the guests are using either LVM or SCSI block devices
- for their virtual disks, there is a unique identifier
- associated with each device. It is possible to tell lockd
- to use this UUID as the basis for acquiring locks, rather
- than the SHA256 sum of the filename. The benefit of this
- is that the locking protection will work even if the file
- paths to the given block device are different on each
- host.
- </p>
-
- <pre>
-$ su - root
-# augtool -s set \
- /files/etc/libvirt/qemu-lockd.conf/scsi_lockspace_dir \
- "/var/lib/libvirt/lockd/scsi"
-# augtool -s set \
- /files/etc/libvirt/qemu-lockd.conf/lvm_lockspace_dir \
- "/var/lib/libvirt/lockd/lvm"
- </pre>
-
- <p>
- It is important to remember that the changes made to the
- <code>/etc/libvirt/qemu-lockd.conf</code> file must be
- propagated to all hosts before any virtual machines are
- launched on them. This ensures that all hosts are using
- the same locking mechanism.
- </p>
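-
- <p>
- For example, assuming password-less SSH root access and purely
- illustrative host names, the file could be copied to the other
- virtualization hosts as follows:
- </p>
-
- <pre>
-$ su - root
-# for host in host2.example.com host3.example.com; do \
-    scp /etc/libvirt/qemu-lockd.conf root@$host:/etc/libvirt/; \
-  done
- </pre>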
-
- <h2><a id="qemuconfig">QEMU/KVM driver configuration</a></h2>
-
- <p>
- The QEMU driver is capable of using the virtlockd plugin
- since release <span>1.0.2</span>.
- The out of the box configuration, however, currently
- uses the <strong>nop</strong> lock manager plugin.
- To get protection for disks, it is thus necessary
- to reconfigure QEMU to activate the <strong>lockd</strong>
- driver. This is achieved by editing the QEMU driver
- configuration file (<code>/etc/libvirt/qemu.conf</code>)
- and changing the <code>lock_manager</code> configuration
- tunable.
- </p>
-
- <pre>
-$ su - root
-# augtool -s set /files/etc/libvirt/qemu.conf/lock_manager lockd
-# service libvirtd restart
- </pre>
-
- <p>
- Every time you start a guest, the virtlockd daemon will acquire
- locks on the disk files directly, or in one of the configured
- lookaside directories based on SHA256 sum. To check that locks
- are being acquired as expected, the <code>lslocks</code> tool
- can be run.
- </p>
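-
- <p>
- For example, the locks held by the virtlockd daemon can be
- listed by filtering the <code>lslocks</code> output on the
- process name:
- </p>
-
- <pre>
-$ su - root
-# lslocks | grep virtlockd
- </pre>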
-
- </body>
-</html>
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
- <body>
- <h1>Virtual machine lock manager, sanlock plugin</h1>
-
- <ul id="toc"></ul>
-
- <p>
- This page describes use of the
- <a href="https://fedorahosted.org/sanlock/">sanlock</a>
- service as a <a href="locking.html">lock driver</a>
- plugin for virtual machine disk mutual exclusion.
- </p>
-
- <h2><a id="sanlock">Sanlock daemon setup</a></h2>
-
- <p>
- On many operating systems, the <strong>sanlock</strong> plugin
- is distributed in a sub-package which needs to be installed
- separately from the main libvirt RPM. On a Fedora/RHEL host
- this can be done with the <code>yum</code> command
- </p>
-
- <pre>
-$ su - root
-# yum install libvirt-lock-sanlock
- </pre>
-
- <p>
- The next step is to start the sanlock daemon. For maximum
- safety sanlock prefers to have a connection to a watchdog
- daemon. This will cause the entire host to be rebooted in
- the event that sanlock crashes / terminates abnormally.
- To start the watchdog daemon on a Fedora/RHEL host
- the following commands can be run:
- </p>
-
- <pre>
-$ su - root
-# chkconfig wdmd on
-# service wdmd start
- </pre>
-
- <p>
- Once the watchdog is running, sanlock can be started
- as follows
- </p>
-
- <pre>
-# chkconfig sanlock on
-# service sanlock start
- </pre>
-
- <p>
- <em>Note:</em> if you wish to avoid the use of the
- watchdog, add the following line to <code>/etc/sysconfig/sanlock</code>
- before starting it
- </p>
-
- <pre>
-SANLOCKOPTS="-w 0"
- </pre>
-
- <p>
- The sanlock daemon must be started on every single host
- that will be running virtual machines. So repeat these
- steps as necessary.
- </p>
-
- <h2><a id="sanlockplugin">libvirt sanlock plugin configuration</a></h2>
-
- <p>
- Once the sanlock daemon is running, the next step is to
- configure the libvirt sanlock plugin. There is a separate
- configuration file for each libvirt driver that is using
- sanlock. For QEMU, we will edit <code>/etc/libvirt/qemu-sanlock.conf</code>.
- There is one mandatory parameter that needs to be set,
- the <code>host_id</code>. This is an integer between
- 1 and 2000, which must be set to a <strong>unique</strong>
- value on each host running virtual machines.
- </p>
-
- <pre>
-$ su - root
-# augtool -s set /files/etc/libvirt/qemu-sanlock.conf/host_id 1
- </pre>
-
- <p>
- Repeat this on every host, changing <strong>1</strong> to a
- unique value for the host.
- </p>
-
- <h2><a id="sanlockstorage">libvirt sanlock storage configuration</a></h2>
-
- <p>
- The sanlock plugin needs to create leases in a directory
- that is on a filesystem shared between all hosts running
- virtual machines. Obvious choices for this include NFS
- or GFS2. The libvirt sanlock plugin expects its lease
- directory to be at <code>/var/lib/libvirt/sanlock</code>,
- so update the host's <code>/etc/fstab</code> to mount
- a suitable shared/cluster filesystem at that location:
- </p>
-
- <pre>
-$ su - root
-# echo "some.nfs.server:/export/sanlock /var/lib/libvirt/sanlock nfs hard,nointr 0 0" >> /etc/fstab
-# mount /var/lib/libvirt/sanlock
- </pre>
-
- <p>
- If your sanlock daemon happens to run with non-root
- privileges, you need to tell libvirt about it so that it
- chowns created files correctly. This can be done by
- setting the <code>user</code> and/or <code>group</code>
- variables in the configuration file. The range of accepted
- values is described alongside the same variables in
- <code>/etc/libvirt/qemu.conf</code>. For
- example:
- </p>
-
- <pre>
-augtool -s set /files/etc/libvirt/qemu-sanlock.conf/user sanlock
-augtool -s set /files/etc/libvirt/qemu-sanlock.conf/group sanlock
- </pre>
-
- <p>
- But remember that if this is an NFS share, it needs to be
- exported with no_root_squash for the chown (and possibly
- chmod) to succeed.
- </p>
-
- <p>
- In terms of storage requirements, if the filesystem
- uses 512 byte sectors, you need to allow for <code>1MB</code>
- of storage for each guest disk. So if you have a network
- with 20 virtualization hosts, each running 50 virtual
- machines and an average of 2 disks per guest, you will
- need <code>20*50*2 == 2000 MB</code> of storage for
- sanlock.
- </p>
-
- <p>
- On one of the hosts on the network it is wise to set up
- a cron job which runs the <code>virt-sanlock-cleanup</code>
- script periodically. This script deletes any lease
- files which are not currently in use by running virtual
- machines, freeing up disk space on the shared filesystem.
- Unless VM disks are very frequently created and deleted,
- it should be sufficient to run the cleanup once a week.
- </p>
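-
- <p>
- For example, a weekly cron job could be installed like this
- (assuming the script is shipped as
- <code>/usr/sbin/virt-sanlock-cleanup</code>; adjust the path to
- match your distribution):
- </p>
-
- <pre>
-$ su - root
-# echo '@weekly root /usr/sbin/virt-sanlock-cleanup' > /etc/cron.d/virt-sanlock-cleanup
- </pre>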
-
- <h2><a id="qemuconfig">QEMU/KVM driver configuration</a></h2>
-
- <p>
- The QEMU/KVM driver is fully integrated with the lock
- manager framework as of release <span>0.9.3</span>.
- The out of the box configuration, however, currently
- uses the <strong>nop</strong> lock manager plugin.
- To get protection for disks, it is thus necessary
- to reconfigure QEMU to activate the <strong>sanlock</strong>
- driver. This is achieved by editing the QEMU driver
- configuration file (<code>/etc/libvirt/qemu.conf</code>)
- and changing the <code>lock_manager</code> configuration
- tunable.
- </p>
-
- <pre>
-$ su - root
-# augtool -s set /files/etc/libvirt/qemu.conf/lock_manager sanlock
-# service libvirtd restart
- </pre>
-
- <p>
- If all went well, libvirtd will have talked to sanlock
- and created the basic lockspace. This can be checked
- by looking for the existence of the following file:
- </p>
-
- <pre>
-# ls /var/lib/libvirt/sanlock/
-__LIBVIRT__DISKS__
- </pre>
-
- <p>
- Every time you start a guest, additional lease files will appear
- in this directory, one for each virtual disk. The lease
- files are named based on the MD5 checksum of the fully qualified
- path of the virtual disk backing file. So if the guest is given
- a disk backed by <code>/var/lib/libvirt/images/demo.img</code>,
- expect to see a lease <code>/var/lib/libvirt/sanlock/bfa0240911bc17753e0b473688822159</code>.
- </p>
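-
- <p>
- The expected lease file name for a given disk path can be
- computed by hand, which is a quick way to check that leases are
- being created as expected (assuming the path in the domain XML
- is already fully qualified):
- </p>
-
- <pre>
-$ printf '%s' /var/lib/libvirt/images/demo.img | md5sum
- </pre>
-
- <p>
- The resulting checksum should match the lease file name shown
- in the example above.
- </p>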
-
- <p>
- It should be obvious that for locking to work correctly, every
- host running virtual machines should have storage configured
- in the same way. The easiest way to do this is to use the libvirt
- storage pool capability to configure any NFS volumes, iSCSI targets,
- or SCSI HBAs used for guest storage. Simply replicate the same
- storage pool XML across every host. It is important that any
- storage pools exposing block devices are configured to create
- volume paths under <code>/dev/disk/by-path</code> to ensure
- stable paths across hosts. An example iSCSI configuration
- which ensures this is:
- </p>
-
- <pre>
-<pool type='iscsi'>
- <name>myiscsipool</name>
- <source>
- <host name='192.168.254.8'/>
- <device path='your-iscsi-target-iqn'/>
- </source>
- <target>
- <path>/dev/disk/by-path</path>
- </target>
-</pool>
- </pre>
-
- <h2><a id="domainconfig">Domain configuration</a></h2>
-
- <p>
- In case sanlock loses access to disk locks for some reason, it will
- kill all domains that lost their locks. This default behavior may
- be changed using
- <a href="formatdomain.html#elementsEvents">on_lockfailure
- element</a> in domain XML. When this element is present, sanlock
- will call <code>sanlock_helper</code> (provided by libvirt) with
- the specified action. This helper binary will connect to libvirtd
- and thus it may need to authenticate if libvirtd was configured to
- require that on the read-write UNIX socket. To provide the
- appropriate credentials to sanlock_helper, a
- <a href="auth.html#Auth_client_config">client authentication
- file</a> needs to contain something like the following:
- </p>
- <pre>
-[auth-libvirt-localhost]
-credentials=sanlock
-
-[credentials-sanlock]
-authname=login
-password=password
- </pre>
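-
- <p>
- For example, adding the following element at the top level of
- the domain XML asks for the guest to be paused, rather than
- killed, when it loses its disk locks, so that it can be resumed
- once the leases have been re-acquired (the full list of accepted
- actions is given in the domain XML documentation linked above):
- </p>
-
- <pre>
-<on_lockfailure>pause</on_lockfailure>
- </pre>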
- </body>
-</html>
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
- <body>
- <h1>Virtual machine lock manager</h1>
-
- <ul id="toc"></ul>
-
- <p>
- Libvirt includes a framework for ensuring mutual exclusion
- between virtual machines using host resources. Typically
- this is used to prevent two VM processes from having concurrent
- write access to the same disk image, as this would result in
- data corruption if the guest was not using a cluster
- aware filesystem.
- </p>
-
- <h2><a id="plugins">Lock manager plugins</a></h2>
-
- <p>
- The lock manager framework has a pluggable architecture,
- to allow different locking technologies to be used.
- </p>
-
- <dl>
- <dt><code>nop</code></dt>
- <dd>This is a "no op" implementation which does absolutely
- nothing. This can be used if mutual exclusion between
- virtual machines is not required, or if it is being
- solved at another level in the management stack.</dd>
- <dt><code><a href="locking-lockd.html">lockd</a></code></dt>
- <dd>This is the current preferred implementation shipped
- with libvirt. It uses the <code>virtlockd</code> daemon
- to manage locks using the POSIX fcntl() advisory locking
- capability. As such it requires a shared filesystem of
- some kind to be accessible to all hosts which share the
- same image storage.</dd>
- <dt><code><a href="locking-sanlock.html">sanlock</a></code></dt>
- <dd>This is an alternative implementation preferred by
- the oVirt project. It uses a disk paxos algorithm for
- maintaining continuously renewed leases. In the default
- setup it requires some shared filesystem, but it is
- possible to use it in a manual mode where the management
- application creates leases in SAN storage volumes.
- </dd>
- </dl>
- </body>
-</html>
+++ /dev/null
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml">
- <body>
-
- <h1>Secure Usage of Libvirt</h1>
-
- <ul id="toc"></ul>
-
- <p>
- This page details information that application developers and
- administrators of libvirt should be aware of when working with
- libvirt, that may have a bearing on security of the system.
- </p>
-
- <h2><a id="diskimage">Disk image handling</a></h2>
-
- <h3><a id="diskimageformat">Disk image format probing</a></h3>
-
- <p>
- Historically there have been multiple flaws in QEMU and most
- projects using QEMU, related to handling of disk formats.
- The problems occur when a guest is given a virtual disk backed
- by raw disk format on the host. If the management application
- on the host tries to auto-detect / probe the disk format, it
- is vulnerable to a malicious guest which can write a qcow2
- file header into its raw disk. If the management application
- subsequently probes the disk, it will see it as a 'qcow2' disk
- instead of a 'raw' disk. Since 'qcow2' disks can have a copy
- on write backing file, such a flaw can be leveraged to read
- arbitrary files on the host. The same type of flaw may occur
- if the management application allows users to upload pre-created
- raw images.
- </p>
-
- <p>
- <strong>Recommendation:</strong> never attempt to automatically
- detect the format of a disk image based on file contents which
- are accessible to / originate from an untrusted source.
- </p>
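-
- <p>
- One way to follow this recommendation with libvirt is to always
- specify an explicit format in the disk <driver> element of
- the domain XML, rather than leaving it out and relying on
- probing. For example, for a raw image (the file path is
- illustrative):
- </p>
-
- <pre>
-<disk type='file' device='disk'>
-  <driver name='qemu' type='raw'/>
-  <source file='/var/lib/libvirt/images/guest.img'/>
-  <target dev='vda' bus='virtio'/>
-</disk>
- </pre>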
-
- <h3><a id="diskimagebacking">Disk image backing files</a></h3>
-
- <p>
- If a management application allows users to upload pre-created
- disk images in non-raw formats, it can be tricked into giving
- the user access to arbitrary host files via the copy-on-write
- backing file feature. This is because the qcow2 disk format
- header contains a filename field which can point to any location.
- It can also point to network protocols such as NBD, HTTP, GlusterFS,
- RBD and more. This could allow for compromise of almost arbitrary
- data accessible on the LAN/WAN.
- </p>
-
- <p>
- <strong>Recommendation:</strong> always validate that a disk
- image originating from an untrusted source has no backing
- file set. If a backing file is seen, reject the image.
- </p>
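-
- <p>
- One way to perform this check is with <code>qemu-img</code>,
- passing the expected format explicitly so that no probing takes
- place (the file name is illustrative):
- </p>
-
- <pre>
-$ qemu-img info -f qcow2 --output=json untrusted.qcow2 | grep backing
- </pre>
-
- <p>
- If the output contains a <code>backing-filename</code> field,
- the image has a backing file and should be rejected.
- </p>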
-
- <h3><a id="diskimagesize">Disk image size validation</a></h3>
-
- <p>
- If an application allows users to upload pre-created disk
- images in non-raw formats, it is essential to validate the
- logical disk image size, rather than the physical disk
- image size. Non-raw disk images have a grow-on-demand
- capability, so a user can provide a qcow2 image that may
- be only 1 MB in size, but is configured to grow to many
- TB in size.
- </p>
-
- <p>
- <strong>Recommendation:</strong> if receiving a non-raw disk
- image from an untrusted source, validate the logical image
- size stored in the disk image metadata against some finite
- limit.
- </p>
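-
- <p>
- The logical size can likewise be read from the
- <code>virtual-size</code> field of the <code>qemu-img</code>
- JSON output and compared against a limit (the 100 GiB threshold
- below is purely illustrative):
- </p>
-
- <pre>
-$ size=$(qemu-img info -f qcow2 --output=json untrusted.qcow2 | \
-    python3 -c 'import json,sys; print(json.load(sys.stdin)["virtual-size"])')
-$ test "$size" -le $((100 * 1024 * 1024 * 1024)) || echo "too large, rejecting"
- </pre>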
-
- <h3><a id="diskimageaccess">Disk image data access</a></h3>
-
- <p>
- If an untrusted disk image is ever mounted on the host OS by
- a management application or administrator, this opens an
- avenue of attack with which to potentially compromise the
- host kernel. Filesystem drivers in OS kernels are often very
- complex code and thus may have bugs lurking in them. With
- Linux, there are a large number of filesystem drivers, many
- of which attract little security analysis attention. Linux
- will helpfully probe filesystem formats if not told to use an
- explicit format, allowing an attacker the ability to target
- specific weak filesystem drivers. Even commonly used and
- widely audited filesystems such as <code>ext4</code> have had
- <a href="https://lwn.net/Articles/538898/">bugs lurking in them</a>
- undetected for years at a time.
- </p>
-
- <p>
- <strong>Recommendation:</strong> if there is a need to access
- the content of a disk image, use a single-use throwaway virtual
- machine to access the data. Never mount disk images on the host
- OS. Ideally make use of the <a href="http://libguestfs.org">libguestfs</a>
- tools and APIs for accessing disks
- </p>
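-
- <p>
- For example, the <code>guestfish</code> tool runs the
- filesystem drivers inside a small throwaway appliance VM rather
- than in the host kernel, and can inspect an image read-only
- (the file path is illustrative):
- </p>
-
- <pre>
-$ guestfish --ro -a /path/to/untrusted.img -i ll /
- </pre>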
-
- <h2><a id="migration">Guest migration network</a></h2>
-
- <p>
- Most hypervisors with support for guest migration between hosts
- make use of one (or more) network connections. Typically the source
- host will connect to some port on the target host to initiate the
- migration. There may be separate connections for co-ordinating the
- migration, transferring memory state and transferring storage.
- If the network over which migration takes place is accessible to the
- guest, or to client applications, there is potential for data leakage
- via packet snooping/capture. It is also possible for a malicious
- guest or client to make attempts to connect to the target host
- to trigger bogus migration operations, or at least inflict a denial
- of service attack.
- </p>
-
- <p>
- <strong>Recommendations:</strong> there are several things to consider
- when performing migration; an illustrative command is shown after the list below.
- </p>
-
- <ul>
- <li>Use a specific address for establishing the migration
- connection which is accessible only to the virtualization
- hosts themselves, not libvirt clients or virtual guests.
- Most hypervisors allow the management application to provide
- the IP address of the target host as a way to
- determine which network the migration takes place on. This is
- effectively the connect() socket address for the source host.</li>
- <li>Use a specific address for listening for incoming migration
- connections which is accessible only to the virtualization
- hosts themselves, not libvirt clients or virtual guests.
- Most hypervisors allow the management application to configure
- the IP address on which the target host listens. This is
- the bind() socket address for the target host.</li>
- <li>Use an encrypted migration protocol. Some hypervisors
- have support for encrypting the migration memory/storage
- data. In other cases it can be tunnelled over the libvirtd
- RPC protocol connections.</li>
- </ul>
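-
- <p>
- For example, with the QEMU driver the migration data stream can
- be pinned to an address on a dedicated migration network (the
- names and addresses below are purely illustrative):
- </p>
-
- <pre>
-$ virsh migrate --live \
-    --migrateuri tcp://192.168.100.2/ \
-    myguest qemu+ssh://target.example.com/system
- </pre>
-
- <p>
- Alternatively the <code>--p2p --tunnelled</code> flags request
- that the migration data be tunnelled over the libvirtd RPC
- connection, which can itself be encrypted.
- </p>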
-
- <h2><a id="storage">Storage encryption</a></h2>
-
- <p>
- Virtual disk images will typically contain confidential data
- belonging to the owner of the virtual machine. It is desirable
- to protect this against data center administrators as much as
- possible. For example, a rogue storage administrator may attempt
- to access disk contents directly from a storage host, or a network
- administrator/attacker may attempt to snoop on data packets relating
- to storage access. Use of disk encryption on the virtualization
- host can ensure that only the virtualization host administrator
- can see the plain text contents of disk images.
- </p>
-
- <p>
- <strong>Recommendation:</strong> make use of storage encryption
- to protect non-local storage from attack by rogue network /
- storage administrators or external attackers. This is particularly
- important if the storage protocol itself does not offer any kind
- of encryption capabilities.
- </p>
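-
- <p>
- With the QEMU driver, one way to do this is LUKS encryption of
- the disk image, configured in the domain XML with the
- passphrase held in a libvirt secret (the secret UUID below is
- purely illustrative):
- </p>
-
- <pre>
-<disk type='file' device='disk'>
-  <driver name='qemu' type='qcow2'/>
-  <source file='/var/lib/libvirt/images/encrypted.qcow2'>
-    <encryption format='luks'>
-      <secret type='passphrase' uuid='55806c7d-8e93-456f-829b-607d8c198367'/>
-    </encryption>
-  </source>
-  <target dev='vda' bus='virtio'/>
-</disk>
- </pre>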
-
- </body>
-</html>