From: Juergen Gross
Date: Fri, 28 Aug 2020 15:07:35 +0000 (+0200)
Subject: tools/libxc: move libxenguest to tools/libs/guest
X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=e3dd624e487c1aca1835138d3990f78af9906238;p=people%2Fpauldu%2Fxen.git

tools/libxc: move libxenguest to tools/libs/guest

tools/libxc now contains libxenguest only. Move it to tools/libs/guest.

When generating the pkg-config file for libxenguest, a filter is now
required to replace "xenctrl" with "xencontrol" in the
"Requires.private:" entry. Add this filter to tools/libs/libs.mk.

Signed-off-by: Juergen Gross
Reviewed-by: Samuel Thibault (stubdom parts)
---

diff --git a/.gitignore b/.gitignore
index d22b031ed2..eb637a98e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,7 +71,6 @@ stubdom/include
 stubdom/ioemu
 stubdom/ioemu/
 stubdom/libs-*
-stubdom/libxc-*
 stubdom/libxencall-*
 stubdom/libxenevtchn-*
 stubdom/libxenforeignmemory-*
@@ -121,6 +120,14 @@ tools/libs/foreignmemory/headers.chk
 tools/libs/foreignmemory/xenforeignmemory.pc
 tools/libs/devicemodel/headers.chk
 tools/libs/devicemodel/xendevicemodel.pc
+tools/libs/guest/_*.[ch]
+tools/libs/guest/libxenguest.map
+tools/libs/guest/xenguest.pc
+tools/libs/guest/xc_bitops.h
+tools/libs/guest/xc_core.h
+tools/libs/guest/xc_core_arm.h
+tools/libs/guest/xc_core_x86.h
+tools/libs/guest/xc_private.h
 tools/console/xenconsole
 tools/console/xenconsoled
 tools/console/client/_paths.h
@@ -197,12 +204,6 @@ tools/include/xen-xsm/*
 tools/include/xen-foreign/*.(c|h|size)
 tools/include/xen-foreign/checker
 tools/libvchan/xenvchan.pc
-tools/libxc/*.pc
-tools/libxc/xc_bitops.h
-tools/libxc/xc_core.h
-tools/libxc/xc_core_arm.h
-tools/libxc/xc_core_x86.h
-tools/libxc/xc_private.h
 tools/libxl/_libxl.api-for-check
 tools/libxl/*.api-ok
 tools/libxl/*.pc
@@ -370,7 +371,6 @@ tools/include/xen-foreign/arm64.h
 tools/misc/xen-hptool
 tools/misc/xen-mfndump
 tools/libs/toolcore/include/_*.h
-tools/libxc/_*.[ch]
 tools/libxl/_*.[ch]
 tools/libxl/testidl
 tools/libxl/testidl.c
diff --git a/stubdom/Makefile b/stubdom/Makefile
index 6c481285ec..fb9617fa14 100644
--- a/stubdom/Makefile
+++ b/stubdom/Makefile
@@ -331,7 +331,9 @@ endif
 # libraries under tools/libs
 #######
 
-STUB_LIBS := toolcore toollog evtchn gnttab call foreignmemory devicemodel ctrl
+STUB_LIBS := toolcore toollog evtchn gnttab call foreignmemory devicemodel ctrl guest
+
+LIBDEP_guest := cross-zlib
 
 #######
 # common handling
@@ -362,13 +364,10 @@ endef
 
 $(foreach lib,$(STUB_LIBS),$(eval $(call BUILD_lib,$(lib))))
 
-libxc-$(XEN_TARGET_ARCH)/stamp: $(XEN_ROOT)/tools/libxc/Makefile
-        $(do_links)
-
 xenstore/stamp: $(XEN_ROOT)/tools/xenstore/Makefile
         $(do_links)
 
-LINK_DIRS := libxc-$(XEN_TARGET_ARCH) xenstore $(foreach dir,$(STUB_LIBS),libs-$(XEN_TARGET_ARCH)/$(dir))
+LINK_DIRS := xenstore $(foreach dir,$(STUB_LIBS),libs-$(XEN_TARGET_ARCH)/$(dir))
 LINK_STAMPS := $(foreach dir,$(LINK_DIRS),$(dir)/stamp)
 
 mk-headers-$(XEN_TARGET_ARCH): $(IOEMU_LINKFARM_TARGET) $(LINK_STAMPS)
@@ -391,16 +390,6 @@ $(TARGETS_MINIOS): mini-os-%:
           mkdir -p $@/$$i ; \
         done
 
-#######
-# libxc
-#######
-
-.PHONY: libxc
-libxc: libxc-$(XEN_TARGET_ARCH)/libxenguest.a
-libxc-$(XEN_TARGET_ARCH)/libxenguest.a: libxenevtchn libxenctrl cross-zlib
-libxc-$(XEN_TARGET_ARCH)/libxenguest.a: mk-headers-$(XEN_TARGET_ARCH) $(NEWLIB_STAMPFILE)
-        CPPFLAGS="$(TARGET_CPPFLAGS)" CFLAGS="$(TARGET_CFLAGS)" $(MAKE) DESTDIR= CONFIG_LIBXC_MINIOS=y -C libxc-$(XEN_TARGET_ARCH)
-
 #######
 # ioemu
 #######
 
@@ -409,7 +398,7 @@ ioemu-minios-config.mk: $(CURDIR)/ioemu-minios.cfg
         MINIOS_CONFIG="$<"
CONFIG_FILE="$(CURDIR)/$@" $(MAKE) DESTDIR= -C $(MINI_OS) config .PHONY: ioemu -ioemu: cross-zlib cross-libpci libxc ioemu-minios-config.mk +ioemu: cross-zlib cross-libpci libxenguest ioemu-minios-config.mk [ -f ioemu/config-host.mak ] || \ ( $(buildmakevars2shellvars); \ cd ioemu ; \ @@ -503,15 +492,15 @@ xenstore: $(CROSS_ROOT) xenstore-minios-config.mk .PHONY: ioemu-stubdom ioemu-stubdom: APP_OBJS=$(CURDIR)/ioemu/i386-stubdom/qemu.a $(CURDIR)/ioemu/i386-stubdom/libqemu.a $(CURDIR)/ioemu/libqemu_common.a -ioemu-stubdom: mini-os-$(XEN_TARGET_ARCH)-ioemu lwip-$(XEN_TARGET_ARCH) libxc ioemu +ioemu-stubdom: mini-os-$(XEN_TARGET_ARCH)-ioemu lwip-$(XEN_TARGET_ARCH) libxenguest ioemu DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/ioemu-minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS="$(APP_OBJS)" .PHONY: caml-stubdom -caml-stubdom: mini-os-$(XEN_TARGET_ARCH)-caml lwip-$(XEN_TARGET_ARCH) libxc cross-ocaml caml +caml-stubdom: mini-os-$(XEN_TARGET_ARCH)-caml lwip-$(XEN_TARGET_ARCH) libxenguest cross-ocaml caml DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/caml/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS="$(CURDIR)/caml/main-caml.o $(CURDIR)/caml/caml.o $(CAMLLIB)/libasmrun.a" .PHONY: c-stubdom -c-stubdom: mini-os-$(XEN_TARGET_ARCH)-c lwip-$(XEN_TARGET_ARCH) libxc c +c-stubdom: mini-os-$(XEN_TARGET_ARCH)-c lwip-$(XEN_TARGET_ARCH) libxenguest c DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/c/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS=$(CURDIR)/c/main.a .PHONY: vtpm-stubdom @@ -523,11 +512,11 @@ vtpmmgr-stubdom: mini-os-$(XEN_TARGET_ARCH)-vtpmmgr vtpmmgr DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/vtpmmgr/minios.cfg" $(MAKE) -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS="$(CURDIR)/vtpmmgr/vtpmmgr.a" APP_LDLIBS="-lm -lpolarssl" .PHONY: pv-grub -pv-grub: mini-os-$(XEN_TARGET_ARCH)-grub libxc grub +pv-grub: mini-os-$(XEN_TARGET_ARCH)-grub libxenguest grub DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/grub/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS=$(CURDIR)/grub-$(XEN_TARGET_ARCH)/main.a .PHONY: xenstore-stubdom -xenstore-stubdom: mini-os-$(XEN_TARGET_ARCH)-xenstore libxc xenstore +xenstore-stubdom: mini-os-$(XEN_TARGET_ARCH)-xenstore libxenguest xenstore DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/xenstore-minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS=$(CURDIR)/xenstore/xenstored.a ######### @@ -621,7 +610,6 @@ clean: rm -f $(STUBDOMPATH) rm -f *-minios-config.mk rm -fr pkg-config - [ ! -e libxc-$(XEN_TARGET_ARCH)/Makefile ] || $(MAKE) DESTDIR= -C libxc-$(XEN_TARGET_ARCH) clean -[ ! -d ioemu ] || $(MAKE) DESTDIR= -C ioemu clean -[ ! 
-d xenstore ] || $(MAKE) DESTDIR= -C xenstore clean @@ -632,7 +620,7 @@ crossclean: clean rm -fr newlib-$(XEN_TARGET_ARCH) rm -fr zlib-$(XEN_TARGET_ARCH) pciutils-$(XEN_TARGET_ARCH) rm -fr libs-$(XEN_TARGET_ARCH) - rm -fr libxc-$(XEN_TARGET_ARCH) ioemu xenstore + rm -fr ioemu xenstore rm -fr gmp-$(XEN_TARGET_ARCH) rm -fr polarssl-$(XEN_TARGET_ARCH) rm -fr openssl-$(XEN_TARGET_ARCH) diff --git a/stubdom/grub/Makefile b/stubdom/grub/Makefile index d33fa2f71e..7397661c9b 100644 --- a/stubdom/grub/Makefile +++ b/stubdom/grub/Makefile @@ -7,7 +7,7 @@ BOOT=$(OBJ_DIR)/boot-$(XEN_TARGET_ARCH).o DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/toollog/include DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/ctrl/include -DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc/include +DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/guest/include DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/include -I. DEF_CPPFLAGS += -I../grub-upstream/stage1 DEF_CPPFLAGS += -I../grub-upstream/stage2 diff --git a/stubdom/mini-os.mk b/stubdom/mini-os.mk index b1387df3f8..e1640a7cbc 100644 --- a/stubdom/mini-os.mk +++ b/stubdom/mini-os.mk @@ -14,4 +14,4 @@ CALL_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/call FOREIGNMEMORY_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/foreignmemory DEVICEMODEL_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/devicemodel CTRL_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/ctrl -GUEST_PATH = $(XEN_ROOT)/stubdom/libxc-$(MINIOS_TARGET_ARCH) +GUEST_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/guest diff --git a/tools/Makefile b/tools/Makefile index 7c9f9fc900..f9b4012290 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -6,7 +6,6 @@ include $(XEN_ROOT)/tools/Rules.mk SUBDIRS-y := SUBDIRS-y += libs -SUBDIRS-y += libxc SUBDIRS-y += flask SUBDIRS-y += fuzz SUBDIRS-y += xenstore @@ -44,7 +43,7 @@ SUBDIRS-y += pygrub SUBDIRS-$(OCAML_TOOLS) += ocaml ifeq ($(CONFIG_RUMP),y) -SUBDIRS-y := libs libxc xenstore +SUBDIRS-y := libs xenstore endif # For the sake of linking, set the sys-root diff --git a/tools/Rules.mk b/tools/Rules.mk index 35940cb338..e17ac3ecc6 100644 --- a/tools/Rules.mk +++ b/tools/Rules.mk @@ -15,7 +15,6 @@ XEN_INCLUDE = $(XEN_ROOT)/tools/include include $(XEN_ROOT)/tools/libs/uselibs.mk -XEN_libxenguest = $(XEN_ROOT)/tools/libxc XEN_libxenlight = $(XEN_ROOT)/tools/libxl # Currently libxlutil lives in the same directory as libxenlight XEN_libxlutil = $(XEN_libxenlight) @@ -105,11 +104,7 @@ $(foreach lib,$(LIBS_LIBS),$(eval $(call LIB_defs,$(lib)))) # code which compiles against libxenctrl get __XEN_TOOLS__ and # therefore sees the unstable hypercall interfaces. 
CFLAGS_libxenctrl += $(CFLAGS_libxentoollog) $(CFLAGS_libxenforeignmemory) $(CFLAGS_libxendevicemodel) -D__XEN_TOOLS__ - -CFLAGS_libxenguest = -I$(XEN_libxenguest)/include $(CFLAGS_libxenevtchn) $(CFLAGS_libxenforeignmemory) $(CFLAGS_xeninclude) -SHDEPS_libxenguest = $(SHLIB_libxenevtchn) $(SHLIB_libxenctrl) -LDLIBS_libxenguest = $(SHDEPS_libxenguest) $(XEN_libxenguest)/libxenguest$(libextension) -SHLIB_libxenguest = $(SHDEPS_libxenguest) -Wl,-rpath-link=$(XEN_libxenguest) +CFLAGS_libxenguest += $(CFLAGS_libxenevtchn) $(CFLAGS_libxenforeignmemory) CFLAGS_libxenstore = -I$(XEN_libxenstore)/include $(CFLAGS_xeninclude) SHDEPS_libxenstore = $(SHLIB_libxentoolcore) $(SHLIB_libxenctrl) diff --git a/tools/libs/Makefile b/tools/libs/Makefile index 7648ea0e4c..f15c1688f7 100644 --- a/tools/libs/Makefile +++ b/tools/libs/Makefile @@ -10,6 +10,7 @@ SUBDIRS-y += call SUBDIRS-y += foreignmemory SUBDIRS-y += devicemodel SUBDIRS-y += ctrl +SUBDIRS-y += guest SUBDIRS-y += hypfs ifeq ($(CONFIG_RUMP),y) diff --git a/tools/libs/guest/COPYING b/tools/libs/guest/COPYING new file mode 100644 index 0000000000..7ca8702509 --- /dev/null +++ b/tools/libs/guest/COPYING @@ -0,0 +1,467 @@ +Note that the only valid version of the LGPL as far as the files in +this directory (and its subdirectories) are concerned is _this_ +particular version of the license (i.e., *only* v2.1, not v2.2 or v3.x +or whatever), unless explicitly otherwise stated. + +Where clause 3 is invoked in order to relicense under the GPL then +this shall be considered to be GPL v2 only for files which have +specified LGPL v2.1 only. + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. 
+ + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. 
+ + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. 
+ + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. 
+ + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) 
+ + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. 
If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS diff --git a/tools/libs/guest/Makefile b/tools/libs/guest/Makefile new file mode 100644 index 0000000000..e53aeabd3e --- /dev/null +++ b/tools/libs/guest/Makefile @@ -0,0 +1,121 @@ +XEN_ROOT = $(CURDIR)/../../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +ifeq ($(CONFIG_LIBXC_MINIOS),y) +# Save/restore of a domain is currently incompatible with a stubdom environment +override CONFIG_MIGRATE := n +endif + +LINK_FILES := xc_private.h xc_core.h xc_core_x86.h xc_core_arm.h xc_bitops.h + +$(LINK_FILES): + ln -sf $(XEN_ROOT)/tools/libs/ctrl/$(notdir $@) $@ + +SRCS-y += xg_private.c +SRCS-y += xg_domain.c +SRCS-y += xg_suspend.c +ifeq ($(CONFIG_MIGRATE),y) +SRCS-y += xg_sr_common.c +SRCS-$(CONFIG_X86) += xg_sr_common_x86.c +SRCS-$(CONFIG_X86) += xg_sr_common_x86_pv.c +SRCS-$(CONFIG_X86) += xg_sr_restore_x86_pv.c +SRCS-$(CONFIG_X86) += xg_sr_restore_x86_hvm.c +SRCS-$(CONFIG_X86) += xg_sr_save_x86_pv.c +SRCS-$(CONFIG_X86) += xg_sr_save_x86_hvm.c +SRCS-y += xg_sr_restore.c +SRCS-y += xg_sr_save.c +SRCS-y += xg_offline_page.c +else +SRCS-y += xg_nomigrate.c +endif + +vpath %.c ../../../xen/common/libelf +CFLAGS += -I../../../xen/common/libelf + +ELF_SRCS-y += libelf-tools.c libelf-loader.c +ELF_SRCS-y += libelf-dominfo.c + +SRCS-y += $(ELF_SRCS-y) + +$(patsubst %.c,%.o,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign +$(patsubst %.c,%.opic,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign + +ifeq ($(CONFIG_X86),y) # Add libx86 to the build +vpath %.c ../../../xen/lib/x86 + +SRCS-y += cpuid.c msr.c +endif + +# new domain builder +SRCS-y += xg_dom_core.c +SRCS-y += xg_dom_boot.c +SRCS-y += xg_dom_elfloader.c +SRCS-$(CONFIG_X86) += xg_dom_bzimageloader.c +SRCS-$(CONFIG_X86) += xg_dom_decompress_lz4.c +SRCS-$(CONFIG_X86) += xg_dom_hvmloader.c +SRCS-$(CONFIG_ARM) += xg_dom_armzimageloader.c +SRCS-y += xg_dom_binloader.c +SRCS-y += xg_dom_compat_linux.c + +SRCS-$(CONFIG_X86) += xg_dom_x86.c +SRCS-$(CONFIG_X86) += xg_cpuid_x86.c +SRCS-$(CONFIG_ARM) += xg_dom_arm.c + +ifeq ($(CONFIG_LIBXC_MINIOS),y) +SRCS-y += xg_dom_decompress_unsafe.c +SRCS-y += xg_dom_decompress_unsafe_bzip2.c +SRCS-y += xg_dom_decompress_unsafe_lzma.c +SRCS-y += xg_dom_decompress_unsafe_lzo1x.c +SRCS-y += xg_dom_decompress_unsafe_xz.c +endif + +-include $(XEN_TARGET_ARCH)/Makefile + +CFLAGS += -Werror -Wmissing-prototypes +CFLAGS += -I. -I./include $(CFLAGS_xeninclude) +CFLAGS += -D__XEN_TOOLS__ +CFLAGS += -include $(XEN_ROOT)/tools/config.h + +# Needed for posix_fadvise64() in xc_linux.c +CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE + +CFLAGS += $(PTHREAD_CFLAGS) +CFLAGS += $(CFLAGS_libxentoollog) +CFLAGS += $(CFLAGS_libxenevtchn) +CFLAGS += $(CFLAGS_libxendevicemodel) + +# libxenguest includes xc_private.h, so needs this despite not using +# this functionality directly. 
+CFLAGS += $(CFLAGS_libxencall) $(CFLAGS_libxenforeignmemory)
+
+ifeq ($(CONFIG_MiniOS),y)
+zlib-options =
+else
+zlib-options = $(ZLIB)
+endif
+
+xg_dom_bzimageloader.o: CFLAGS += $(filter -D%,$(zlib-options))
+xg_dom_bzimageloader.opic: CFLAGS += $(filter -D%,$(zlib-options))
+
+LIBHEADER := xenguest.h
+
+NO_HEADERS_CHK := y
+
+include $(XEN_ROOT)/tools/libs/libs.mk
+
+libxenguest.so.$(MAJOR).$(MINOR): COMPRESSION_LIBS = $(filter -l%,$(zlib-options))
+libxenguest.so.$(MAJOR).$(MINOR): APPEND_LDFLAGS += $(COMPRESSION_LIBS) -lz
+
+genpath-target = $(call buildmakevars2header,_paths.h)
+$(eval $(genpath-target))
+
+xc_private.h: _paths.h
+
+$(LIB_OBJS) $(PIC_OBJS): $(LINK_FILES)
+
+$(PKG_CONFIG_LOCAL): PKG_CONFIG_INCDIR = $(XEN_libxenctrl)/include
+$(PKG_CONFIG_LOCAL): PKG_CONFIG_CFLAGS_LOCAL = $(CFLAGS_xeninclude)
+
+.PHONY: cleanlocal
+cleanlocal:
+        rm -f libxenguest.map
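The tools/libs/libs.mk hunk that adds the pkg-config filter described in the commit message is not part of this excerpt. As a rough sketch only (the rule and the xenguest.pc.in input name below are illustrative assumptions, not the actual libs.mk change), the substitution could be applied while the .pc file is generated:

    # Sketch only: libxenguest's "Requires.private:" names its libxenctrl
    # dependency, whose pkg-config package is called "xencontrol", so the
    # name is rewritten when the pkg-config file is produced.
    xenguest.pc: xenguest.pc.in
            sed -e 's/xenctrl/xencontrol/g' $< > $@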
diff --git a/tools/libs/guest/include/xenguest.h b/tools/libs/guest/include/xenguest.h
new file mode 100644
index 0000000000..4643384790
--- /dev/null
+++ b/tools/libs/guest/include/xenguest.h
@@ -0,0 +1,327 @@
+/******************************************************************************
+ * xenguest.h
+ *
+ * A library for guest domain management in Xen.
+ *
+ * Copyright (c) 2003-2004, K A Fraser.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XENGUEST_H
+#define XENGUEST_H
+
+#include <xenctrl.h>
+
+#define XC_NUMA_NO_NODE   (~0U)
+
+#define XCFLAGS_LIVE      (1 << 0)
+#define XCFLAGS_DEBUG     (1 << 1)
+
+#define X86_64_B_SIZE   64
+#define X86_32_B_SIZE   32
+
+/*
+ * User not using xc_suspend_* / xc_await_suspend may not want to
+ * include the full libxenevtchn API here.
+ */
+struct xenevtchn_handle;
+
+/* For save's precopy_policy(). */
+struct precopy_stats
+{
+    unsigned int iteration;
+    unsigned int total_written;
+    long dirty_count; /* -1 if unknown */
+};
+
+/*
+ * A precopy_policy callback may not be running in the same address
+ * space as libxc and so precopy_stats is passed by value.
+ */
+typedef int (*precopy_policy_t)(struct precopy_stats, void *);
+
+/* callbacks provided by xc_domain_save */
+struct save_callbacks {
+    /*
+     * Called after expiration of checkpoint interval,
+     * to suspend the guest.
+     */
+    int (*suspend)(void *data);
+
+    /*
+     * Called before and after every batch of page data sent during
+     * the precopy phase of a live migration to ask the caller what
+     * to do next based on the current state of the precopy migration.
+     *
+     * Should return one of the values listed below:
+     */
+#define XGS_POLICY_ABORT          (-1) /* Abandon the migration entirely
+                                        * and tidy up. */
+#define XGS_POLICY_CONTINUE_PRECOPY 0  /* Remain in the precopy phase. */
+#define XGS_POLICY_STOP_AND_COPY    1  /* Immediately suspend and transmit the
+                                        * remaining dirty pages. */
+    precopy_policy_t precopy_policy;
+
+    /*
+     * Called after the guest's dirty pages have been
+     * copied into an output buffer.
+     * Callback function resumes the guest & the device model,
+     * returns to xc_domain_save.
+     * xc_domain_save then flushes the output buffer, while the
+     * guest continues to run.
+     */
+    int (*postcopy)(void *data);
+
+    /*
+     * Called after the memory checkpoint has been flushed
+     * out into the network. Typical actions performed in this
+     * callback include:
+     *   (a) send the saved device model state (for HVM guests),
+     *   (b) wait for checkpoint ack,
+     *   (c) release the network output buffer pertaining to the acked
+     *       checkpoint,
+     *   (d) sleep for the checkpoint interval.
+     *
+     * returns:
+     * 0: terminate checkpointing gracefully
+     * 1: take another checkpoint
+     */
+    int (*checkpoint)(void *data);
+
+    /*
+     * Called after the checkpoint callback.
+     *
+     * returns:
+     * 0: terminate checkpointing gracefully
+     * 1: take another checkpoint
+     */
+    int (*wait_checkpoint)(void *data);
+
+    /* Enable qemu-dm logging dirty pages to xen */
+    int (*switch_qemu_logdirty)(uint32_t domid, unsigned enable, void *data); /* HVM only */
+
+    /* to be provided as the last argument to each callback function */
+    void *data;
+};
+
+/* Type of stream.  Plain, or using a continuous replication protocol? */
+typedef enum {
+    XC_STREAM_PLAIN,
+    XC_STREAM_REMUS,
+    XC_STREAM_COLO,
+} xc_stream_type_t;
+
+/**
+ * This function will save a running domain.
+ *
+ * @param xch a handle to an open hypervisor interface
+ * @param io_fd the file descriptor to save a domain to
+ * @param dom the id of the domain
+ * @param flags XCFLAGS_xxx
+ * @param stream_type XC_STREAM_PLAIN if the far end of the stream
+ *        doesn't use checkpointing
+ * @param recv_fd Only used for XC_STREAM_COLO.  Contains backchannel from
+ *        the destination side.
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+                   uint32_t flags, struct save_callbacks *callbacks,
+                   xc_stream_type_t stream_type, int recv_fd);
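For orientation, here is a minimal sketch of how a toolstack might drive xc_domain_save() with the callbacks defined above. It is not part of the patch; the helper names, iteration limit and dirty-page threshold are invented for the example:

    /*
     * Sketch only -- not part of this patch.  One way a toolstack might
     * drive xc_domain_save() with the callbacks defined above.
     */
    #include <xenctrl.h>
    #include <xenguest.h>

    static int example_suspend(void *data)
    {
        /* Ask the guest to suspend here; return 1 on success, 0 on failure. */
        return 1;
    }

    static int example_precopy_policy(struct precopy_stats stats, void *data)
    {
        /* Stop copying once the dirty set is small, or after five passes
         * (both thresholds are arbitrary for this example). */
        if ( stats.iteration >= 5 ||
             (stats.dirty_count >= 0 && stats.dirty_count < 50) )
            return XGS_POLICY_STOP_AND_COPY;

        return XGS_POLICY_CONTINUE_PRECOPY;
    }

    static int example_save(xc_interface *xch, int io_fd, uint32_t domid)
    {
        struct save_callbacks cbs = {
            .suspend = example_suspend,
            .precopy_policy = example_precopy_policy,
            .data = NULL,
        };

        /* Plain (non-checkpointed) stream, so no COLO backchannel (-1). */
        return xc_domain_save(xch, io_fd, domid, XCFLAGS_LIVE, &cbs,
                              XC_STREAM_PLAIN, -1);
    }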
+ */ + void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn, + void *data); + + /* to be provided as the last argument to each callback function */ + void *data; +}; + +/** + * This function will restore a saved domain. + * + * Domain is restored in a suspended state ready to be unpaused. + * + * @param xch a handle to an open hypervisor interface + * @param io_fd the file descriptor to restore a domain from + * @param dom the id of the domain + * @param store_evtchn the xenstore event channel for this domain to use + * @param store_mfn filled with the gfn of the store page + * @param store_domid the backend domain for xenstore + * @param console_evtchn the console event channel for this domain to use + * @param console_mfn filled with the gfn of the console page + * @param console_domid the backend domain for xenconsole + * @param stream_type XC_STREAM_PLAIN if the far end of the stream is using + * checkpointing + * @param callbacks non-NULL to receive a callback to restore toolstack + * specific data + * @param send_back_fd Only used for XC_STREAM_COLO. Contains backchannel to + * the source side. + * @return 0 on success, -1 on failure + */ +int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, + unsigned int store_evtchn, unsigned long *store_mfn, + uint32_t store_domid, unsigned int console_evtchn, + unsigned long *console_mfn, uint32_t console_domid, + xc_stream_type_t stream_type, + struct restore_callbacks *callbacks, int send_back_fd); + +/** + * This function will create a domain for a paravirtualized Linux + * using file names pointing to kernel and ramdisk + * + * @parm xch a handle to an open hypervisor interface + * @parm domid the id of the domain + * @parm mem_mb memory size in megabytes + * @parm image_name name of the kernel image file + * @parm ramdisk_name name of the ramdisk image file + * @parm cmdline command line string + * @parm flags domain creation flags + * @parm store_evtchn the store event channel for this domain to use + * @parm store_mfn returned with the mfn of the store page + * @parm console_evtchn the console event channel for this domain to use + * @parm conole_mfn returned with the mfn of the console page + * @return 0 on success, -1 on failure + */ +int xc_linux_build(xc_interface *xch, + uint32_t domid, + unsigned int mem_mb, + const char *image_name, + const char *ramdisk_name, + const char *cmdline, + const char *features, + unsigned long flags, + unsigned int store_evtchn, + unsigned long *store_mfn, + unsigned int console_evtchn, + unsigned long *console_mfn); + +/* + * Sets *lockfd to -1. + * Has deallocated everything even on error. + */ +int xc_suspend_evtchn_release(xc_interface *xch, + struct xenevtchn_handle *xce, + uint32_t domid, int suspend_evtchn, int *lockfd); + +/** + * This function eats the initial notification. + * xce must not be used for anything else + * See xc_suspend_evtchn_init_sane re lockfd. + */ +int xc_suspend_evtchn_init_exclusive(xc_interface *xch, + struct xenevtchn_handle *xce, + uint32_t domid, int port, int *lockfd); + +/* xce must not be used for anything else */ +int xc_await_suspend(xc_interface *xch, struct xenevtchn_handle *xce, + int suspend_evtchn); + +/** + * The port will be signaled immediately after this call + * The caller should check the domain status and look for the next event + * On success, *lockfd will be set to >=0 and *lockfd must be preserved + * and fed to xc_suspend_evtchn_release. (On error *lockfd is + * undefined and xc_suspend_evtchn_release is not allowed.) 
+ */ +int xc_suspend_evtchn_init_sane(xc_interface *xch, + struct xenevtchn_handle *xce, + uint32_t domid, int port, int *lockfd); + +int xc_mark_page_online(xc_interface *xch, unsigned long start, + unsigned long end, uint32_t *status); + +int xc_mark_page_offline(xc_interface *xch, unsigned long start, + unsigned long end, uint32_t *status); + +int xc_query_page_offline_status(xc_interface *xch, unsigned long start, + unsigned long end, uint32_t *status); + +int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn); + + +/** + * Memory related information, such as PFN types, the P2M table, + * the guest word width and the guest page table levels. + */ +struct xc_domain_meminfo { + unsigned int pt_levels; + unsigned int guest_width; + xen_pfn_t *pfn_type; + xen_pfn_t *p2m_table; + unsigned long p2m_size; +}; + +int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid, + struct xc_domain_meminfo *minfo); + +int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *mem); + +/** + * This function map m2p table + * @parm xch a handle to an open hypervisor interface + * @parm max_mfn the max pfn + * @parm prot the flags to map, such as read/write etc + * @parm mfn0 return the first mfn, can be NULL + * @return mapped m2p table on success, NULL on failure + */ +xen_pfn_t *xc_map_m2p(xc_interface *xch, + unsigned long max_mfn, + int prot, + unsigned long *mfn0); +#endif /* XENGUEST_H */ diff --git a/tools/libs/guest/xg_cpuid_x86.c b/tools/libs/guest/xg_cpuid_x86.c new file mode 100644 index 0000000000..0f24d6dd08 --- /dev/null +++ b/tools/libs/guest/xg_cpuid_x86.c @@ -0,0 +1,665 @@ +/****************************************************************************** + * xc_cpuid_x86.c + * + * Compute cpuid of a domain. + * + * Copyright (c) 2008, Citrix Systems, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . 
+ */ + +#include +#include +#include +#include "xc_private.h" +#include "xc_bitops.h" +#include +#include + +enum { +#define XEN_CPUFEATURE(name, value) X86_FEATURE_##name = value, +#include +}; + +#include + +#include + +#define bitmaskof(idx) (1u << ((idx) & 31)) +#define featureword_of(idx) ((idx) >> 5) + +int xc_get_cpu_levelling_caps(xc_interface *xch, uint32_t *caps) +{ + DECLARE_SYSCTL; + int ret; + + sysctl.cmd = XEN_SYSCTL_get_cpu_levelling_caps; + ret = do_sysctl(xch, &sysctl); + + if ( !ret ) + *caps = sysctl.u.cpu_levelling_caps.caps; + + return ret; +} + +int xc_get_cpu_featureset(xc_interface *xch, uint32_t index, + uint32_t *nr_features, uint32_t *featureset) +{ + DECLARE_SYSCTL; + DECLARE_HYPERCALL_BOUNCE(featureset, + *nr_features * sizeof(*featureset), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); + int ret; + + if ( xc_hypercall_bounce_pre(xch, featureset) ) + return -1; + + sysctl.cmd = XEN_SYSCTL_get_cpu_featureset; + sysctl.u.cpu_featureset.index = index; + sysctl.u.cpu_featureset.nr_features = *nr_features; + set_xen_guest_handle(sysctl.u.cpu_featureset.features, featureset); + + ret = do_sysctl(xch, &sysctl); + + xc_hypercall_bounce_post(xch, featureset); + + if ( !ret ) + *nr_features = sysctl.u.cpu_featureset.nr_features; + + return ret; +} + +uint32_t xc_get_cpu_featureset_size(void) +{ + return FEATURESET_NR_ENTRIES; +} + +const uint32_t *xc_get_static_cpu_featuremask( + enum xc_static_cpu_featuremask mask) +{ + static const uint32_t masks[][FEATURESET_NR_ENTRIES] = { +#define MASK(x) [XC_FEATUREMASK_ ## x] = INIT_ ## x ## _FEATURES + + MASK(KNOWN), + MASK(SPECIAL), + MASK(PV_MAX), + MASK(PV_DEF), + MASK(HVM_SHADOW_MAX), + MASK(HVM_SHADOW_DEF), + MASK(HVM_HAP_MAX), + MASK(HVM_HAP_DEF), + +#undef MASK + }; + + if ( (unsigned int)mask >= ARRAY_SIZE(masks) ) + return NULL; + + return masks[mask]; +} + +int xc_get_cpu_policy_size(xc_interface *xch, uint32_t *nr_leaves, + uint32_t *nr_msrs) +{ + struct xen_sysctl sysctl = {}; + int ret; + + sysctl.cmd = XEN_SYSCTL_get_cpu_policy; + + ret = do_sysctl(xch, &sysctl); + + if ( !ret ) + { + *nr_leaves = sysctl.u.cpu_policy.nr_leaves; + *nr_msrs = sysctl.u.cpu_policy.nr_msrs; + } + + return ret; +} + +int xc_get_system_cpu_policy(xc_interface *xch, uint32_t index, + uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves, + uint32_t *nr_msrs, xen_msr_entry_t *msrs) +{ + struct xen_sysctl sysctl = {}; + DECLARE_HYPERCALL_BOUNCE(leaves, + *nr_leaves * sizeof(*leaves), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); + DECLARE_HYPERCALL_BOUNCE(msrs, + *nr_msrs * sizeof(*msrs), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); + int ret; + + if ( xc_hypercall_bounce_pre(xch, leaves) || + xc_hypercall_bounce_pre(xch, msrs) ) + return -1; + + sysctl.cmd = XEN_SYSCTL_get_cpu_policy; + sysctl.u.cpu_policy.index = index; + sysctl.u.cpu_policy.nr_leaves = *nr_leaves; + set_xen_guest_handle(sysctl.u.cpu_policy.cpuid_policy, leaves); + sysctl.u.cpu_policy.nr_msrs = *nr_msrs; + set_xen_guest_handle(sysctl.u.cpu_policy.msr_policy, msrs); + + ret = do_sysctl(xch, &sysctl); + + xc_hypercall_bounce_post(xch, leaves); + xc_hypercall_bounce_post(xch, msrs); + + if ( !ret ) + { + *nr_leaves = sysctl.u.cpu_policy.nr_leaves; + *nr_msrs = sysctl.u.cpu_policy.nr_msrs; + } + + return ret; +} + +int xc_get_domain_cpu_policy(xc_interface *xch, uint32_t domid, + uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves, + uint32_t *nr_msrs, xen_msr_entry_t *msrs) +{ + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BOUNCE(leaves, + *nr_leaves * sizeof(*leaves), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); + 
+
+int xc_get_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+                             uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves,
+                             uint32_t *nr_msrs, xen_msr_entry_t *msrs)
+{
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BOUNCE(leaves,
+                             *nr_leaves * sizeof(*leaves),
+                             XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+    DECLARE_HYPERCALL_BOUNCE(msrs,
+                             *nr_msrs * sizeof(*msrs),
+                             XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+    int ret;
+
+    if ( xc_hypercall_bounce_pre(xch, leaves) ||
+         xc_hypercall_bounce_pre(xch, msrs) )
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_get_cpu_policy;
+    domctl.domain = domid;
+    domctl.u.cpu_policy.nr_leaves = *nr_leaves;
+    set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
+    domctl.u.cpu_policy.nr_msrs = *nr_msrs;
+    set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
+
+    ret = do_domctl(xch, &domctl);
+
+    xc_hypercall_bounce_post(xch, leaves);
+    xc_hypercall_bounce_post(xch, msrs);
+
+    if ( !ret )
+    {
+        *nr_leaves = domctl.u.cpu_policy.nr_leaves;
+        *nr_msrs = domctl.u.cpu_policy.nr_msrs;
+    }
+
+    return ret;
+}
+
+int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+                             uint32_t nr_leaves, xen_cpuid_leaf_t *leaves,
+                             uint32_t nr_msrs, xen_msr_entry_t *msrs,
+                             uint32_t *err_leaf_p, uint32_t *err_subleaf_p,
+                             uint32_t *err_msr_p)
+{
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BOUNCE(leaves,
+                             nr_leaves * sizeof(*leaves),
+                             XC_HYPERCALL_BUFFER_BOUNCE_IN);
+    DECLARE_HYPERCALL_BOUNCE(msrs,
+                             nr_msrs * sizeof(*msrs),
+                             XC_HYPERCALL_BUFFER_BOUNCE_IN);
+    int ret;
+
+    if ( err_leaf_p )
+        *err_leaf_p = -1;
+    if ( err_subleaf_p )
+        *err_subleaf_p = -1;
+    if ( err_msr_p )
+        *err_msr_p = -1;
+
+    if ( xc_hypercall_bounce_pre(xch, leaves) )
+        return -1;
+
+    if ( xc_hypercall_bounce_pre(xch, msrs) )
+        return -1;
+
+    domctl.cmd = XEN_DOMCTL_set_cpu_policy;
+    domctl.domain = domid;
+    domctl.u.cpu_policy.nr_leaves = nr_leaves;
+    set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
+    domctl.u.cpu_policy.nr_msrs = nr_msrs;
+    set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
+    domctl.u.cpu_policy.err_leaf = -1;
+    domctl.u.cpu_policy.err_subleaf = -1;
+    domctl.u.cpu_policy.err_msr = -1;
+
+    ret = do_domctl(xch, &domctl);
+
+    xc_hypercall_bounce_post(xch, leaves);
+    xc_hypercall_bounce_post(xch, msrs);
+
+    if ( err_leaf_p )
+        *err_leaf_p = domctl.u.cpu_policy.err_leaf;
+    if ( err_subleaf_p )
+        *err_subleaf_p = domctl.u.cpu_policy.err_subleaf;
+    if ( err_msr_p )
+        *err_msr_p = domctl.u.cpu_policy.err_msr;
+
+    return ret;
+}
+
+static int compare_leaves(const void *l, const void *r)
+{
+    const xen_cpuid_leaf_t *lhs = l;
+    const xen_cpuid_leaf_t *rhs = r;
+
+    if ( lhs->leaf != rhs->leaf )
+        return lhs->leaf < rhs->leaf ? -1 : 1;
+
+    if ( lhs->subleaf != rhs->subleaf )
+        return lhs->subleaf < rhs->subleaf ? -1 : 1;
+
+    return 0;
+}
+
+static xen_cpuid_leaf_t *find_leaf(
+    xen_cpuid_leaf_t *leaves, unsigned int nr_leaves,
+    const struct xc_xend_cpuid *xend)
+{
+    const xen_cpuid_leaf_t key = { xend->leaf, xend->subleaf };
+
+    return bsearch(&key, leaves, nr_leaves, sizeof(*leaves), compare_leaves);
+}
+
+static int xc_cpuid_xend_policy(
+    xc_interface *xch, uint32_t domid, const struct xc_xend_cpuid *xend)
+{
+    int rc;
+    xc_dominfo_t di;
+    unsigned int nr_leaves, nr_msrs;
+    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
+    /*
+     * Three full policies.  The host, domain max, and domain current for the
+     * domain type.
+     */
+    xen_cpuid_leaf_t *host = NULL, *max = NULL, *cur = NULL;
+    unsigned int nr_host, nr_max, nr_cur;
+
+    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
+         di.domid != domid )
+    {
+        ERROR("Failed to obtain d%d info", domid);
+        rc = -ESRCH;
+        goto fail;
+    }
+
+    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
+    if ( rc )
+    {
+        PERROR("Failed to obtain policy info size");
+        rc = -errno;
+        goto fail;
+    }
+
+    rc = -ENOMEM;
+    if ( (host = calloc(nr_leaves, sizeof(*host))) == NULL ||
+         (max = calloc(nr_leaves, sizeof(*max))) == NULL ||
+         (cur = calloc(nr_leaves, sizeof(*cur))) == NULL )
+    {
+        ERROR("Unable to allocate memory for %u CPUID leaves", nr_leaves);
+        goto fail;
+    }
+
+    /* Get the domain's current policy. */
+    nr_msrs = 0;
+    nr_cur = nr_leaves;
+    rc = xc_get_domain_cpu_policy(xch, domid, &nr_cur, cur, &nr_msrs, NULL);
+    if ( rc )
+    {
+        PERROR("Failed to obtain d%d current policy", domid);
+        rc = -errno;
+        goto fail;
+    }
+
+    /* Get the domain's max policy. */
+    nr_msrs = 0;
+    nr_max = nr_leaves;
+    rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_max
+                                              : XEN_SYSCTL_cpu_policy_pv_max,
+                                  &nr_max, max, &nr_msrs, NULL);
+    if ( rc )
+    {
+        PERROR("Failed to obtain %s max policy", di.hvm ? "hvm" : "pv");
+        rc = -errno;
+        goto fail;
+    }
+
+    /* Get the host policy. */
+    nr_msrs = 0;
+    nr_host = nr_leaves;
+    rc = xc_get_system_cpu_policy(xch, XEN_SYSCTL_cpu_policy_host,
+                                  &nr_host, host, &nr_msrs, NULL);
+    if ( rc )
+    {
+        PERROR("Failed to obtain host policy");
+        rc = -errno;
+        goto fail;
+    }
+
+    rc = -EINVAL;
+    for ( ; xend->leaf != XEN_CPUID_INPUT_UNUSED; ++xend )
+    {
+        xen_cpuid_leaf_t *cur_leaf = find_leaf(cur, nr_cur, xend);
+        const xen_cpuid_leaf_t *max_leaf = find_leaf(max, nr_max, xend);
+        const xen_cpuid_leaf_t *host_leaf = find_leaf(host, nr_host, xend);
+
+        if ( cur_leaf == NULL || max_leaf == NULL || host_leaf == NULL )
+        {
+            ERROR("Missing leaf %#x, subleaf %#x", xend->leaf, xend->subleaf);
+            goto fail;
+        }
+
+        for ( unsigned int i = 0; i < ARRAY_SIZE(xend->policy); i++ )
+        {
+            uint32_t *cur_reg = &cur_leaf->a + i;
+            const uint32_t *max_reg = &max_leaf->a + i;
+            const uint32_t *host_reg = &host_leaf->a + i;
+
+            if ( xend->policy[i] == NULL )
+                continue;
+
+            for ( unsigned int j = 0; j < 32; j++ )
+            {
+                bool val;
+
+                if ( xend->policy[i][j] == '1' )
+                    val = true;
+                else if ( xend->policy[i][j] == '0' )
+                    val = false;
+                else if ( xend->policy[i][j] == 'x' )
+                    val = test_bit(31 - j, max_reg);
+                else if ( xend->policy[i][j] == 'k' ||
+                          xend->policy[i][j] == 's' )
+                    val = test_bit(31 - j, host_reg);
+                else
+                {
+                    ERROR("Bad character '%c' in policy[%d] string '%s'",
+                          xend->policy[i][j], i, xend->policy[i]);
+                    goto fail;
+                }
+
+                clear_bit(31 - j, cur_reg);
+                if ( val )
+                    set_bit(31 - j, cur_reg);
+            }
+        }
+    }
+
+    /* Feed the transformed current policy back up to Xen. */
+    rc = xc_set_domain_cpu_policy(xch, domid, nr_cur, cur, 0, NULL,
+                                  &err_leaf, &err_subleaf, &err_msr);
+    if ( rc )
+    {
+        PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
+               domid, err_leaf, err_subleaf, err_msr);
+        rc = -errno;
+        goto fail;
+    }
+
+    /* Success! */
+
+ fail:
+    free(cur);
+    free(max);
+    free(host);
+
+    return rc;
+}
+
+int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid, bool restore,
+                          const uint32_t *featureset, unsigned int nr_features,
+                          bool pae,
+                          const struct xc_xend_cpuid *xend)
+{
+    int rc;
+    xc_dominfo_t di;
+    unsigned int i, nr_leaves, nr_msrs;
+    xen_cpuid_leaf_t *leaves = NULL;
+    struct cpuid_policy *p = NULL;
+    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
+    uint32_t host_featureset[FEATURESET_NR_ENTRIES] = {};
+    uint32_t len = ARRAY_SIZE(host_featureset);
+
+    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
+         di.domid != domid )
+    {
+        ERROR("Failed to obtain d%d info", domid);
+        rc = -ESRCH;
+        goto out;
+    }
+
+    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
+    if ( rc )
+    {
+        PERROR("Failed to obtain policy info size");
+        rc = -errno;
+        goto out;
+    }
+
+    rc = -ENOMEM;
+    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
+         (p = calloc(1, sizeof(*p))) == NULL )
+        goto out;
+
+    /* Get the host policy. */
+    rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
+                               &len, host_featureset);
+    if ( rc )
+    {
+        /* Tolerate "buffer too small", as we've got the bits we need. */
+        if ( errno == ENOBUFS )
+            rc = 0;
+        else
+        {
+            PERROR("Failed to obtain host featureset");
+            rc = -errno;
+            goto out;
+        }
+    }
+
+    /* Get the domain's default policy. */
+    nr_msrs = 0;
+    rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_default
+                                              : XEN_SYSCTL_cpu_policy_pv_default,
+                                  &nr_leaves, leaves, &nr_msrs, NULL);
+    if ( rc )
+    {
+        PERROR("Failed to obtain %s default policy", di.hvm ? "hvm" : "pv");
+        rc = -errno;
+        goto out;
+    }
+
+    rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
+                                    &err_leaf, &err_subleaf);
+    if ( rc )
+    {
+        ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
+              err_leaf, err_subleaf, -rc, strerror(-rc));
+        goto out;
+    }
+
+    /*
+     * Account for features which have been disabled by default since Xen
+     * 4.13, so migrated-in VMs don't risk seeing features disappearing.
+     */
+    if ( restore )
+    {
+        p->basic.rdrand = test_bit(X86_FEATURE_RDRAND, host_featureset);
+
+        if ( di.hvm )
+        {
+            p->feat.mpx = test_bit(X86_FEATURE_MPX, host_featureset);
+        }
+    }
+
+    if ( featureset )
+    {
+        uint32_t disabled_features[FEATURESET_NR_ENTRIES],
+            feat[FEATURESET_NR_ENTRIES] = {};
+        static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
+        unsigned int i, b;
+
+        /*
+         * The user supplied featureset may be shorter or longer than
+         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
+         * Longer is fine, so long as it is only padded with zeros.
+         */
+        unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
+
+        /* Check for truncated set bits. */
+        rc = -EOPNOTSUPP;
+        for ( i = user_len; i < nr_features; ++i )
+            if ( featureset[i] != 0 )
+                goto out;
+
+        memcpy(feat, featureset, sizeof(*featureset) * user_len);
+
+        /*
+         * Disable deep dependencies of disabled features.
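+         * A cleared feature must have its dependents cleared too, or the
+         * guest would see an inconsistent featureset: e.g. a set without
+         * XSAVE must not advertise AVX.  x86_cpuid_lookup_deep_deps()
+         * yields, for each disabled bit, the mask of features which
+         * depend on it.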
+         */
+        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+            disabled_features[i] = ~feat[i] & deep_features[i];
+
+        for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
+        {
+            const uint32_t *dfs;
+
+            if ( !test_bit(b, disabled_features) ||
+                 !(dfs = x86_cpuid_lookup_deep_deps(b)) )
+                continue;
+
+            for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+            {
+                feat[i] &= ~dfs[i];
+                disabled_features[i] &= ~dfs[i];
+            }
+        }
+
+        cpuid_featureset_to_policy(feat, p);
+    }
+    else
+    {
+        if ( di.hvm )
+            p->basic.pae = pae;
+    }
+
+    if ( !di.hvm )
+    {
+        /*
+         * On hardware without CPUID Faulting, PV guests see real topology.
+         * As a consequence, they also need to see the host htt/cmp fields.
+         */
+        p->basic.htt = test_bit(X86_FEATURE_HTT, host_featureset);
+        p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
+    }
+    else
+    {
+        /*
+         * Topology for HVM guests is entirely controlled by Xen.  For now, we
+         * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
+         */
+        p->basic.htt = true;
+        p->extd.cmp_legacy = false;
+
+        /*
+         * Leaf 1 EBX[23:16] is Maximum Logical Processors Per Package.
+         * Update to reflect vLAPIC_ID = vCPU_ID * 2, but make sure to avoid
+         * overflow.
+         */
+        if ( !(p->basic.lppp & 0x80) )
+            p->basic.lppp *= 2;
+
+        switch ( p->x86_vendor )
+        {
+        case X86_VENDOR_INTEL:
+            /* Bounds check before reading subleaf[i].type. */
+            for ( i = 0; (i < ARRAY_SIZE(p->cache.raw) &&
+                          p->cache.subleaf[i].type); ++i )
+            {
+                p->cache.subleaf[i].cores_per_package =
+                    (p->cache.subleaf[i].cores_per_package << 1) | 1;
+                p->cache.subleaf[i].threads_per_cache = 0;
+            }
+            break;
+
+        case X86_VENDOR_AMD:
+        case X86_VENDOR_HYGON:
+            /*
+             * Leaf 0x80000008 ECX[15:12] is ApicIdCoreSize.
+             * Leaf 0x80000008 ECX[7:0] is NumberOfCores (minus one).
+             * Update to reflect vLAPIC_ID = vCPU_ID * 2.  But avoid
+             * - overflow,
+             * - going out of sync with leaf 1 EBX[23:16],
+             * - incrementing ApicIdCoreSize when it's zero (which changes the
+             *   meaning of bits 7:0).
+             *
+             * UPDATE: In addition to avoiding overflow, some
+             * proprietary operating systems have trouble with
+             * apic_id_size values greater than 7.  Limit the value to
+             * 7 for now.
+             */
+            if ( p->extd.nc < 0x7f )
+            {
+                if ( p->extd.apic_id_size != 0 && p->extd.apic_id_size < 0x7 )
+                    p->extd.apic_id_size++;
+
+                p->extd.nc = (p->extd.nc << 1) | 1;
+            }
+            break;
+        }
+
+        /*
+         * These settings are necessary to cause earlier HVM_PARAM_NESTEDHVM /
+         * XEN_DOMCTL_disable_migrate settings to be reflected correctly in
+         * CPUID.  Xen will discard these bits if configuration hasn't been
+         * set for the domain.
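+         * (itsc pairs with XEN_DOMCTL_disable_migrate, vmx/svm with
+         * HVM_PARAM_NESTEDHVM.)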
+ */ + p->extd.itsc = true; + p->basic.vmx = true; + p->extd.svm = true; + } + + rc = x86_cpuid_copy_to_buffer(p, leaves, &nr_leaves); + if ( rc ) + { + ERROR("Failed to serialise CPUID (%d = %s)", -rc, strerror(-rc)); + goto out; + } + + rc = xc_set_domain_cpu_policy(xch, domid, nr_leaves, leaves, 0, NULL, + &err_leaf, &err_subleaf, &err_msr); + if ( rc ) + { + PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)", + domid, err_leaf, err_subleaf, err_msr); + rc = -errno; + goto out; + } + + if ( xend && (rc = xc_cpuid_xend_policy(xch, domid, xend)) ) + goto out; + + rc = 0; + +out: + free(p); + free(leaves); + + return rc; +} diff --git a/tools/libs/guest/xg_dom_arm.c b/tools/libs/guest/xg_dom_arm.c new file mode 100644 index 0000000000..3f66f1d890 --- /dev/null +++ b/tools/libs/guest/xg_dom_arm.c @@ -0,0 +1,552 @@ +/* + * Xen domain builder -- ARM + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * Copyright (c) 2011, Citrix Systems + */ +#include +#include + +#include +#include +#include + +#include "xg_private.h" +#include "xenctrl_dom.h" + +#define NR_MAGIC_PAGES 4 +#define CONSOLE_PFN_OFFSET 0 +#define XENSTORE_PFN_OFFSET 1 +#define MEMACCESS_PFN_OFFSET 2 +#define VUART_PFN_OFFSET 3 + +#define LPAE_SHIFT 9 + +#define PFN_4K_SHIFT (0) +#define PFN_2M_SHIFT (PFN_4K_SHIFT+LPAE_SHIFT) +#define PFN_1G_SHIFT (PFN_2M_SHIFT+LPAE_SHIFT) +#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT) + +/* get guest IO ABI protocol */ +const char *xc_domain_get_native_protocol(xc_interface *xch, + uint32_t domid) +{ + return XEN_IO_PROTO_ABI_ARM; +} + +/* ------------------------------------------------------------------------ */ + +static int alloc_magic_pages(struct xc_dom_image *dom) +{ + int rc, i; + const xen_pfn_t base = GUEST_MAGIC_BASE >> XC_PAGE_SHIFT; + xen_pfn_t p2m[NR_MAGIC_PAGES]; + + BUILD_BUG_ON(NR_MAGIC_PAGES > GUEST_MAGIC_SIZE >> XC_PAGE_SHIFT); + + DOMPRINTF_CALLED(dom->xch); + + for (i = 0; i < NR_MAGIC_PAGES; i++) + p2m[i] = base + i; + + rc = xc_domain_populate_physmap_exact( + dom->xch, dom->guest_domid, NR_MAGIC_PAGES, + 0, 0, p2m); + if ( rc < 0 ) + return rc; + + dom->console_pfn = base + CONSOLE_PFN_OFFSET; + dom->xenstore_pfn = base + XENSTORE_PFN_OFFSET; + dom->vuart_gfn = base + VUART_PFN_OFFSET; + + xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn); + xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn); + xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET); + xc_clear_domain_page(dom->xch, dom->guest_domid, dom->vuart_gfn); + + xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN, + dom->console_pfn); + xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN, + dom->xenstore_pfn); + xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MONITOR_RING_PFN, + base + MEMACCESS_PFN_OFFSET); + /* allocated by toolstack */ + xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_EVTCHN, + dom->console_evtchn); + 
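/*
+     * The guest and the rest of the toolstack discover the magic pages
+     * and their event channels via these HVM params, so all of them must
+     * be in place before the domain is unpaused.
+     */
+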
xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN, + dom->xenstore_evtchn); + + return 0; +} + +/* ------------------------------------------------------------------------ */ + +static int start_info_arm(struct xc_dom_image *dom) +{ + DOMPRINTF_CALLED(dom->xch); + return 0; +} + +static int shared_info_arm(struct xc_dom_image *dom, void *ptr) +{ + DOMPRINTF_CALLED(dom->xch); + return 0; +} + +/* ------------------------------------------------------------------------ */ + +static int vcpu_arm32(struct xc_dom_image *dom) +{ + vcpu_guest_context_any_t any_ctx; + vcpu_guest_context_t *ctxt = &any_ctx.c; + int rc; + + DOMPRINTF_CALLED(dom->xch); + + /* clear everything */ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->user_regs.pc32 = dom->parms.virt_entry; + + /* Linux boot protocol. See linux.Documentation/arm/Booting. */ + ctxt->user_regs.r0_usr = 0; /* SBZ */ + /* Machine ID: We use DTB therefore no machine id */ + ctxt->user_regs.r1_usr = 0xffffffff; + /* ATAGS/DTB: We currently require that the guest kernel to be + * using CONFIG_ARM_APPENDED_DTB. Ensure that r2 does not look + * like a valid pointer to a set of ATAGS or a DTB. + */ + ctxt->user_regs.r2_usr = dom->devicetree_blob ? + dom->devicetree_seg.vstart : 0xffffffff; + + ctxt->sctlr = SCTLR_GUEST_INIT; + + ctxt->ttbr0 = 0; + ctxt->ttbr1 = 0; + ctxt->ttbcr = 0; /* Defined Reset Value */ + + ctxt->user_regs.cpsr = PSR_GUEST32_INIT; + + ctxt->flags = VGCF_online; + + DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx32, + ctxt->user_regs.cpsr, ctxt->user_regs.pc32); + + rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); + + return rc; +} + +static int vcpu_arm64(struct xc_dom_image *dom) +{ + vcpu_guest_context_any_t any_ctx; + vcpu_guest_context_t *ctxt = &any_ctx.c; + int rc; + + DOMPRINTF_CALLED(dom->xch); + /* clear everything */ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->user_regs.pc64 = dom->parms.virt_entry; + + /* Linux boot protocol. See linux.Documentation/arm64/booting.txt. */ + ctxt->user_regs.x0 = dom->devicetree_blob ? 
+        dom->devicetree_seg.vstart : 0xffffffff;
+    ctxt->user_regs.x1 = 0;
+    ctxt->user_regs.x2 = 0;
+    ctxt->user_regs.x3 = 0;
+
+    DOMPRINTF("DTB %"PRIx64, ctxt->user_regs.x0);
+
+    ctxt->sctlr = SCTLR_GUEST_INIT;
+
+    ctxt->ttbr0 = 0;
+    ctxt->ttbr1 = 0;
+    ctxt->ttbcr = 0; /* Defined Reset Value */
+
+    ctxt->user_regs.cpsr = PSR_GUEST64_INIT;
+
+    ctxt->flags = VGCF_online;
+
+    DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx64,
+              ctxt->user_regs.cpsr, ctxt->user_regs.pc64);
+
+    rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
+    if ( rc != 0 )
+        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+                     "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
+
+    return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int set_mode(xc_interface *xch, uint32_t domid, char *guest_type)
+{
+    static const struct {
+        char *guest;
+        uint32_t size;
+    } types[] = {
+        { "xen-3.0-aarch64", 64 },
+        { "xen-3.0-armv7l", 32 },
+    };
+    DECLARE_DOMCTL;
+    int i, rc;
+
+    domctl.domain = domid;
+    domctl.cmd = XEN_DOMCTL_set_address_size;
+    domctl.u.address_size.size = 0;
+
+    for ( i = 0; i < ARRAY_SIZE(types); i++ )
+        if ( !strcmp(types[i].guest, guest_type) )
+            domctl.u.address_size.size = types[i].size;
+    if ( domctl.u.address_size.size == 0 )
+    {
+        xc_dom_printf(xch, "%s: warning: unknown guest type %s",
+                      __FUNCTION__, guest_type);
+        return -EINVAL;
+    }
+
+    xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
+                  guest_type, domctl.u.address_size.size);
+    rc = do_domctl(xch, &domctl);
+    if ( rc != 0 )
+        xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
+                      __FUNCTION__, rc);
+    return rc;
+}
+
+/* >0: success, *nr_pfns set to number actually populated
+ *  0: didn't try with this pfn shift (e.g. misaligned base etc)
+ * <0: ERROR
+ */
+static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
+                             xen_pfn_t base_pfn, xen_pfn_t *nr_pfns,
+                             xen_pfn_t *extents)
+{
+    /* The mask for this level */
+    const uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
+    /* The shift, mask and next boundary for the level above this one */
+    const int next_shift = pfn_shift + LPAE_SHIFT;
+    const uint64_t next_mask = ((uint64_t)1<<next_shift)-1;
+    const xen_pfn_t next_boundary
+        = (base_pfn + ((xen_pfn_t)1<<next_shift)) & ~next_mask;
+
+    int nr, i, count;
+    xen_pfn_t end_pfn = base_pfn + *nr_pfns;
+
+    /* No level zero super pages with current hardware */
+    if ( pfn_shift == PFN_512G_SHIFT )
+        return 0;
+
+    /* base is misaligned for this level */
+    if ( mask & base_pfn )
+        return 0;
+
+    /* clip the allocation to the boundary of the level above this one */
+    if ( end_pfn > next_boundary )
+        end_pfn = next_boundary;
+
+    count = ( end_pfn - base_pfn ) >> pfn_shift;
+
+    /* Nothing to allocate */
+    if ( !count )
+        return 0;
+
+    for ( i = 0 ; i < count ; i ++ )
+        extents[i] = base_pfn + (i<<pfn_shift);
+
+    nr = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
+                                    pfn_shift, 0, extents);
+    if ( nr <= 0 ) return nr;
+    DOMPRINTF("%s: populated %#x/%#x entries with shift %d",
+              __FUNCTION__, nr, count, pfn_shift);
+
+    *nr_pfns = nr << pfn_shift;
+
+    return 1;
+}
+
+static int populate_guest_memory(struct xc_dom_image *dom,
+                                 xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
+{
+    int rc = 0;
+    xen_pfn_t allocsz, pfn, *extents;
+
+    extents = calloc(1024*1024, sizeof(xen_pfn_t));
+    if ( extents == NULL )
+    {
+        DOMPRINTF("%s: Unable to allocate extent array", __FUNCTION__);
+        return -1;
+    }
+
+    DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
+              __FUNCTION__,
+              (uint64_t)base_pfn << XC_PAGE_SHIFT,
+              (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
+              (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
+
+    for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
+    {
+        allocsz = min_t(int, 1024*1024, nr_pfns - pfn);
+#if 0 /* Enable this to exercise/debug the code which tries to realign
+       * to a superpage boundary, by misaligning at the start.
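+       * Allocating a single 4K page first leaves base_pfn + 1 misaligned
+       * for every larger shift, forcing the main loop through the smaller
+       * sizes until the next superpage boundary is reached.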
*/ + if ( pfn == 0 ) + { + allocsz = 1; + rc = populate_one_size(dom, PFN_4K_SHIFT, + base_pfn + pfn, &allocsz, extents); + if (rc < 0) break; + if (rc > 0) continue; + /* Failed to allocate a single page? */ + break; + } +#endif + + rc = populate_one_size(dom, PFN_512G_SHIFT, + base_pfn + pfn, &allocsz, extents); + if ( rc < 0 ) break; + if ( rc > 0 ) continue; + + rc = populate_one_size(dom, PFN_1G_SHIFT, + base_pfn + pfn, &allocsz, extents); + if ( rc < 0 ) break; + if ( rc > 0 ) continue; + + rc = populate_one_size(dom, PFN_2M_SHIFT, + base_pfn + pfn, &allocsz, extents); + if ( rc < 0 ) break; + if ( rc > 0 ) continue; + + rc = populate_one_size(dom, PFN_4K_SHIFT, + base_pfn + pfn, &allocsz, extents); + if ( rc < 0 ) break; + if ( rc == 0 ) + { + DOMPRINTF("%s: Not enough RAM", __FUNCTION__); + errno = ENOMEM; + rc = -1; + goto out; + } + } + +out: + free(extents); + return rc < 0 ? rc : 0; +} + +static int meminit(struct xc_dom_image *dom) +{ + int i, rc; + uint64_t modbase; + + uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT; + + const uint64_t bankbase[] = GUEST_RAM_BANK_BASES; + const uint64_t bankmax[] = GUEST_RAM_BANK_SIZES; + + /* Convenient */ + const uint64_t kernbase = dom->kernel_seg.vstart; + const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/); + const uint64_t kernsize = kernend - kernbase; + const uint64_t dtb_size = dom->devicetree_blob ? + ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0; + const uint64_t ramdisk_size = dom->modules[0].blob ? + ROUNDUP(dom->modules[0].size, XC_PAGE_SHIFT) : 0; + const uint64_t modsize = dtb_size + ramdisk_size; + const uint64_t ram128mb = bankbase[0] + (128<<20); + + xen_pfn_t p2m_size; + uint64_t bank0end; + + assert(dom->rambase_pfn << XC_PAGE_SHIFT == bankbase[0]); + + if ( modsize + kernsize > bankmax[0] ) + { + DOMPRINTF("%s: Not enough memory for the kernel+dtb+initrd", + __FUNCTION__); + return -1; + } + + if ( ramsize == 0 ) + { + DOMPRINTF("%s: ram size is 0", __FUNCTION__); + return -1; + } + + if ( ramsize > GUEST_RAM_MAX ) + { + DOMPRINTF("%s: ram size is too large for guest address space: " + "%"PRIx64" > %llx", + __FUNCTION__, ramsize, GUEST_RAM_MAX); + return -1; + } + + rc = set_mode(dom->xch, dom->guest_domid, dom->guest_type); + if ( rc ) + return rc; + + for ( i = 0; ramsize && i < GUEST_RAM_BANKS; i++ ) + { + uint64_t banksize = ramsize > bankmax[i] ? bankmax[i] : ramsize; + + ramsize -= banksize; + + p2m_size = ( bankbase[i] + banksize - bankbase[0] ) >> XC_PAGE_SHIFT; + + dom->rambank_size[i] = banksize >> XC_PAGE_SHIFT; + } + + assert(dom->rambank_size[0] != 0); + assert(ramsize == 0); /* Too much RAM is rejected above */ + + dom->p2m_size = p2m_size; + + /* setup initial p2m and allocate guest memory */ + for ( i = 0; i < GUEST_RAM_BANKS && dom->rambank_size[i]; i++ ) + { + if ((rc = populate_guest_memory(dom, + bankbase[i] >> XC_PAGE_SHIFT, + dom->rambank_size[i]))) + return rc; + } + + /* + * We try to place dtb+initrd at 128MB or if we have less RAM + * as high as possible. If there is no space then fallback to + * just before the kernel. + * + * If changing this then consider + * xen/arch/arm/kernel.c:place_modules as well. 
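+     * For example, with a 256MB bank 0 and a kernel ending below 128MB,
+     * the modules land exactly at 128MB; with a 64MB bank 0 they are
+     * placed at bank0end - modsize instead.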
+     */
+    bank0end = bankbase[0] + ((uint64_t)dom->rambank_size[0] << XC_PAGE_SHIFT);
+
+    if ( bank0end >= ram128mb + modsize && kernend < ram128mb )
+        modbase = ram128mb;
+    else if ( bank0end - modsize > kernend )
+        modbase = bank0end - modsize;
+    else if ( kernbase - bankbase[0] > modsize )
+        modbase = kernbase - modsize;
+    else
+        return -1;
+
+    DOMPRINTF("%s: placing boot modules at 0x%" PRIx64, __FUNCTION__, modbase);
+
+    /*
+     * Must map DTB *after* initrd, to satisfy order of calls to
+     * xc_dom_alloc_segment in xc_dom_build_image, which must map
+     * things at monotonically increasing addresses.
+     */
+    if ( ramdisk_size )
+    {
+        dom->modules[0].seg.vstart = modbase;
+        dom->modules[0].seg.vend = modbase + ramdisk_size;
+
+        DOMPRINTF("%s: ramdisk: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+                  __FUNCTION__,
+                  dom->modules[0].seg.vstart, dom->modules[0].seg.vend);
+
+        modbase += ramdisk_size;
+    }
+
+    if ( dtb_size )
+    {
+        dom->devicetree_seg.vstart = modbase;
+        dom->devicetree_seg.vend = modbase + dtb_size;
+
+        DOMPRINTF("%s: devicetree: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+                  __FUNCTION__,
+                  dom->devicetree_seg.vstart, dom->devicetree_seg.vend);
+
+        modbase += dtb_size;
+    }
+
+    return 0;
+}
+
+bool xc_dom_translated(const struct xc_dom_image *dom)
+{
+    return true;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int bootearly(struct xc_dom_image *dom)
+{
+    DOMPRINTF("%s: doing nothing", __FUNCTION__);
+    return 0;
+}
+
+static int bootlate(struct xc_dom_image *dom)
+{
+    /* XXX
+     *   map shared info
+     *   map grant tables
+     *   setup shared info
+     */
+    return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct xc_dom_arch xc_dom_32 = {
+    .guest_type = "xen-3.0-armv7l",
+    .native_protocol = XEN_IO_PROTO_ABI_ARM,
+    .page_shift = PAGE_SHIFT_ARM,
+    .sizeof_pfn = 8,
+    .alloc_magic_pages = alloc_magic_pages,
+    .start_info = start_info_arm,
+    .shared_info = shared_info_arm,
+    .vcpu = vcpu_arm32,
+    .meminit = meminit,
+    .bootearly = bootearly,
+    .bootlate = bootlate,
+};
+
+static struct xc_dom_arch xc_dom_64 = {
+    .guest_type = "xen-3.0-aarch64",
+    .native_protocol = XEN_IO_PROTO_ABI_ARM,
+    .page_shift = PAGE_SHIFT_ARM,
+    .sizeof_pfn = 8,
+    .alloc_magic_pages = alloc_magic_pages,
+    .start_info = start_info_arm,
+    .shared_info = shared_info_arm,
+    .vcpu = vcpu_arm64,
+    .meminit = meminit,
+    .bootearly = bootearly,
+    .bootlate = bootlate,
+};
+
+static void __init register_arch_hooks(void)
+{
+    xc_dom_register_arch_hooks(&xc_dom_32);
+    xc_dom_register_arch_hooks(&xc_dom_64);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_dom_armzimageloader.c b/tools/libs/guest/xg_dom_armzimageloader.c
new file mode 100644
index 0000000000..4246c8e5fa
--- /dev/null
+++ b/tools/libs/guest/xg_dom_armzimageloader.c
@@ -0,0 +1,271 @@
+/*
+ * Xen domain builder -- ARM zImage bits
+ *
+ * Parse and load ARM zImage kernel images.
+ *
+ * Copyright (C) 2012, Citrix Systems.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + */ + +#include +#include +#include + +#include "xg_private.h" +#include "xenctrl_dom.h" + +#include /* XXX ntohl is not the right function... */ + +struct minimal_dtb_header { + uint32_t magic; + uint32_t total_size; + /* There are other fields but we don't use them yet. */ +}; + +#define DTB_MAGIC 0xd00dfeed + +/* ------------------------------------------------------------ */ +/* 32-bit zImage Support */ +/* ------------------------------------------------------------ */ + +#define ZIMAGE32_MAGIC_OFFSET 0x24 +#define ZIMAGE32_START_OFFSET 0x28 +#define ZIMAGE32_END_OFFSET 0x2c + +#define ZIMAGE32_MAGIC 0x016f2818 + +static int xc_dom_probe_zimage32_kernel(struct xc_dom_image *dom) +{ + uint32_t *zimage; + + if ( dom->kernel_blob == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: no kernel image loaded", __FUNCTION__); + return -EINVAL; + } + + if ( dom->kernel_size < 0x30 /*sizeof(struct setup_header)*/ ) + { + xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); + return -EINVAL; + } + + zimage = (uint32_t *)dom->kernel_blob; + if ( zimage[ZIMAGE32_MAGIC_OFFSET/4] != ZIMAGE32_MAGIC ) + { + xc_dom_printf(dom->xch, "%s: kernel is not an arm32 zImage", __FUNCTION__); + return -EINVAL; + } + + return 0; +} + +static int xc_dom_parse_zimage32_kernel(struct xc_dom_image *dom) +{ + uint32_t *zimage; + uint32_t start, entry_addr; + uint64_t v_start, v_end; + uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT; + + DOMPRINTF_CALLED(dom->xch); + + zimage = (uint32_t *)dom->kernel_blob; + + /* Do not load kernel at the very first RAM address */ + v_start = rambase + 0x8000; + + if ( dom->kernel_size > UINT64_MAX - v_start ) + { + DOMPRINTF("%s: kernel is too large\n", __FUNCTION__); + return -EINVAL; + } + + v_end = v_start + dom->kernel_size; + + /* + * If start is invalid then the guest will start at some invalid + * address and crash, but this happens in guest context so doesn't + * concern us here. 
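+     * A zero start field means the zImage is position independent, in
+     * which case it is entered at its load address, v_start.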
+ */ + start = zimage[ZIMAGE32_START_OFFSET/4]; + + if (start == 0) + entry_addr = v_start; + else + entry_addr = start; + + /* find kernel segment */ + dom->kernel_seg.vstart = v_start; + dom->kernel_seg.vend = v_end; + + dom->parms.virt_entry = entry_addr; + dom->parms.virt_base = rambase; + + dom->guest_type = "xen-3.0-armv7l"; + DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", + __FUNCTION__, dom->guest_type, + dom->kernel_seg.vstart, dom->kernel_seg.vend); + return 0; +} + +/* ------------------------------------------------------------ */ +/* 64-bit zImage Support */ +/* ------------------------------------------------------------ */ + +#define ZIMAGE64_MAGIC_V0 0x14000008 +#define ZIMAGE64_MAGIC_V1 0x644d5241 /* "ARM\x64" */ + +/* linux/Documentation/arm64/booting.txt */ +struct zimage64_hdr { + uint32_t magic0; + uint32_t res0; + uint64_t text_offset; /* Image load offset */ + uint64_t res1; + uint64_t res2; + /* zImage V1 only from here */ + uint64_t res3; + uint64_t res4; + uint64_t res5; + uint32_t magic1; + uint32_t res6; +}; +static int xc_dom_probe_zimage64_kernel(struct xc_dom_image *dom) +{ + struct zimage64_hdr *zimage; + + if ( dom->kernel_blob == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: no kernel image loaded", __FUNCTION__); + return -EINVAL; + } + + if ( dom->kernel_size < sizeof(*zimage) ) + { + xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); + return -EINVAL; + } + + zimage = dom->kernel_blob; + if ( zimage->magic0 != ZIMAGE64_MAGIC_V0 && + zimage->magic1 != ZIMAGE64_MAGIC_V1 ) + { + xc_dom_printf(dom->xch, "%s: kernel is not an arm64 Image", __FUNCTION__); + return -EINVAL; + } + + return 0; +} + +static int xc_dom_parse_zimage64_kernel(struct xc_dom_image *dom) +{ + struct zimage64_hdr *zimage; + uint64_t v_start, v_end; + uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT; + + DOMPRINTF_CALLED(dom->xch); + + zimage = dom->kernel_blob; + + if ( zimage->text_offset > UINT64_MAX - rambase ) + { + DOMPRINTF("%s: kernel text offset is too large\n", __FUNCTION__); + return -EINVAL; + } + + v_start = rambase + zimage->text_offset; + + if ( dom->kernel_size > UINT64_MAX - v_start ) + { + DOMPRINTF("%s: kernel is too large\n", __FUNCTION__); + return -EINVAL; + } + + v_end = v_start + dom->kernel_size; + + dom->kernel_seg.vstart = v_start; + dom->kernel_seg.vend = v_end; + + /* Call the kernel at offset 0 */ + dom->parms.virt_entry = v_start; + dom->parms.virt_base = rambase; + + dom->guest_type = "xen-3.0-aarch64"; + DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", + __FUNCTION__, dom->guest_type, + dom->kernel_seg.vstart, dom->kernel_seg.vend); + + return 0; +} + +/* ------------------------------------------------------------ */ +/* Common zImage Support */ +/* ------------------------------------------------------------ */ + +static int xc_dom_load_zimage_kernel(struct xc_dom_image *dom) +{ + void *dst; + + DOMPRINTF_CALLED(dom->xch); + + dst = xc_dom_seg_to_ptr(dom, &dom->kernel_seg); + if ( dst == NULL ) + { + DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->kernel_seg) => NULL", + __func__); + return -1; + } + + DOMPRINTF("%s: kernel seg %#"PRIx64"-%#"PRIx64, + __func__, dom->kernel_seg.vstart, dom->kernel_seg.vend); + DOMPRINTF("%s: copy %zd bytes from blob %p to dst %p", + __func__, dom->kernel_size, dom->kernel_blob, dst); + + memcpy(dst, dom->kernel_blob, dom->kernel_size); + + return 0; +} + +static struct xc_dom_loader zimage32_loader = { + .name = "Linux zImage (ARM32)", + .probe = xc_dom_probe_zimage32_kernel, + 
.parser = xc_dom_parse_zimage32_kernel, + .loader = xc_dom_load_zimage_kernel, +}; + +static struct xc_dom_loader zimage64_loader = { + .name = "Linux zImage (ARM64)", + .probe = xc_dom_probe_zimage64_kernel, + .parser = xc_dom_parse_zimage64_kernel, + .loader = xc_dom_load_zimage_kernel, +}; + +static void __init register_loader(void) +{ + xc_dom_register_loader(&zimage32_loader); + xc_dom_register_loader(&zimage64_loader); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_binloader.c b/tools/libs/guest/xg_dom_binloader.c new file mode 100644 index 0000000000..870a921427 --- /dev/null +++ b/tools/libs/guest/xg_dom_binloader.c @@ -0,0 +1,329 @@ +/* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * Some of the field descriptions were copied from "The Multiboot + * Specification", Copyright 1995, 96 Bryan Ford , + * Erich Stefan Boleyn Copyright 1999, 2000, 2001, 2002 + * Free Software Foundation, Inc. + */ + +/****************************************************************************** + * + * Loads simple binary images. It's like a .COM file in MS-DOS. No headers are + * present. The only requirement is that it must have a xen_bin_image table + * somewhere in the first 8192 bytes, starting on a 32-bit aligned address. + * Those familiar with the multiboot specification should recognize this, it's + * (almost) the same as the multiboot header. + * The layout of the xen_bin_image table is: + * + * Offset Type Name Note + * 0 uint32_t magic required + * 4 uint32_t flags required + * 8 uint32_t checksum required + * 12 uint32_t header_addr required + * 16 uint32_t load_addr required + * 20 uint32_t load_end_addr required + * 24 uint32_t bss_end_addr required + * 28 uint32_t entry_addr required + * + * - magic + * Magic number identifying the table. For images to be loaded by Xen 3, the + * magic value is 0x336ec578 ("xEn3" with the 0x80 bit of the "E" set). + * - flags + * bit 0: indicates whether the image needs to be loaded on a page boundary + * bit 1: reserved, must be 0 (the multiboot spec uses this bit to indicate + * that memory info should be passed to the image) + * bit 2: reserved, must be 0 (the multiboot spec uses this bit to indicate + * that the bootloader should pass video mode info to the image) + * bit 16: reserved, must be 1 (the multiboot spec uses this bit to indicate + * that the values in the fields header_addr - entry_addr are + * valid) + * All other bits should be set to 0. + * - checksum + * When added to "magic" and "flags", the resulting value should be 0. + * - header_addr + * Contains the virtual address corresponding to the beginning of the + * table - the memory location at which the magic value is supposed to be + * loaded. This field serves to synchronize the mapping between OS image + * offsets and virtual memory addresses. 
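+ *   (Equivalently: the file offset at which the table was found equals
+ *   header_addr - load_addr, which is how xc_dom_parse_bin_kernel()
+ *   below recovers start_addr.)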
+ * - load_addr + * Contains the virtual address of the beginning of the text segment. The + * offset in the OS image file at which to start loading is defined by the + * offset at which the table was found, minus (header addr - load addr). + * load addr must be less than or equal to header addr. + * - load_end_addr + * Contains the virtual address of the end of the data segment. + * (load_end_addr - load_addr) specifies how much data to load. This implies + * that the text and data segments must be consecutive in the OS image. If + * this field is zero, the domain builder assumes that the text and data + * segments occupy the whole OS image file. + * - bss_end_addr + * Contains the virtual address of the end of the bss segment. The domain + * builder initializes this area to zero, and reserves the memory it occupies + * to avoid placing boot modules and other data relevant to the loaded image + * in that area. If this field is zero, the domain builder assumes that no bss + * segment is present. + * - entry_addr + * The virtual address at which to start execution of the loaded image. + * + */ + +#include +#include + +#include "xg_private.h" +#include "xenctrl_dom.h" + +#define round_pgup(_p) (((_p)+(PAGE_SIZE_X86-1))&PAGE_MASK_X86) +#define round_pgdown(_p) ((_p)&PAGE_MASK_X86) + +struct xen_bin_image_table +{ + uint32_t magic; + uint32_t flags; + uint32_t checksum; + uint32_t header_addr; + uint32_t load_addr; + uint32_t load_end_addr; + uint32_t bss_end_addr; + uint32_t entry_addr; +}; + +#define XEN_MULTIBOOT_MAGIC3 0x336ec578 + +#define XEN_MULTIBOOT_FLAG_ALIGN4K 0x00000001 +#define XEN_MULTIBOOT_FLAG_NEEDMEMINFO 0x00000002 +#define XEN_MULTIBOOT_FLAG_NEEDVIDINFO 0x00000004 +#define XEN_MULTIBOOT_FLAG_ADDRSVALID 0x00010000 +#define XEN_MULTIBOOT_FLAG_PAE_SHIFT 14 +#define XEN_MULTIBOOT_FLAG_PAE_MASK (3 << XEN_MULTIBOOT_FLAG_PAE_SHIFT) + +/* Flags we test for */ +#define FLAGS_MASK ((~ 0) & (~ XEN_MULTIBOOT_FLAG_ALIGN4K) & \ + (~ XEN_MULTIBOOT_FLAG_PAE_MASK)) +#define FLAGS_REQUIRED XEN_MULTIBOOT_FLAG_ADDRSVALID + +/* --------------------------------------------------------------------- */ + +static struct xen_bin_image_table *find_table(struct xc_dom_image *dom) +{ + struct xen_bin_image_table *table; + uint32_t *probe_ptr; + uint32_t *probe_end; + + if ( dom->kernel_size < sizeof(*table) ) + return NULL; + probe_ptr = dom->kernel_blob; + if ( dom->kernel_size > (8192 + sizeof(*table)) ) + probe_end = dom->kernel_blob + 8192; + else + probe_end = dom->kernel_blob + dom->kernel_size - sizeof(*table); + + for ( table = NULL; probe_ptr < probe_end; probe_ptr++ ) + { + if ( *probe_ptr == XEN_MULTIBOOT_MAGIC3 ) + { + table = (struct xen_bin_image_table *) probe_ptr; + /* Checksum correct? */ + if ( (table->magic + table->flags + table->checksum) == 0 ) + return table; + } + } + return NULL; +} + +static int xc_dom_probe_bin_kernel(struct xc_dom_image *dom) +{ + return find_table(dom) ? 
0 : -EINVAL;
+}
+
+static int xc_dom_parse_bin_kernel(struct xc_dom_image *dom)
+{
+    struct xen_bin_image_table *image_info;
+    char *image = dom->kernel_blob;
+    size_t image_size = dom->kernel_size;
+    uint32_t start_addr;
+    uint32_t load_end_addr;
+    uint32_t bss_end_addr;
+    uint32_t pae_flags;
+
+    image_info = find_table(dom);
+    if ( !image_info )
+        return -EINVAL;
+
+    DOMPRINTF("%s: multiboot header fields", __FUNCTION__);
+    DOMPRINTF("  flags:         0x%" PRIx32 "", image_info->flags);
+    DOMPRINTF("  header_addr:   0x%" PRIx32 "", image_info->header_addr);
+    DOMPRINTF("  load_addr:     0x%" PRIx32 "", image_info->load_addr);
+    DOMPRINTF("  load_end_addr: 0x%" PRIx32 "", image_info->load_end_addr);
+    DOMPRINTF("  bss_end_addr:  0x%" PRIx32 "", image_info->bss_end_addr);
+    DOMPRINTF("  entry_addr:    0x%" PRIx32 "", image_info->entry_addr);
+
+    /* Check the flags */
+    if ( (image_info->flags & FLAGS_MASK) != FLAGS_REQUIRED )
+    {
+        xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                     "%s: xen_bin_image_table flags required "
+                     "0x%08" PRIx32 " found 0x%08" PRIx32 "",
+                     __FUNCTION__, FLAGS_REQUIRED, image_info->flags & FLAGS_MASK);
+        return -EINVAL;
+    }
+
+    /* Sanity check on the addresses */
+    if ( (image_info->header_addr < image_info->load_addr) ||
+         ((char *) image_info - image) <
+         (image_info->header_addr - image_info->load_addr) )
+    {
+        xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid header_addr.",
+                     __FUNCTION__);
+        return -EINVAL;
+    }
+
+    start_addr = image_info->header_addr - ((char *)image_info - image);
+    load_end_addr = image_info->load_end_addr ?: start_addr + image_size;
+    bss_end_addr = image_info->bss_end_addr ?: load_end_addr;
+
+    DOMPRINTF("%s: calculated addresses", __FUNCTION__);
+    DOMPRINTF("  start_addr:    0x%" PRIx32 "", start_addr);
+    DOMPRINTF("  load_end_addr: 0x%" PRIx32 "", load_end_addr);
+    DOMPRINTF("  bss_end_addr:  0x%" PRIx32 "", bss_end_addr);
+
+    if ( (start_addr + image_size) < load_end_addr )
+    {
+        xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid load_end_addr.",
+                     __FUNCTION__);
+        return -EINVAL;
+    }
+
+    if ( bss_end_addr < load_end_addr )
+    {
+        xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid bss_end_addr.",
+                     __FUNCTION__);
+        return -EINVAL;
+    }
+
+    dom->kernel_seg.vstart = image_info->load_addr;
+    dom->kernel_seg.vend   = bss_end_addr;
+    dom->parms.virt_base   = start_addr;
+    dom->parms.virt_entry  = image_info->entry_addr;
+
+    pae_flags = image_info->flags & XEN_MULTIBOOT_FLAG_PAE_MASK;
+    switch (pae_flags >> XEN_MULTIBOOT_FLAG_PAE_SHIFT) {
+    case 0:
+        dom->guest_type = "xen-3.0-x86_32";
+        break;
+    case 1:
+        dom->guest_type = "xen-3.0-x86_32p";
+        break;
+    case 2:
+        dom->guest_type = "xen-3.0-x86_64";
+        break;
+    case 3:
+        /* Kernel detects PAE at runtime.  So try to figure out whether
+         * xen supports PAE and advertise a PAE-capable kernel in case
+         * it does.
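+         * xen_caps is a space separated list of supported guest types,
+         * so a plain substring match for "xen-3.0-x86_32p" is enough
+         * here.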
*/ + dom->guest_type = "xen-3.0-x86_32"; + if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") ) + { + DOMPRINTF("%s: PAE fixup", __FUNCTION__); + dom->guest_type = "xen-3.0-x86_32p"; + dom->parms.pae = XEN_PAE_EXTCR3; + } + break; + } + return 0; +} + +static int xc_dom_load_bin_kernel(struct xc_dom_image *dom) +{ + struct xen_bin_image_table *image_info; + char *image = dom->kernel_blob; + char *dest; + size_t image_size = dom->kernel_size; + size_t dest_size; + uint32_t start_addr; + uint32_t load_end_addr; + uint32_t bss_end_addr; + uint32_t skip, text_size, bss_size; + + image_info = find_table(dom); + if ( !image_info ) + return -EINVAL; + + start_addr = image_info->header_addr - ((char *)image_info - image); + load_end_addr = image_info->load_end_addr ?: start_addr + image_size; + bss_end_addr = image_info->bss_end_addr ?: load_end_addr; + + /* It's possible that we need to skip the first part of the image */ + skip = image_info->load_addr - start_addr; + text_size = load_end_addr - image_info->load_addr; + bss_size = bss_end_addr - load_end_addr; + + DOMPRINTF("%s: calculated sizes", __FUNCTION__); + DOMPRINTF(" skip: 0x%" PRIx32 "", skip); + DOMPRINTF(" text_size: 0x%" PRIx32 "", text_size); + DOMPRINTF(" bss_size: 0x%" PRIx32 "", bss_size); + + dest = xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart, &dest_size); + if ( dest == NULL ) + { + DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart)" + " => NULL", __FUNCTION__); + return -EINVAL; + } + + if ( dest_size < text_size || + dest_size - text_size < bss_size ) + { + DOMPRINTF("%s: mapped region is too small for image", __FUNCTION__); + return -EINVAL; + } + + if ( image_size < skip || + image_size - skip < text_size ) + { + DOMPRINTF("%s: image is too small for declared text size", + __FUNCTION__); + return -EINVAL; + } + + memcpy(dest, image + skip, text_size); + memset(dest + text_size, 0, bss_size); + + return 0; +} + +/* ------------------------------------------------------------------------ */ + +static struct xc_dom_loader bin_loader = { + .name = "multiboot-binary", + .probe = xc_dom_probe_bin_kernel, + .parser = xc_dom_parse_bin_kernel, + .loader = xc_dom_load_bin_kernel, +}; + +static void __init register_loader(void) +{ + xc_dom_register_loader(&bin_loader); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_boot.c b/tools/libs/guest/xg_dom_boot.c new file mode 100644 index 0000000000..1e31e92244 --- /dev/null +++ b/tools/libs/guest/xg_dom_boot.c @@ -0,0 +1,451 @@ +/* + * Xen domain builder -- xen booter. + * + * This is the code which actually boots a fresh + * prepared domain image as xen guest domain. + * + * ==> this is the only domain builder code piece + * where xen hypercalls are allowed <== + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * written 2006 by Gerd Hoffmann . 
+ * + */ + +#include +#include +#include +#include +#include + +#include "xg_private.h" +#include "xenctrl_dom.h" +#include "xc_core.h" +#include +#include + +/* ------------------------------------------------------------------------ */ + +static int setup_hypercall_page(struct xc_dom_image *dom) +{ + DECLARE_DOMCTL; + xen_pfn_t pfn; + int rc; + + if ( dom->parms.virt_hypercall == -1 ) + return 0; + pfn = (dom->parms.virt_hypercall - dom->parms.virt_base) + >> XC_DOM_PAGE_SHIFT(dom); + + DOMPRINTF("%s: vaddr=0x%" PRIx64 " pfn=0x%" PRIpfn "", __FUNCTION__, + dom->parms.virt_hypercall, pfn); + domctl.cmd = XEN_DOMCTL_hypercall_init; + domctl.domain = dom->guest_domid; + domctl.u.hypercall_init.gmfn = xc_dom_p2m(dom, pfn); + rc = do_domctl(dom->xch, &domctl); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: HYPERCALL_INIT failed: %d - %s)", + __FUNCTION__, errno, strerror(errno)); + return rc; +} + + +/* ------------------------------------------------------------------------ */ + +int xc_dom_compat_check(struct xc_dom_image *dom) +{ + xen_capabilities_info_t xen_caps; + char *item, *ptr; + int match, found = 0; + + strncpy(xen_caps, dom->xen_caps, XEN_CAPABILITIES_INFO_LEN - 1); + xen_caps[XEN_CAPABILITIES_INFO_LEN - 1] = '\0'; + + for ( item = strtok_r(xen_caps, " ", &ptr); + item != NULL ; item = strtok_r(NULL, " ", &ptr) ) + { + match = !strcmp(dom->guest_type, item); + DOMPRINTF("%s: supported guest type: %s%s", __FUNCTION__, + item, match ? " <= matches" : ""); + if ( match ) + found++; + } + if ( !found ) + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s: guest type %s not supported by xen kernel, sorry", + __FUNCTION__, dom->guest_type); + + return found; +} + +int xc_dom_boot_xen_init(struct xc_dom_image *dom, xc_interface *xch, uint32_t domid) +{ + dom->xch = xch; + dom->guest_domid = domid; + + dom->xen_version = xc_version(xch, XENVER_version, NULL); + if ( xc_version(xch, XENVER_capabilities, &dom->xen_caps) < 0 ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, "can't get xen capabilities"); + return -1; + } + DOMPRINTF("%s: ver %d.%d, caps %s", __FUNCTION__, + dom->xen_version >> 16, dom->xen_version & 0xff, + dom->xen_caps); + return 0; +} + +int xc_dom_boot_mem_init(struct xc_dom_image *dom) +{ + long rc; + + DOMPRINTF_CALLED(dom->xch); + + rc = dom->arch_hooks->meminit(dom); + if ( rc != 0 ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "%s: can't allocate low memory for domain", + __FUNCTION__); + return rc; + } + + return 0; +} + +void *xc_dom_boot_domU_map(struct xc_dom_image *dom, xen_pfn_t pfn, + xen_pfn_t count) +{ + int page_shift = XC_DOM_PAGE_SHIFT(dom); + privcmd_mmap_entry_t *entries; + void *ptr; + int i; + int err; + + entries = xc_dom_malloc(dom, count * sizeof(privcmd_mmap_entry_t)); + if ( entries == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn + " [malloc]", __FUNCTION__, pfn, count); + return NULL; + } + + for ( i = 0; i < count; i++ ) + entries[i].mfn = xc_dom_p2m(dom, pfn + i); + + ptr = xc_map_foreign_ranges(dom->xch, dom->guest_domid, + count << page_shift, PROT_READ | PROT_WRITE, 1 << page_shift, + entries, count); + if ( ptr == NULL ) + { + err = errno; + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn + " [mmap, errno=%i (%s)]", __FUNCTION__, pfn, count, + err, strerror(err)); + return NULL; + } + + return ptr; +} + +int xc_dom_boot_image(struct xc_dom_image *dom) +{ + xc_dominfo_t info; + int rc; + + 
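/*
+     * Boot sequence: arch bootearly hook, sanity checks against the
+     * running hypervisor, initial page tables, start info page,
+     * hypercall page, arch bootlate hook, and finally the initial vcpu
+     * context, which leaves the domain ready to be unpaused.
+     */
+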
DOMPRINTF_CALLED(dom->xch); + + /* misc stuff*/ + if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 ) + return rc; + + /* collect some info */ + rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &info); + if ( rc < 0 ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc); + return rc; + } + if ( rc == 0 || info.domid != dom->guest_domid ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: Huh? No domains found (nr_domains=%d) " + "or domid mismatch (%d != %d)", __FUNCTION__, + rc, info.domid, dom->guest_domid); + return -1; + } + dom->shared_info_mfn = info.shared_info_frame; + + /* sanity checks */ + if ( !xc_dom_compat_check(dom) ) + return -1; + + /* initial mm setup */ + if ( dom->arch_hooks->setup_pgtables && + (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 ) + return rc; + + /* start info page */ + if ( dom->arch_hooks->start_info ) + dom->arch_hooks->start_info(dom); + + /* hypercall page */ + if ( (rc = setup_hypercall_page(dom)) != 0 ) + return rc; + xc_dom_log_memory_footprint(dom); + + /* misc x86 stuff */ + if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 ) + return rc; + + /* let the vm run */ + if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 ) + return rc; + xc_dom_unmap_all(dom); + + return rc; +} + +static xen_pfn_t xc_dom_gnttab_setup(xc_interface *xch, uint32_t domid) +{ + gnttab_setup_table_t setup; + DECLARE_HYPERCALL_BUFFER(xen_pfn_t, gmfnp); + int rc; + xen_pfn_t gmfn; + + gmfnp = xc_hypercall_buffer_alloc(xch, gmfnp, sizeof(*gmfnp)); + if (gmfnp == NULL) + return -1; + + setup.dom = domid; + setup.nr_frames = 1; + set_xen_guest_handle(setup.frame_list, gmfnp); + setup.status = 0; + + rc = xc_gnttab_op(xch, GNTTABOP_setup_table, &setup, sizeof(setup), 1); + gmfn = *gmfnp; + xc_hypercall_buffer_free(xch, gmfnp); + + if ( rc != 0 || setup.status != GNTST_okay ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to setup domU grant table " + "[errno=%d, status=%" PRId16 "]\n", + __FUNCTION__, rc != 0 ? 
errno : 0, setup.status); + return -1; + } + + return gmfn; +} + +static void xc_dom_set_gnttab_entry(xc_interface *xch, + grant_entry_v1_t *gnttab, + unsigned int idx, + uint32_t guest_domid, + uint32_t backend_domid, + xen_pfn_t guest_gfn) +{ + if ( guest_domid == backend_domid || guest_gfn == -1 ) + return; + + xc_dom_printf(xch, "%s: d%d gnt[%u] -> d%d 0x%"PRI_xen_pfn, + __func__, guest_domid, idx, backend_domid, guest_gfn); + + gnttab[idx].flags = GTF_permit_access; + gnttab[idx].domid = backend_domid; + gnttab[idx].frame = guest_gfn; +} + +static int compat_gnttab_seed(xc_interface *xch, uint32_t domid, + xen_pfn_t console_gfn, + xen_pfn_t xenstore_gfn, + uint32_t console_domid, + uint32_t xenstore_domid) +{ + + xen_pfn_t gnttab_gfn; + grant_entry_v1_t *gnttab; + + gnttab_gfn = xc_dom_gnttab_setup(xch, domid); + if ( gnttab_gfn == -1 ) + return -1; + + gnttab = xc_map_foreign_range(xch, + domid, + PAGE_SIZE, + PROT_READ|PROT_WRITE, + gnttab_gfn); + if ( gnttab == NULL ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to map d%d grant table " + "[errno=%d]\n", + __func__, domid, errno); + return -1; + } + + xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_CONSOLE, + domid, console_domid, console_gfn); + xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_XENSTORE, + domid, xenstore_domid, xenstore_gfn); + + if ( munmap(gnttab, PAGE_SIZE) == -1 ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to unmap d%d grant table " + "[errno=%d]\n", + __func__, domid, errno); + return -1; + } + + /* Guest shouldn't really touch its grant table until it has + * enabled its caches. But lets be nice. */ + xc_domain_cacheflush(xch, domid, gnttab_gfn, 1); + + return 0; +} + +static int compat_gnttab_hvm_seed(xc_interface *xch, uint32_t domid, + xen_pfn_t console_gfn, + xen_pfn_t xenstore_gfn, + uint32_t console_domid, + uint32_t xenstore_domid) +{ + int rc; + xen_pfn_t scratch_gfn; + struct xen_add_to_physmap xatp = { + .domid = domid, + .space = XENMAPSPACE_grant_table, + .idx = 0, + }; + struct xen_remove_from_physmap xrfp = { + .domid = domid, + }; + + rc = xc_core_arch_get_scratch_gpfn(xch, domid, &scratch_gfn); + if ( rc < 0 ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to get a scratch gfn from d%d" + "[errno=%d]\n", + __func__, domid, errno); + return -1; + } + xatp.gpfn = scratch_gfn; + xrfp.gpfn = scratch_gfn; + + xc_dom_printf(xch, "%s: d%d: pfn=0x%"PRI_xen_pfn, __func__, + domid, scratch_gfn); + + rc = do_memory_op(xch, XENMEM_add_to_physmap, &xatp, sizeof(xatp)); + if ( rc != 0 ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to add gnttab to d%d physmap " + "[errno=%d]\n", + __func__, domid, errno); + return -1; + } + + rc = compat_gnttab_seed(xch, domid, + console_gfn, xenstore_gfn, + console_domid, xenstore_domid); + if (rc != 0) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to seed gnttab entries for d%d\n", + __func__, domid); + (void) do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, + sizeof(xrfp)); + return -1; + } + + rc = do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, sizeof(xrfp)); + if (rc != 0) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to remove gnttab from d%d physmap " + "[errno=%d]\n", + __func__, domid, errno); + return -1; + } + + return 0; +} + +int xc_dom_gnttab_seed(xc_interface *xch, uint32_t guest_domid, + bool is_hvm, xen_pfn_t console_gfn, + xen_pfn_t xenstore_gfn, uint32_t console_domid, + uint32_t xenstore_domid) +{ + xenforeignmemory_handle* fmem = xch->fmem; + 
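/*
+     * Prefer mapping the grant table via XENMEM_resource_grant_table.
+     * A hypervisor without that resource fails with EOPNOTSUPP, in which
+     * case we fall back to the older GNTTABOP_setup_table based paths
+     * below, including the temporary add/remove-physmap dance required
+     * for HVM guests.
+     */
+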
xenforeignmemory_resource_handle *fres; + void *addr = NULL; + + fres = xenforeignmemory_map_resource( + fmem, guest_domid, XENMEM_resource_grant_table, + XENMEM_resource_grant_table_id_shared, 0, 1, &addr, + PROT_READ | PROT_WRITE, 0); + if ( !fres ) + { + if ( errno == EOPNOTSUPP ) + return is_hvm ? + compat_gnttab_hvm_seed(xch, guest_domid, + console_gfn, xenstore_gfn, + console_domid, xenstore_domid) : + compat_gnttab_seed(xch, guest_domid, + console_gfn, xenstore_gfn, + console_domid, xenstore_domid); + + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: failed to acquire d%d grant table [errno=%d]\n", + __func__, guest_domid, errno); + return -1; + } + + xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_CONSOLE, + guest_domid, console_domid, console_gfn); + xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_XENSTORE, + guest_domid, xenstore_domid, xenstore_gfn); + + xenforeignmemory_unmap_resource(fmem, fres); + + return 0; +} + +int xc_dom_gnttab_init(struct xc_dom_image *dom) +{ + bool is_hvm = xc_dom_translated(dom); + xen_pfn_t console_gfn = xc_dom_p2m(dom, dom->console_pfn); + xen_pfn_t xenstore_gfn = xc_dom_p2m(dom, dom->xenstore_pfn); + + return xc_dom_gnttab_seed(dom->xch, dom->guest_domid, is_hvm, + console_gfn, xenstore_gfn, + dom->console_domid, dom->xenstore_domid); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_bzimageloader.c b/tools/libs/guest/xg_dom_bzimageloader.c new file mode 100644 index 0000000000..f959a77602 --- /dev/null +++ b/tools/libs/guest/xg_dom_bzimageloader.c @@ -0,0 +1,812 @@ +/* + * Xen domain builder -- bzImage bits + * + * Parse and load bzImage kernel images. + * + * This relies on version 2.08 of the boot protocol, which contains an + * ELF file embedded in the bzImage. The loader extracts this ELF + * image and passes it off to the standard ELF loader. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * written 2006 by Gerd Hoffmann . + * written 2007 by Jeremy Fitzhardinge + * written 2008 by Ian Campbell + * written 2009 by Chris Lalancette + * + */ + +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress.h" + +#include + +#ifndef __MINIOS__ + +#if defined(HAVE_BZLIB) + +#include + +static int xc_try_bzip2_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + bz_stream stream; + int ret; + char *out_buf; + char *tmp_buf; + int retval = -1; + unsigned int outsize; + uint64_t total; + + stream.bzalloc = NULL; + stream.bzfree = NULL; + stream.opaque = NULL; + + if ( dom->kernel_size == 0) + { + DOMPRINTF("BZIP2: Input is 0 size"); + return -1; + } + + ret = BZ2_bzDecompressInit(&stream, 0, 0); + if ( ret != BZ_OK ) + { + DOMPRINTF("BZIP2: Error initting stream"); + return -1; + } + + /* sigh. We don't know up-front how much memory we are going to need + * for the output buffer. 
Allocate the output buffer to be equal + * the input buffer to start, and we'll realloc as needed. + */ + outsize = dom->kernel_size; + + /* + * stream.avail_in and outsize are unsigned int, while kernel_size + * is a size_t. Check we aren't overflowing. + */ + if ( outsize != dom->kernel_size ) + { + DOMPRINTF("BZIP2: Input too large"); + goto bzip2_cleanup; + } + + out_buf = malloc(outsize); + if ( out_buf == NULL ) + { + DOMPRINTF("BZIP2: Failed to alloc memory"); + goto bzip2_cleanup; + } + + stream.next_in = dom->kernel_blob; + stream.avail_in = dom->kernel_size; + + stream.next_out = out_buf; + stream.avail_out = dom->kernel_size; + + for ( ; ; ) + { + ret = BZ2_bzDecompress(&stream); + if ( ret == BZ_STREAM_END ) + { + DOMPRINTF("BZIP2: Saw data stream end"); + retval = 0; + break; + } + if ( ret != BZ_OK ) + { + DOMPRINTF("BZIP2: error %d", ret); + free(out_buf); + goto bzip2_cleanup; + } + + if ( stream.avail_out == 0 ) + { + /* Protect against output buffer overflow */ + if ( outsize > UINT_MAX / 2 ) + { + DOMPRINTF("BZIP2: output buffer overflow"); + free(out_buf); + goto bzip2_cleanup; + } + + if ( xc_dom_kernel_check_size(dom, outsize * 2) ) + { + DOMPRINTF("BZIP2: output too large"); + free(out_buf); + goto bzip2_cleanup; + } + + tmp_buf = realloc(out_buf, outsize * 2); + if ( tmp_buf == NULL ) + { + DOMPRINTF("BZIP2: Failed to realloc memory"); + free(out_buf); + goto bzip2_cleanup; + } + out_buf = tmp_buf; + + stream.next_out = out_buf + outsize; + stream.avail_out = (outsize * 2) - outsize; + outsize *= 2; + } + else if ( stream.avail_in == 0 ) + { + /* + * If there is output buffer available then this indicates + * that BZ2_bzDecompress would like more input data to be + * provided. However our complete input buffer is in + * memory and provided upfront so if avail_in is zero this + * actually indicates a truncated input. + */ + DOMPRINTF("BZIP2: not enough input"); + free(out_buf); + goto bzip2_cleanup; + } + } + + total = (((uint64_t)stream.total_out_hi32) << 32) | stream.total_out_lo32; + + if ( xc_dom_register_external(dom, out_buf, total) ) + { + DOMPRINTF("BZIP2: Error registering stream output"); + free(out_buf); + goto bzip2_cleanup; + } + + DOMPRINTF("%s: BZIP2 decompress OK, 0x%zx -> 0x%lx", + __FUNCTION__, *size, (long unsigned int) total); + + *blob = out_buf; + *size = total; + + bzip2_cleanup: + BZ2_bzDecompressEnd(&stream); + + return retval; +} + +#else /* !defined(HAVE_BZLIB) */ + +static int xc_try_bzip2_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: BZIP2 decompress support unavailable", + __FUNCTION__); + return -1; +} + +#endif + +#if defined(HAVE_LZMA) + +#include + +static int _xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size, + lzma_stream *stream, const char *what) +{ + lzma_ret ret; + lzma_action action = LZMA_RUN; + unsigned char *out_buf; + unsigned char *tmp_buf; + int retval = -1; + size_t outsize; + const char *msg; + + if ( dom->kernel_size == 0) + { + DOMPRINTF("%s: Input is 0 size", what); + return -1; + } + + /* sigh. We don't know up-front how much memory we are going to need + * for the output buffer. Allocate the output buffer to be equal + * the input buffer to start, and we'll realloc as needed. 
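+     * Each time the output buffer fills it is doubled, and
+     * xc_dom_kernel_check_size() bounds the total, so a corrupt stream
+     * cannot grow it without limit.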
+ */ + outsize = dom->kernel_size; + out_buf = malloc(outsize); + if ( out_buf == NULL ) + { + DOMPRINTF("%s: Failed to alloc memory", what); + goto lzma_cleanup; + } + + stream->next_in = dom->kernel_blob; + stream->avail_in = dom->kernel_size; + + stream->next_out = out_buf; + stream->avail_out = dom->kernel_size; + + for ( ; ; ) + { + ret = lzma_code(stream, action); + if ( ret == LZMA_STREAM_END ) + { + DOMPRINTF("%s: Saw data stream end", what); + retval = 0; + break; + } + if ( ret != LZMA_OK ) + { + switch ( ret ) + { + case LZMA_MEM_ERROR: + msg = strerror(ENOMEM); + break; + + case LZMA_MEMLIMIT_ERROR: + msg = "Memory usage limit reached"; + break; + + case LZMA_FORMAT_ERROR: + msg = "File format not recognized"; + break; + + case LZMA_OPTIONS_ERROR: + // FIXME: Better message? + msg = "Unsupported compression options"; + break; + + case LZMA_DATA_ERROR: + msg = "File is corrupt"; + break; + + case LZMA_BUF_ERROR: + msg = "Unexpected end of input"; + break; + + default: + msg = "Internal program error (bug)"; + break; + } + DOMPRINTF("%s: %s decompression error: %s", + __FUNCTION__, what, msg); + free(out_buf); + goto lzma_cleanup; + } + + if ( stream->avail_out == 0 ) + { + /* Protect against output buffer overflow */ + if ( outsize > SIZE_MAX / 2 ) + { + DOMPRINTF("%s: output buffer overflow", what); + free(out_buf); + goto lzma_cleanup; + } + + if ( xc_dom_kernel_check_size(dom, outsize * 2) ) + { + DOMPRINTF("%s: output too large", what); + free(out_buf); + goto lzma_cleanup; + } + + tmp_buf = realloc(out_buf, outsize * 2); + if ( tmp_buf == NULL ) + { + DOMPRINTF("%s: Failed to realloc memory", what); + free(out_buf); + goto lzma_cleanup; + } + out_buf = tmp_buf; + + stream->next_out = out_buf + outsize; + stream->avail_out = (outsize * 2) - outsize; + outsize *= 2; + } + } + + if ( xc_dom_register_external(dom, out_buf, stream->total_out) ) + { + DOMPRINTF("%s: Error registering stream output", what); + free(out_buf); + goto lzma_cleanup; + } + + DOMPRINTF("%s: %s decompress OK, 0x%zx -> 0x%zx", + __FUNCTION__, what, *size, (size_t)stream->total_out); + + *blob = out_buf; + *size = stream->total_out; + + lzma_cleanup: + lzma_end(stream); + + return retval; +} + +/* 128 Mb is the minimum size (half-way) documented to work for all inputs. 
*/ +#define LZMA_BLOCK_SIZE (128*1024*1024) + +static int xc_try_xz_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + lzma_stream stream = LZMA_STREAM_INIT; + + if ( lzma_stream_decoder(&stream, LZMA_BLOCK_SIZE, 0) != LZMA_OK ) + { + DOMPRINTF("XZ: Failed to init decoder"); + return -1; + } + + return _xc_try_lzma_decode(dom, blob, size, &stream, "XZ"); +} + +static int xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + lzma_stream stream = LZMA_STREAM_INIT; + + if ( lzma_alone_decoder(&stream, LZMA_BLOCK_SIZE) != LZMA_OK ) + { + DOMPRINTF("LZMA: Failed to init decoder"); + return -1; + } + + return _xc_try_lzma_decode(dom, blob, size, &stream, "LZMA"); +} + +#else /* !defined(HAVE_LZMA) */ + +static int xc_try_xz_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: XZ decompress support unavailable", + __FUNCTION__); + return -1; +} + +static int xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: LZMA decompress support unavailable", + __FUNCTION__); + return -1; +} + +#endif + +#if defined(HAVE_LZO1X) + +#include + +#define LZOP_HEADER_HAS_FILTER 0x00000800 +#define LZOP_MAX_BLOCK_SIZE (64*1024*1024) + +static inline uint_fast16_t lzo_read_16(const unsigned char *buf) +{ + return buf[1] | (buf[0] << 8); +} + +static inline uint_fast32_t lzo_read_32(const unsigned char *buf) +{ + return lzo_read_16(buf + 2) | ((uint32_t)lzo_read_16(buf) << 16); +} + +static int xc_try_lzo1x_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + int ret; + const unsigned char *cur = dom->kernel_blob; + unsigned char *out_buf = NULL; + size_t left = dom->kernel_size; + const char *msg; + unsigned version; + static const unsigned char magic[] = { + 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a + }; + + /* + * lzo_uint should match size_t. Check that this is the case to be + * sure we won't overflow various lzo_uint fields. 
+ */ + BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t)); + + ret = lzo_init(); + if ( ret != LZO_E_OK ) + { + DOMPRINTF("LZO1x: Failed to init library (%d)\n", ret); + return -1; + } + + if ( left < 16 || memcmp(cur, magic, 9) ) + { + DOMPRINTF("LZO1x: Unrecognized magic\n"); + return -1; + } + + /* get version (2bytes), skip library version (2), + * 'need to be extracted' version (2) and method (1) */ + version = lzo_read_16(cur + 9); + cur += 16; + left -= 16; + + if ( version >= 0x0940 ) + { + /* skip level */ + ++cur; + if ( left ) + --left; + } + + if ( left >= 4 && (lzo_read_32(cur) & LZOP_HEADER_HAS_FILTER) ) + ret = 8; /* flags + filter info */ + else + ret = 4; /* flags */ + + /* skip mode and mtime_low */ + ret += 8; + if ( version >= 0x0940 ) + ret += 4; /* skip mtime_high */ + + /* don't care about the file name, and skip checksum */ + if ( left > ret ) + ret += 1 + cur[ret] + 4; + + if ( left < ret ) + { + DOMPRINTF("LZO1x: Incomplete header\n"); + return -1; + } + cur += ret; + left -= ret; + + for ( *size = 0; ; ) + { + lzo_uint src_len, dst_len, out_len; + unsigned char *tmp_buf; + + msg = "Short input"; + if ( left < 4 ) + break; + + dst_len = lzo_read_32(cur); + if ( !dst_len ) + { + msg = "Error registering stream output"; + if ( xc_dom_register_external(dom, out_buf, *size) ) + break; + + return 0; + } + + if ( dst_len > LZOP_MAX_BLOCK_SIZE ) + { + msg = "Block size too large"; + break; + } + + if ( left < 12 ) + break; + + src_len = lzo_read_32(cur + 4); + cur += 12; /* also skip block checksum info */ + left -= 12; + + msg = "Bad source length"; + if ( src_len <= 0 || src_len > dst_len || src_len > left ) + break; + + msg = "Output buffer overflow"; + if ( *size > SIZE_MAX - dst_len ) + break; + + msg = "Decompressed image too large"; + if ( xc_dom_kernel_check_size(dom, *size + dst_len) ) + break; + + msg = "Failed to (re)alloc memory"; + tmp_buf = realloc(out_buf, *size + dst_len); + if ( tmp_buf == NULL ) + break; + + out_buf = tmp_buf; + out_len = dst_len; + + ret = lzo1x_decompress_safe(cur, src_len, + out_buf + *size, &out_len, NULL); + switch ( ret ) + { + case LZO_E_OK: + msg = "Input underrun"; + if ( out_len != dst_len ) + break; + + *blob = out_buf; + *size += out_len; + cur += src_len; + left -= src_len; + continue; + + case LZO_E_INPUT_NOT_CONSUMED: + msg = "Unconsumed input"; + break; + + case LZO_E_OUTPUT_OVERRUN: + msg = "Output overrun"; + break; + + case LZO_E_INPUT_OVERRUN: + msg = "Input overrun"; + break; + + case LZO_E_LOOKBEHIND_OVERRUN: + msg = "Look-behind overrun"; + break; + + case LZO_E_EOF_NOT_FOUND: + msg = "No EOF marker"; + break; + + case LZO_E_ERROR: + msg = "General error"; + break; + + default: + msg = "Internal program error (bug)"; + break; + } + + break; + } + + free(out_buf); + DOMPRINTF("LZO1x decompression error: %s\n", msg); + + return -1; +} + +#else /* !defined(HAVE_LZO1X) */ + +static int xc_try_lzo1x_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: LZO1x decompress support unavailable\n", + __FUNCTION__); + return -1; +} + +#endif + +#else /* __MINIOS__ */ + +int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size); +int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size); +int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size); +int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size); + +#endif /* !__MINIOS__ */ + +struct setup_header { + uint8_t _pad0[0x1f1]; /* skip 
uninteresting stuff */ + uint8_t setup_sects; + uint16_t root_flags; + uint32_t syssize; + uint16_t ram_size; + uint16_t vid_mode; + uint16_t root_dev; + uint16_t boot_flag; + uint16_t jump; + uint32_t header; +#define HDR_MAGIC "HdrS" +#define HDR_MAGIC_SZ 4 + uint16_t version; +#define VERSION(h,l) (((h)<<8) | (l)) + uint32_t realmode_swtch; + uint16_t start_sys; + uint16_t kernel_version; + uint8_t type_of_loader; + uint8_t loadflags; + uint16_t setup_move_size; + uint32_t code32_start; + uint32_t ramdisk_image; + uint32_t ramdisk_size; + uint32_t bootsect_kludge; + uint16_t heap_end_ptr; + uint16_t _pad1; + uint32_t cmd_line_ptr; + uint32_t initrd_addr_max; + uint32_t kernel_alignment; + uint8_t relocatable_kernel; + uint8_t _pad2[3]; + uint32_t cmdline_size; + uint32_t hardware_subarch; + uint64_t hardware_subarch_data; + uint32_t payload_offset; + uint32_t payload_length; +} __attribute__((packed)); + +extern struct xc_dom_loader elf_loader; + +static int check_magic(struct xc_dom_image *dom, const void *magic, size_t len) +{ + if (len > dom->kernel_size) + return 0; + + return (memcmp(dom->kernel_blob, magic, len) == 0); +} + +static int xc_dom_probe_bzimage_kernel(struct xc_dom_image *dom) +{ + struct setup_header *hdr; + uint64_t payload_offset, payload_length; + int ret; + + if ( dom->kernel_blob == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: no kernel image loaded", __FUNCTION__); + return -EINVAL; + } + + if ( dom->kernel_size < sizeof(struct setup_header) ) + { + xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); + return -EINVAL; + } + + hdr = dom->kernel_blob; + + if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 ) + { + xc_dom_printf(dom->xch, "%s: kernel is not a bzImage", __FUNCTION__); + return -EINVAL; + } + + if ( hdr->version < VERSION(2,8) ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: boot protocol" + " too old (%04x)", __FUNCTION__, hdr->version); + return -EINVAL; + } + + + /* upcast to 64 bits to avoid overflow */ + /* setup_sects is u8 and so cannot overflow */ + payload_offset = (hdr->setup_sects + 1) * 512; + payload_offset += hdr->payload_offset; + payload_length = hdr->payload_length; + + if ( payload_offset >= dom->kernel_size ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload offset overflow", + __FUNCTION__); + return -EINVAL; + } + if ( (payload_offset + payload_length) > dom->kernel_size ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload length overflow", + __FUNCTION__); + return -EINVAL; + } + + dom->kernel_blob = dom->kernel_blob + payload_offset; + dom->kernel_size = payload_length; + + if ( check_magic(dom, "\037\213", 2) ) + { + ret = xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); + if ( ret == -1 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: unable to" + " gzip decompress kernel", __FUNCTION__); + return -EINVAL; + } + } + else if ( check_magic(dom, "\102\132\150", 3) ) + { + ret = xc_try_bzip2_decode(dom, &dom->kernel_blob, &dom->kernel_size); + if ( ret < 0 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s unable to BZIP2 decompress kernel", + __FUNCTION__); + return -EINVAL; + } + } + else if ( check_magic(dom, "\3757zXZ", 6) ) + { + ret = xc_try_xz_decode(dom, &dom->kernel_blob, &dom->kernel_size); + if ( ret < 0 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s unable to XZ decompress kernel", + __FUNCTION__); + return -EINVAL; + } + } + else if ( check_magic(dom, "\135\000", 2) ) + { + ret = xc_try_lzma_decode(dom, 
&dom->kernel_blob, &dom->kernel_size);
+        if ( ret < 0 )
+        {
+            xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                         "%s unable to LZMA decompress kernel",
+                         __FUNCTION__);
+            return -EINVAL;
+        }
+    }
+    else if ( check_magic(dom, "\x89LZO", 5) )
+    {
+        ret = xc_try_lzo1x_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+        if ( ret < 0 )
+        {
+            xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                         "%s unable to LZO decompress kernel\n",
+                         __FUNCTION__);
+            return -EINVAL;
+        }
+    }
+    else if ( check_magic(dom, "\x02\x21", 2) )
+    {
+        ret = xc_try_lz4_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+        if ( ret < 0 )
+        {
+            xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                         "%s unable to LZ4 decompress kernel\n",
+                         __FUNCTION__);
+            return -EINVAL;
+        }
+    }
+    else
+    {
+        xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                     "%s: unknown compression format", __FUNCTION__);
+        return -EINVAL;
+    }
+
+    return elf_loader.probe(dom);
+}
+
+static int xc_dom_parse_bzimage_kernel(struct xc_dom_image *dom)
+{
+    return elf_loader.parser(dom);
+}
+
+static int xc_dom_load_bzimage_kernel(struct xc_dom_image *dom)
+{
+    return elf_loader.loader(dom);
+}
+
+static struct xc_dom_loader bzimage_loader = {
+    .name = "Linux bzImage",
+    .probe = xc_dom_probe_bzimage_kernel,
+    .parser = xc_dom_parse_bzimage_kernel,
+    .loader = xc_dom_load_bzimage_kernel,
+};
+
+static void __init register_loader(void)
+{
+    xc_dom_register_loader(&bzimage_loader);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_dom_compat_linux.c b/tools/libs/guest/xg_dom_compat_linux.c
new file mode 100644
index 0000000000..b645f0b14b
--- /dev/null
+++ b/tools/libs/guest/xg_dom_compat_linux.c
@@ -0,0 +1,97 @@
+/*
+ * Xen domain builder -- compatibility code.
+ *
+ * Replacements for xc_linux_build & friends,
+ * as example code and to make the new builder
+ * usable as drop-in replacement.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann .
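+ *
+ * An illustrative usage sketch with made-up values (not part of the
+ * original source): building a 256 MB PV guest from on-disk images for
+ * an already-created domain:
+ *
+ *   unsigned long store_mfn, console_mfn;
+ *   int rc = xc_linux_build(xch, domid, 256,
+ *                           "/boot/vmlinuz-guest", "/boot/initrd-guest",
+ *                           "root=/dev/xvda1", NULL, 0,
+ *                           store_evtchn, &store_mfn,
+ *                           console_evtchn, &console_mfn);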
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <zlib.h>
+
+#include "xenctrl.h"
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+
+/* ------------------------------------------------------------------------ */
+
+int xc_linux_build(xc_interface *xch, uint32_t domid,
+                   unsigned int mem_mb,
+                   const char *image_name,
+                   const char *initrd_name,
+                   const char *cmdline,
+                   const char *features,
+                   unsigned long flags,
+                   unsigned int store_evtchn,
+                   unsigned long *store_mfn,
+                   unsigned int console_evtchn,
+                   unsigned long *console_mfn)
+{
+    struct xc_dom_image *dom;
+    int rc;
+
+    xc_dom_loginit(xch);
+    dom = xc_dom_allocate(xch, cmdline, features);
+    if (dom == NULL)
+        return -1;
+    if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
+        goto out;
+    if ( initrd_name && strlen(initrd_name) &&
+         ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) )
+        goto out;
+
+    dom->flags |= flags;
+    dom->console_evtchn = console_evtchn;
+    dom->xenstore_evtchn = store_evtchn;
+
+    if ( (rc = xc_dom_boot_xen_init(dom, xch, domid)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_parse_image(dom)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_mem_init(dom, mem_mb)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_boot_mem_init(dom)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_build_image(dom)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_boot_image(dom)) != 0 )
+        goto out;
+    if ( (rc = xc_dom_gnttab_init(dom)) != 0)
+        goto out;
+
+    *console_mfn = xc_dom_p2m(dom, dom->console_pfn);
+    *store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
+
+ out:
+    xc_dom_release(dom);
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_dom_core.c b/tools/libs/guest/xg_dom_core.c
new file mode 100644
index 0000000000..1c91cce315
--- /dev/null
+++ b/tools/libs/guest/xg_dom_core.c
@@ -0,0 +1,1272 @@
+/*
+ * Xen domain builder -- core bits.
+ *
+ * The core code goes here:
+ * - allocate and release domain structs.
+ * - memory management functions.
+ * - misc helper functions.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann .
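+ *
+ * An illustrative sketch, not part of the original file: every buffer
+ * handed out by xc_dom_malloc() and friends below is linked into
+ * dom->memblocks, so a single xc_dom_release(dom) tears down the whole
+ * builder state, e.g.:
+ *
+ *   struct xc_dom_image *dom = xc_dom_allocate(xch, "console=hvc0", NULL);
+ *   void *scratch = xc_dom_malloc(dom, 4096);  (tracked in memblocks)
+ *   xc_dom_release(dom);                       (also frees scratch)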
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <zlib.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "_paths.h"
+
+/* ------------------------------------------------------------------------ */
+/* debugging */
+
+
+
+static const char *default_logfile = XEN_LOG_DIR "/domain-builder-ng.log";
+
+int xc_dom_loginit(xc_interface *xch) {
+    if (xch->dombuild_logger) return 0;
+
+    if (!xch->dombuild_logger_file) {
+        xch->dombuild_logger_file = fopen(default_logfile, "a");
+        if (!xch->dombuild_logger_file) {
+            PERROR("Could not open logfile `%s'", default_logfile);
+            return -1;
+        }
+    }
+
+    xch->dombuild_logger = xch->dombuild_logger_tofree =
+        (xentoollog_logger*)
+        xtl_createlogger_stdiostream(xch->dombuild_logger_file, XTL_DETAIL,
+             XTL_STDIOSTREAM_SHOW_DATE|XTL_STDIOSTREAM_SHOW_PID);
+    if (!xch->dombuild_logger)
+        return -1;
+
+    xc_dom_printf(xch, "### ----- xc domain builder logfile opened -----");
+
+    return 0;
+}
+
+void xc_dom_printf(xc_interface *xch, const char *fmt, ...)
+{
+    va_list args;
+    if (!xch->dombuild_logger) return;
+    va_start(args, fmt);
+    xtl_logv(xch->dombuild_logger, XTL_DETAIL, -1, "domainbuilder", fmt, args);
+    va_end(args);
+}
+
+void xc_dom_panic_func(xc_interface *xch,
+                       const char *file, int line, xc_error_code err,
+                       const char *fmt, ...)
+{
+    va_list args;
+    char msg[XC_MAX_ERROR_MSG_LEN];
+
+    va_start(args, fmt);
+    vsnprintf(msg, sizeof(msg), fmt, args);
+    va_end(args);
+    msg[sizeof(msg)-1] = 0;
+
+    xc_report(xch,
+              xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler,
+              XTL_ERROR, err, "panic: %s:%d: %s",
+              file, line, msg);
+}
+
+static void print_mem(struct xc_dom_image *dom, const char *name, size_t mem)
+{
+    if ( mem > (32 * 1024 * 1024) )
+        DOMPRINTF("%-24s : %zd MB", name, mem / (1024 * 1024));
+    else if ( mem > (32 * 1024) )
+        DOMPRINTF("%-24s : %zd kB", name, mem / 1024);
+    else
+        DOMPRINTF("%-24s : %zd bytes", name, mem);
+}
+
+void xc_dom_log_memory_footprint(struct xc_dom_image *dom)
+{
+    DOMPRINTF("domain builder memory footprint");
+    DOMPRINTF("   allocated");
+    print_mem(dom, "      malloc", dom->alloc_malloc);
+    print_mem(dom, "      anon mmap", dom->alloc_mem_map);
+    DOMPRINTF("   mapped");
+    print_mem(dom, "      file mmap", dom->alloc_file_map);
+    print_mem(dom, "      domU mmap", dom->alloc_domU_map);
+}
+
+/* ------------------------------------------------------------------------ */
+/* simple memory pool */
+
+void *xc_dom_malloc(struct xc_dom_image *dom, size_t size)
+{
+    struct xc_dom_mem *block;
+
+    if ( size > SIZE_MAX - sizeof(*block) )
+    {
+        DOMPRINTF("%s: unreasonable allocation size", __FUNCTION__);
+        return NULL;
+    }
+    block = malloc(sizeof(*block) + size);
+    if ( block == NULL )
+    {
+        DOMPRINTF("%s: allocation failed", __FUNCTION__);
+        return NULL;
+    }
+    memset(block, 0, sizeof(*block) + size);
+    block->type = XC_DOM_MEM_TYPE_MALLOC_INTERNAL;
+    block->next = dom->memblocks;
+    dom->memblocks = block;
+    dom->alloc_malloc += sizeof(*block) + size;
+    if ( size > (100 * 1024) )
+        print_mem(dom, __FUNCTION__, size);
+    return block->memory;
+}
+
+void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size)
+{
+    struct xc_dom_mem *block;
+
+    block = malloc(sizeof(*block));
+    if ( block == NULL )
+    {
+        DOMPRINTF("%s: allocation failed", __FUNCTION__);
+        return NULL;
+    }
+    memset(block, 0, sizeof(*block));
+    block->len = size;
+    block->ptr = mmap(NULL, block->len,
+                      PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+                      -1, 0);
+    if ( block->ptr == MAP_FAILED )
+    {
+        DOMPRINTF("%s: mmap failed",
__FUNCTION__); + free(block); + return NULL; + } + block->type = XC_DOM_MEM_TYPE_MMAP; + block->next = dom->memblocks; + dom->memblocks = block; + dom->alloc_malloc += sizeof(*block); + dom->alloc_mem_map += block->len; + if ( size > (100 * 1024) ) + print_mem(dom, __FUNCTION__, size); + return block->ptr; +} + +int xc_dom_register_external(struct xc_dom_image *dom, void *ptr, size_t size) +{ + struct xc_dom_mem *block; + + block = malloc(sizeof(*block)); + if ( block == NULL ) + { + DOMPRINTF("%s: allocation failed", __FUNCTION__); + return -1; + } + memset(block, 0, sizeof(*block)); + block->ptr = ptr; + block->len = size; + block->type = XC_DOM_MEM_TYPE_MALLOC_EXTERNAL; + block->next = dom->memblocks; + dom->memblocks = block; + dom->alloc_malloc += sizeof(*block); + dom->alloc_mem_map += block->len; + return 0; +} + +void *xc_dom_malloc_filemap(struct xc_dom_image *dom, + const char *filename, size_t * size, + const size_t max_size) +{ + struct xc_dom_mem *block = NULL; + int fd = -1; + off_t offset; + + fd = open(filename, O_RDONLY); + if ( fd == -1 ) { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "failed to open file '%s': %s", + filename, strerror(errno)); + goto err; + } + + if ( (lseek(fd, 0, SEEK_SET) == -1) || + ((offset = lseek(fd, 0, SEEK_END)) == -1) ) { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "failed to seek on file '%s': %s", + filename, strerror(errno)); + goto err; + } + + *size = offset; + + if ( max_size && *size > max_size ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "tried to map file which is too large"); + goto err; + } + + if ( !*size ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "'%s': zero length file", filename); + goto err; + } + + block = malloc(sizeof(*block)); + if ( block == NULL ) { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "failed to allocate block (%zu bytes)", + sizeof(*block)); + goto err; + } + + memset(block, 0, sizeof(*block)); + block->len = *size; + block->ptr = mmap(NULL, block->len, PROT_READ, + MAP_SHARED, fd, 0); + if ( block->ptr == MAP_FAILED ) { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "failed to mmap file '%s': %s", + filename, strerror(errno)); + goto err; + } + + block->type = XC_DOM_MEM_TYPE_MMAP; + block->next = dom->memblocks; + dom->memblocks = block; + dom->alloc_malloc += sizeof(*block); + dom->alloc_file_map += block->len; + close(fd); + if ( *size > (100 * 1024) ) + print_mem(dom, __FUNCTION__, *size); + return block->ptr; + + err: + if ( fd != -1 ) + close(fd); + free(block); + DOMPRINTF("%s: failed (on file `%s')", __FUNCTION__, filename); + return NULL; +} + +static void xc_dom_free_all(struct xc_dom_image *dom) +{ + struct xc_dom_mem *block; + + while ( (block = dom->memblocks) != NULL ) + { + dom->memblocks = block->next; + switch ( block->type ) + { + case XC_DOM_MEM_TYPE_MALLOC_INTERNAL: + break; + case XC_DOM_MEM_TYPE_MALLOC_EXTERNAL: + free(block->ptr); + break; + case XC_DOM_MEM_TYPE_MMAP: + munmap(block->ptr, block->len); + break; + } + free(block); + } +} + +char *xc_dom_strdup(struct xc_dom_image *dom, const char *str) +{ + size_t len = strlen(str) + 1; + char *nstr = xc_dom_malloc(dom, len); + + if ( nstr == NULL ) + return NULL; + memcpy(nstr, str, len); + return nstr; +} + +/* ------------------------------------------------------------------------ */ +/* decompression buffer sizing */ +int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz) +{ + /* No limit */ + if ( !dom->max_kernel_size ) + return 0; + + if ( sz > dom->max_kernel_size ) + { + xc_dom_panic(dom->xch, 
XC_INVALID_KERNEL, + "kernel image too large"); + return 1; + } + + return 0; +} + +/* ------------------------------------------------------------------------ */ +/* read files, copy memory blocks, with transparent gunzip */ + +size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen) +{ + unsigned char *gzlen; + size_t unziplen; + + if ( ziplen < 6 ) + /* Too small. We need (i.e. the subsequent code relies on) + * 2 bytes for the magic number plus 4 bytes length. */ + return 0; + + if ( strncmp(blob, "\037\213", 2) ) + /* not gzipped */ + return 0; + + gzlen = blob + ziplen - 4; + unziplen = (size_t)gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0]; + if ( unziplen > XC_DOM_DECOMPRESS_MAX ) + { + xc_dom_printf + (xch, + "%s: size (zip %zd, unzip %zd) looks insane, skip gunzip", + __FUNCTION__, ziplen, unziplen); + return 0; + } + + return unziplen + 16; +} + +int xc_dom_do_gunzip(xc_interface *xch, + void *src, size_t srclen, void *dst, size_t dstlen) +{ + z_stream zStream; + int rc; + + memset(&zStream, 0, sizeof(zStream)); + zStream.next_in = src; + zStream.avail_in = srclen; + zStream.next_out = dst; + zStream.avail_out = dstlen; + rc = inflateInit2(&zStream, (MAX_WBITS + 32)); /* +32 means "handle gzip" */ + if ( rc != Z_OK ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: inflateInit2 failed (rc=%d)", __FUNCTION__, rc); + return -1; + } + rc = inflate(&zStream, Z_FINISH); + inflateEnd(&zStream); + if ( rc != Z_STREAM_END ) + { + xc_dom_panic(xch, XC_INTERNAL_ERROR, + "%s: inflate failed (rc=%d)", __FUNCTION__, rc); + return -1; + } + + xc_dom_printf(xch, "%s: unzip ok, 0x%zx -> 0x%zx", + __FUNCTION__, srclen, dstlen); + return 0; +} + +int xc_dom_try_gunzip(struct xc_dom_image *dom, void **blob, size_t * size) +{ + void *unzip; + size_t unziplen; + + unziplen = xc_dom_check_gzip(dom->xch, *blob, *size); + if ( unziplen == 0 ) + return 0; + + if ( xc_dom_kernel_check_size(dom, unziplen) ) + return 0; + + unzip = xc_dom_malloc(dom, unziplen); + if ( unzip == NULL ) + return -1; + + if ( xc_dom_do_gunzip(dom->xch, *blob, *size, unzip, unziplen) == -1 ) + return -1; + + *blob = unzip; + *size = unziplen; + return 0; +} + +/* ------------------------------------------------------------------------ */ +/* domain memory */ + +void *xc_dom_pfn_to_ptr(struct xc_dom_image *dom, xen_pfn_t pfn, + xen_pfn_t count) +{ + xen_pfn_t count_out_dummy; + return xc_dom_pfn_to_ptr_retcount(dom, pfn, count, &count_out_dummy); +} + +void *xc_dom_pfn_to_ptr_retcount(struct xc_dom_image *dom, xen_pfn_t pfn, + xen_pfn_t count, xen_pfn_t *count_out) +{ + struct xc_dom_phys *phys; + xen_pfn_t offset; + unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom); + char *mode = "unset"; + + *count_out = 0; + + offset = pfn - dom->rambase_pfn; + if ( offset > dom->total_pages || /* multiple checks to avoid overflows */ + count > dom->total_pages || + offset > dom->total_pages - count ) + { + DOMPRINTF("%s: pfn %"PRI_xen_pfn" out of range (0x%" PRIpfn " > 0x%" PRIpfn ")", + __FUNCTION__, pfn, offset, dom->total_pages); + return NULL; + } + + /* already allocated? 
*/ + for ( phys = dom->phys_pages; phys != NULL; phys = phys->next ) + { + if ( pfn >= (phys->first + phys->count) ) + continue; + if ( count ) + { + /* size given: must be completely within the already allocated block */ + if ( (pfn + count) <= phys->first ) + continue; + if ( (pfn < phys->first) || + ((pfn + count) > (phys->first + phys->count)) ) + { + DOMPRINTF("%s: request overlaps allocated block" + " (req 0x%" PRIpfn "+0x%" PRIpfn "," + " blk 0x%" PRIpfn "+0x%" PRIpfn ")", + __FUNCTION__, pfn, count, phys->first, + phys->count); + return NULL; + } + *count_out = count; + } + else + { + /* no size given: block must be allocated already, + just hand out a pointer to it */ + if ( pfn < phys->first ) + continue; + if ( pfn >= phys->first + phys->count ) + continue; + *count_out = phys->count - (pfn - phys->first); + } + return phys->ptr + ((pfn - phys->first) << page_shift); + } + + /* allocating is allowed with size specified only */ + if ( count == 0 ) + { + DOMPRINTF("%s: no block found, no size given," + " can't malloc (pfn 0x%" PRIpfn ")", + __FUNCTION__, pfn); + return NULL; + } + + /* not found, no overlap => allocate */ + phys = xc_dom_malloc(dom, sizeof(*phys)); + if ( phys == NULL ) + return NULL; + memset(phys, 0, sizeof(*phys)); + phys->first = pfn; + phys->count = count; + + if ( dom->guest_domid ) + { + mode = "domU mapping"; + phys->ptr = xc_dom_boot_domU_map(dom, phys->first, phys->count); + if ( phys->ptr == NULL ) + return NULL; + dom->alloc_domU_map += phys->count << page_shift; + } + else + { + int err; + + mode = "anonymous memory"; + phys->ptr = mmap(NULL, phys->count << page_shift, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, + -1, 0); + if ( phys->ptr == MAP_FAILED ) + { + err = errno; + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "%s: oom: can't allocate 0x%" PRIpfn " pages" + " [mmap, errno=%i (%s)]", + __FUNCTION__, count, err, strerror(err)); + return NULL; + } + dom->alloc_mem_map += phys->count << page_shift; + } + +#if 1 + DOMPRINTF("%s: %s: pfn 0x%" PRIpfn "+0x%" PRIpfn " at %p", + __FUNCTION__, mode, phys->first, phys->count, phys->ptr); +#endif + phys->next = dom->phys_pages; + dom->phys_pages = phys; + return phys->ptr; +} + +static int xc_dom_chk_alloc_pages(struct xc_dom_image *dom, char *name, + xen_pfn_t pages) +{ + unsigned int page_size = XC_DOM_PAGE_SIZE(dom); + + if ( pages > dom->total_pages || /* multiple test avoids overflow probs */ + dom->pfn_alloc_end - dom->rambase_pfn > dom->total_pages || + pages > dom->total_pages - dom->pfn_alloc_end + dom->rambase_pfn ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "%s: segment %s too large (0x%"PRIpfn" > " + "0x%"PRIpfn" - 0x%"PRIpfn" pages)", __FUNCTION__, name, + pages, dom->total_pages, + dom->pfn_alloc_end - dom->rambase_pfn); + return -1; + } + + dom->pfn_alloc_end += pages; + dom->virt_alloc_end += pages * page_size; + + if ( dom->allocate ) + dom->allocate(dom); + + return 0; +} + +static int xc_dom_alloc_pad(struct xc_dom_image *dom, xen_vaddr_t boundary) +{ + unsigned int page_size = XC_DOM_PAGE_SIZE(dom); + xen_pfn_t pages; + + if ( boundary & (page_size - 1) ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: segment boundary isn't page aligned (0x%" PRIx64 ")", + __FUNCTION__, boundary); + return -1; + } + if ( boundary < dom->virt_alloc_end ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: segment boundary too low (0x%" PRIx64 " < 0x%" PRIx64 + ")", __FUNCTION__, boundary, dom->virt_alloc_end); + return -1; + } + pages = (boundary - dom->virt_alloc_end) / page_size; 
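+    /*
+     * A worked example with assumed numbers (not from the original
+     * source): with 4 kB pages, virt_alloc_end == 0x101000 and
+     * boundary == 0x104000 give (0x104000 - 0x101000) / 0x1000 == 3
+     * pages of padding.
+     */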
+ + return xc_dom_chk_alloc_pages(dom, "padding", pages); +} + +int xc_dom_alloc_segment(struct xc_dom_image *dom, + struct xc_dom_seg *seg, char *name, + xen_vaddr_t start, xen_vaddr_t size) +{ + unsigned int page_size = XC_DOM_PAGE_SIZE(dom); + xen_pfn_t pages; + void *ptr; + + if ( start && xc_dom_alloc_pad(dom, start) ) + return -1; + + pages = (size + page_size - 1) / page_size; + start = dom->virt_alloc_end; + + seg->pfn = dom->pfn_alloc_end; + seg->pages = pages; + + if ( xc_dom_chk_alloc_pages(dom, name, pages) ) + return -1; + + /* map and clear pages */ + ptr = xc_dom_seg_to_ptr(dom, seg); + if ( ptr == NULL ) + return -1; + memset(ptr, 0, pages * page_size); + + seg->vstart = start; + seg->vend = dom->virt_alloc_end; + + DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " -> 0x%" PRIx64 + " (pfn 0x%" PRIpfn " + 0x%" PRIpfn " pages)", + __FUNCTION__, name, seg->vstart, seg->vend, seg->pfn, pages); + + return 0; +} + +xen_pfn_t xc_dom_alloc_page(struct xc_dom_image *dom, char *name) +{ + xen_vaddr_t start; + xen_pfn_t pfn; + + start = dom->virt_alloc_end; + pfn = dom->pfn_alloc_end - dom->rambase_pfn; + + if ( xc_dom_chk_alloc_pages(dom, name, 1) ) + return INVALID_PFN; + + DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " (pfn 0x%" PRIpfn ")", + __FUNCTION__, name, start, pfn); + return pfn; +} + +void xc_dom_unmap_one(struct xc_dom_image *dom, xen_pfn_t pfn) +{ + unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom); + struct xc_dom_phys *phys, *prev = NULL; + + for ( phys = dom->phys_pages; phys != NULL; phys = phys->next ) + { + if ( (pfn >= phys->first) && (pfn < (phys->first + phys->count)) ) + break; + prev = phys; + } + if ( !phys ) + { + DOMPRINTF("%s: Huh? no mapping with pfn 0x%" PRIpfn "", + __FUNCTION__, pfn); + return; + } + + munmap(phys->ptr, phys->count << page_shift); + if ( prev ) + prev->next = phys->next; + else + dom->phys_pages = phys->next; + + xc_domain_cacheflush(dom->xch, dom->guest_domid, phys->first, phys->count); +} + +void xc_dom_unmap_all(struct xc_dom_image *dom) +{ + while ( dom->phys_pages ) + xc_dom_unmap_one(dom, dom->phys_pages->first); +} + +/* ------------------------------------------------------------------------ */ +/* pluggable kernel loaders */ + +static struct xc_dom_loader *first_loader = NULL; +static struct xc_dom_arch *first_hook = NULL; + +void xc_dom_register_loader(struct xc_dom_loader *loader) +{ + loader->next = first_loader; + first_loader = loader; +} + +static struct xc_dom_loader *xc_dom_find_loader(struct xc_dom_image *dom) +{ + struct xc_dom_loader *loader = first_loader; + + while ( loader != NULL ) + { + DOMPRINTF("%s: trying %s loader ... 
", __FUNCTION__, loader->name); + if ( loader->probe(dom) == 0 ) + { + DOMPRINTF("loader probe OK"); + return loader; + } + DOMPRINTF("loader probe failed"); + loader = loader->next; + } + xc_dom_panic(dom->xch, + XC_INVALID_KERNEL, "%s: no loader found", __FUNCTION__); + return NULL; +} + +void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks) +{ + hooks->next = first_hook; + first_hook = hooks; +} + +int xc_dom_set_arch_hooks(struct xc_dom_image *dom) +{ + struct xc_dom_arch *hooks = first_hook; + + while ( hooks != NULL ) + { + if ( !strcmp(hooks->guest_type, dom->guest_type) ) + { + if ( hooks->arch_private_size ) + { + dom->arch_private = malloc(hooks->arch_private_size); + if ( dom->arch_private == NULL ) + return -1; + memset(dom->arch_private, 0, hooks->arch_private_size); + dom->alloc_malloc += hooks->arch_private_size; + } + dom->arch_hooks = hooks; + return 0; + } + hooks = hooks->next; + } + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s: not found (type %s)", __FUNCTION__, dom->guest_type); + return -1; +} + +/* ------------------------------------------------------------------------ */ +/* public interface */ + +void xc_dom_release(struct xc_dom_image *dom) +{ + DOMPRINTF_CALLED(dom->xch); + if ( dom->phys_pages ) + xc_dom_unmap_all(dom); + xc_dom_free_all(dom); + free(dom->arch_private); + free(dom); +} + +struct xc_dom_image *xc_dom_allocate(xc_interface *xch, + const char *cmdline, const char *features) +{ + struct xc_dom_image *dom; + + xc_dom_printf(xch, "%s: cmdline=\"%s\", features=\"%s\"", + __FUNCTION__, cmdline ? cmdline : "", + features ? features : ""); + dom = malloc(sizeof(*dom)); + if ( !dom ) + goto err; + + memset(dom, 0, sizeof(*dom)); + dom->xch = xch; + + dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX; + dom->max_module_size = XC_DOM_DECOMPRESS_MAX; + dom->max_devicetree_size = XC_DOM_DECOMPRESS_MAX; + + if ( cmdline ) + dom->cmdline = xc_dom_strdup(dom, cmdline); + if ( features ) + elf_xen_parse_features(features, dom->f_requested, NULL); + + dom->parms.virt_base = UNSET_ADDR; + dom->parms.virt_entry = UNSET_ADDR; + dom->parms.virt_hypercall = UNSET_ADDR; + dom->parms.virt_hv_start_low = UNSET_ADDR; + dom->parms.elf_paddr_offset = UNSET_ADDR; + dom->parms.p2m_base = UNSET_ADDR; + + dom->flags = SIF_VIRT_P2M_4TOOLS; + + dom->alloc_malloc += sizeof(*dom); + return dom; + + err: + if ( dom ) + xc_dom_release(dom); + return NULL; +} + +int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz) +{ + DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz); + dom->max_kernel_size = sz; + return 0; +} + +int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz) +{ + DOMPRINTF("%s: module_max_size=%zx", __FUNCTION__, sz); + dom->max_module_size = sz; + return 0; +} + +int xc_dom_devicetree_max_size(struct xc_dom_image *dom, size_t sz) +{ + DOMPRINTF("%s: devicetree_max_size=%zx", __FUNCTION__, sz); + dom->max_devicetree_size = sz; + return 0; +} + +int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename) +{ + DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); + dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size, + dom->max_kernel_size); + if ( dom->kernel_blob == NULL ) + return -1; + return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); +} + +int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, const char *cmdline) +{ + unsigned int mod = dom->num_modules++; + + DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); + dom->modules[mod].blob = + xc_dom_malloc_filemap(dom, 
filename, &dom->modules[mod].size, + dom->max_module_size); + + if ( dom->modules[mod].blob == NULL ) + return -1; + + if ( cmdline ) + { + dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); + + if ( dom->modules[mod].cmdline == NULL ) + return -1; + } + else + { + dom->modules[mod].cmdline = NULL; + } + + return 0; +} + +int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename) +{ +#if defined (__arm__) || defined(__aarch64__) + DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); + dom->devicetree_blob = + xc_dom_malloc_filemap(dom, filename, &dom->devicetree_size, + dom->max_devicetree_size); + + if ( dom->devicetree_blob == NULL ) + return -1; + return 0; +#else + errno = -EINVAL; + return -1; +#endif +} + +int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, size_t memsize) +{ + DOMPRINTF_CALLED(dom->xch); + dom->kernel_blob = (void *)mem; + dom->kernel_size = memsize; + return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); +} + +int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem, + size_t memsize, const char *cmdline) +{ + unsigned int mod = dom->num_modules++; + + DOMPRINTF_CALLED(dom->xch); + + dom->modules[mod].blob = (void *)mem; + dom->modules[mod].size = memsize; + + if ( cmdline ) + { + dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); + + if ( dom->modules[mod].cmdline == NULL ) + return -1; + } + else + { + dom->modules[mod].cmdline = NULL; + } + + return 0; +} + +int xc_dom_devicetree_mem(struct xc_dom_image *dom, const void *mem, + size_t memsize) +{ + DOMPRINTF_CALLED(dom->xch); + dom->devicetree_blob = (void *)mem; + dom->devicetree_size = memsize; + return 0; +} + +int xc_dom_parse_image(struct xc_dom_image *dom) +{ + int i; + + DOMPRINTF_CALLED(dom->xch); + + /* parse kernel image */ + dom->kernel_loader = xc_dom_find_loader(dom); + if ( dom->kernel_loader == NULL ) + goto err; + if ( dom->kernel_loader->parser(dom) != 0 ) + goto err; + if ( dom->guest_type == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: guest_type not set", __FUNCTION__); + goto err; + } + + /* check features */ + for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ ) + { + dom->f_active[i] |= dom->f_requested[i]; /* cmd line */ + dom->f_active[i] |= dom->parms.f_required[i]; /* kernel */ + if ( (dom->f_active[i] & dom->parms.f_supported[i]) != + dom->f_active[i] ) + { + xc_dom_panic(dom->xch, XC_INVALID_PARAM, + "%s: unsupported feature requested", __FUNCTION__); + goto err; + } + } + return 0; + + err: + return -1; +} + +int xc_dom_rambase_init(struct xc_dom_image *dom, uint64_t rambase) +{ + dom->rambase_pfn = rambase >> XC_PAGE_SHIFT; + dom->pfn_alloc_end = dom->rambase_pfn; + DOMPRINTF("%s: RAM starts at %"PRI_xen_pfn, + __FUNCTION__, dom->rambase_pfn); + return 0; +} + +int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb) +{ + unsigned int page_shift; + xen_pfn_t nr_pages; + + if ( xc_dom_set_arch_hooks(dom) ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set", + __FUNCTION__); + return -1; + } + + page_shift = XC_DOM_PAGE_SHIFT(dom); + nr_pages = mem_mb << (20 - page_shift); + + DOMPRINTF("%s: mem %d MB, pages 0x%" PRIpfn " pages, %dk each", + __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10)); + dom->total_pages = nr_pages; + + DOMPRINTF("%s: 0x%" PRIpfn " pages", + __FUNCTION__, dom->total_pages); + + return 0; +} + +static int xc_dom_build_module(struct xc_dom_image *dom, unsigned int mod) +{ + size_t unziplen, modulelen; + void *modulemap; + char name[10]; + + if ( 
!dom->modules[mod].seg.vstart )
+        unziplen = xc_dom_check_gzip(dom->xch,
+                                     dom->modules[mod].blob, dom->modules[mod].size);
+    else
+        unziplen = 0;
+
+    modulelen = max(unziplen, dom->modules[mod].size);
+    if ( dom->max_module_size )
+    {
+        if ( unziplen && modulelen > dom->max_module_size )
+        {
+            modulelen = min(unziplen, dom->modules[mod].size);
+            if ( unziplen > modulelen )
+                unziplen = 0;
+        }
+        if ( modulelen > dom->max_module_size )
+        {
+            xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+                         "module %u image too large", mod);
+            goto err;
+        }
+    }
+
+    snprintf(name, sizeof(name), "module%u", mod);
+    if ( xc_dom_alloc_segment(dom, &dom->modules[mod].seg, name,
+                              dom->modules[mod].seg.vstart, modulelen) != 0 )
+        goto err;
+    modulemap = xc_dom_seg_to_ptr(dom, &dom->modules[mod].seg);
+    if ( modulemap == NULL )
+    {
+        DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->modules[%u].seg) => NULL",
+                  __FUNCTION__, mod);
+        goto err;
+    }
+    if ( unziplen )
+    {
+        if ( xc_dom_do_gunzip(dom->xch, dom->modules[mod].blob, dom->modules[mod].size,
+                              modulemap, unziplen) != -1 )
+            return 0;
+        if ( dom->modules[mod].size > modulelen )
+            goto err;
+    }
+
+    /* Fall back to handing over the raw blob. */
+    memcpy(modulemap, dom->modules[mod].blob, dom->modules[mod].size);
+    /* If an unzip attempt was made, the buffer may no longer be all zero. */
+    if ( unziplen > dom->modules[mod].size )
+        memset(modulemap + dom->modules[mod].size, 0,
+               unziplen - dom->modules[mod].size);
+
+    return 0;
+
+ err:
+    return -1;
+}
+
+static int populate_acpi_pages(struct xc_dom_image *dom,
+                               xen_pfn_t *extents,
+                               unsigned int num_pages)
+{
+    int rc;
+    xc_interface *xch = dom->xch;
+    uint32_t domid = dom->guest_domid;
+    unsigned long idx;
+    unsigned long first_high_idx = 4UL << (30 - PAGE_SHIFT); /* 4GB */
+
+    for ( ; num_pages; num_pages--, extents++ )
+    {
+
+        if ( xc_domain_populate_physmap(xch, domid, 1, 0, 0, extents) == 1 )
+            continue;
+
+        if ( dom->highmem_end )
+        {
+            idx = --dom->highmem_end;
+            if ( idx == first_high_idx )
+                dom->highmem_end = 0;
+        }
+        else
+        {
+            idx = --dom->lowmem_end;
+        }
+
+        rc = xc_domain_add_to_physmap(xch, domid,
+                                      XENMAPSPACE_gmfn,
+                                      idx, *extents);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+}
+
+static int xc_dom_load_acpi(struct xc_dom_image *dom)
+{
+    int j, i = 0;
+    unsigned num_pages;
+    xen_pfn_t *extents, base;
+    void *ptr;
+
+    while ( (i < MAX_ACPI_MODULES) && dom->acpi_modules[i].length )
+    {
+        DOMPRINTF("%s: %d bytes at address %" PRIx64, __FUNCTION__,
+                  dom->acpi_modules[i].length,
+                  dom->acpi_modules[i].guest_addr_out);
+
+        num_pages = (dom->acpi_modules[i].length +
+                     (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK) +
+                     (XC_PAGE_SIZE - 1)) >> XC_PAGE_SHIFT;
+        extents = malloc(num_pages * sizeof(*extents));
+        if ( !extents )
+        {
+            DOMPRINTF("%s: Out of memory", __FUNCTION__);
+            goto err;
+        }
+
+        base = dom->acpi_modules[i].guest_addr_out >> XC_PAGE_SHIFT;
+        for ( j = 0; j < num_pages; j++ )
+            extents[j] = base + j;
+        if ( populate_acpi_pages(dom, extents, num_pages) )
+        {
+            DOMPRINTF("%s: Can't populate ACPI pages", __FUNCTION__);
+            goto err;
+        }
+
+        ptr = xc_map_foreign_range(dom->xch, dom->guest_domid,
+                                   XC_PAGE_SIZE * num_pages,
+                                   PROT_READ | PROT_WRITE, base);
+        if ( !ptr )
+        {
+            DOMPRINTF("%s: Can't map %d pages at 0x%"PRI_xen_pfn,
+                      __FUNCTION__, num_pages, base);
+            goto err;
+        }
+
+        memcpy((uint8_t *)ptr +
+               (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK),
+               dom->acpi_modules[i].data, dom->acpi_modules[i].length);
+        munmap(ptr, XC_PAGE_SIZE * num_pages);
+
+        free(extents);
+        i++;
+    }
+
+    return
0; + +err: + free(extents); + return -1; +} + +int xc_dom_build_image(struct xc_dom_image *dom) +{ + unsigned int page_size; + bool unmapped_initrd; + unsigned int mod; + + DOMPRINTF_CALLED(dom->xch); + + /* check for arch hooks */ + if ( dom->arch_hooks == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set", + __FUNCTION__); + goto err; + } + page_size = XC_DOM_PAGE_SIZE(dom); + if ( dom->parms.virt_base != UNSET_ADDR ) + dom->virt_alloc_end = dom->parms.virt_base; + + /* load kernel */ + if ( xc_dom_alloc_segment(dom, &dom->kernel_seg, "kernel", + dom->kernel_seg.vstart, + dom->kernel_seg.vend - + dom->kernel_seg.vstart) != 0 ) + goto err; + if ( dom->kernel_loader->loader(dom) != 0 ) + goto err; + + /* Don't load ramdisk / other modules now if no initial mapping required. */ + for ( mod = 0; mod < dom->num_modules; mod++ ) + { + unmapped_initrd = (dom->parms.unmapped_initrd && + !dom->modules[mod].seg.vstart); + + if ( dom->modules[mod].blob && !unmapped_initrd ) + { + if ( xc_dom_build_module(dom, mod) != 0 ) + goto err; + + if ( mod == 0 ) + { + dom->initrd_start = dom->modules[mod].seg.vstart; + dom->initrd_len = + dom->modules[mod].seg.vend - dom->modules[mod].seg.vstart; + } + } + } + + /* load devicetree */ + if ( dom->devicetree_blob ) + { + void *devicetreemap; + + if ( xc_dom_alloc_segment(dom, &dom->devicetree_seg, "devicetree", + dom->devicetree_seg.vstart, + dom->devicetree_size) != 0 ) + goto err; + devicetreemap = xc_dom_seg_to_ptr(dom, &dom->devicetree_seg); + if ( devicetreemap == NULL ) + { + DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->devicetree_seg) => NULL", + __FUNCTION__); + goto err; + } + memcpy(devicetreemap, dom->devicetree_blob, dom->devicetree_size); + } + + /* load ACPI tables */ + if ( xc_dom_load_acpi(dom) != 0 ) + goto err; + + /* allocate other pages */ + if ( !dom->arch_hooks->p2m_base_supported || + dom->parms.p2m_base >= dom->parms.virt_base || + (dom->parms.p2m_base & (XC_DOM_PAGE_SIZE(dom) - 1)) ) + dom->parms.p2m_base = UNSET_ADDR; + if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base == UNSET_ADDR && + dom->arch_hooks->alloc_p2m_list(dom) != 0 ) + goto err; + if ( dom->arch_hooks->alloc_magic_pages(dom) != 0 ) + goto err; + if ( dom->arch_hooks->alloc_pgtables && + dom->arch_hooks->alloc_pgtables(dom) != 0 ) + goto err; + if ( dom->alloc_bootstack ) + { + dom->bootstack_pfn = xc_dom_alloc_page(dom, "boot stack"); + if ( dom->bootstack_pfn == INVALID_PFN ) + goto err; + } + + DOMPRINTF("%-20s: virt_alloc_end : 0x%" PRIx64 "", + __FUNCTION__, dom->virt_alloc_end); + DOMPRINTF("%-20s: virt_pgtab_end : 0x%" PRIx64 "", + __FUNCTION__, dom->virt_pgtab_end); + + /* Make sure all memory mapped by initial page tables is available */ + if ( dom->virt_pgtab_end && xc_dom_alloc_pad(dom, dom->virt_pgtab_end) ) + return -1; + + for ( mod = 0; mod < dom->num_modules; mod++ ) + { + unmapped_initrd = (dom->parms.unmapped_initrd && + !dom->modules[mod].seg.vstart); + + /* Load ramdisk / other modules if no initial mapping required. */ + if ( dom->modules[mod].blob && unmapped_initrd ) + { + if ( xc_dom_build_module(dom, mod) != 0 ) + goto err; + + if ( mod == 0 ) + { + dom->flags |= SIF_MOD_START_PFN; + dom->initrd_start = dom->modules[mod].seg.pfn; + dom->initrd_len = page_size * dom->modules[mod].seg.pages; + } + } + } + + /* Allocate p2m list if outside of initial kernel mapping. 
*/ + if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base != UNSET_ADDR ) + { + if ( dom->arch_hooks->alloc_p2m_list(dom) != 0 ) + goto err; + dom->p2m_seg.vstart = dom->parms.p2m_base; + } + + return 0; + + err: + return -1; +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_decompress.h b/tools/libs/guest/xg_dom_decompress.h new file mode 100644 index 0000000000..c5ab2e59eb --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress.h @@ -0,0 +1,8 @@ +#ifndef __MINIOS__ +# include "xenctrl_dom.h" +#else +# include "xg_dom_decompress_unsafe.h" +#endif + +int xc_try_lz4_decode(struct xc_dom_image *dom, void **blob, size_t *size); + diff --git a/tools/libs/guest/xg_dom_decompress_lz4.c b/tools/libs/guest/xg_dom_decompress_lz4.c new file mode 100644 index 0000000000..97ba620d86 --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_lz4.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress.h" + +#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#define likely(a) a +#define unlikely(a) a + +static inline uint_fast16_t le16_to_cpup(const unsigned char *buf) +{ + return buf[0] | (buf[1] << 8); +} + +static inline uint_fast32_t le32_to_cpup(const unsigned char *buf) +{ + return le16_to_cpup(buf) | ((uint32_t)le16_to_cpup(buf + 2) << 16); +} + +#include "../../xen/include/xen/lz4.h" +#include "../../xen/common/decompress.h" + +#ifndef __MINIOS__ + +#include "../../xen/common/lz4/decompress.c" + +#define ARCHIVE_MAGICNUMBER 0x184C2102 + +int xc_try_lz4_decode( + struct xc_dom_image *dom, void **blob, size_t *psize) +{ + int ret = -1; + unsigned char *inp = *blob, *output, *outp; + ssize_t size = *psize - 4; + size_t out_len, dest_len, chunksize; + const char *msg; + + if (size < 4) { + msg = "input too small"; + goto exit_0; + } + + out_len = get_unaligned_le32(inp + size); + if (xc_dom_kernel_check_size(dom, out_len)) { + msg = "Decompressed image too large"; + goto exit_0; + } + + output = malloc(out_len); + if (!output) { + msg = "Could not allocate output buffer"; + goto exit_0; + } + outp = output; + + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + } else { + msg = "invalid header"; + goto exit_2; + } + + for (;;) { + if (size < 4) { + msg = "missing data"; + goto exit_2; + } + chunksize = get_unaligned_le32(inp); + if (chunksize == ARCHIVE_MAGICNUMBER) { + inp += 4; + size -= 4; + continue; + } + inp += 4; + size -= 4; + if (chunksize > size) { + msg = "insufficient input data"; + goto exit_2; + } + + dest_len = out_len - (outp - output); + ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, + &dest_len); + if (ret < 0) { + msg = "decoding failed"; + goto exit_2; + } + + ret = -1; + outp += dest_len; + size -= chunksize; + + if (size == 0) + { + if ( xc_dom_register_external(dom, output, out_len) ) + { + msg = "Error registering stream output"; + goto exit_2; + } + *blob = output; + *psize = out_len; + return 0; + } + + if (size < 0) { + msg = "data corrupted"; + goto exit_2; + } + + inp += chunksize; + } + +exit_2: + free(output); +exit_0: + DOMPRINTF("LZ4 decompression error: %s\n", msg); + return ret; +} + +#else /* __MINIOS__ */ + +#include "../../xen/common/unlz4.c" + +int xc_try_lz4_decode( + struct xc_dom_image *dom, void **blob, size_t 
*size) +{ + return xc_dom_decompress_unsafe(unlz4, dom, blob, size); +} + +#endif diff --git a/tools/libs/guest/xg_dom_decompress_unsafe.c b/tools/libs/guest/xg_dom_decompress_unsafe.c new file mode 100644 index 0000000000..21d964787d --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe.c @@ -0,0 +1,48 @@ +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress_unsafe.h" + +static struct xc_dom_image *unsafe_dom; +static unsigned char *output_blob; +static unsigned int output_size; + +static void unsafe_error(const char *msg) +{ + xc_dom_panic(unsafe_dom->xch, XC_INVALID_KERNEL, "%s", msg); +} + +static int unsafe_flush(void *src, unsigned int size) +{ + void *n = realloc(output_blob, output_size + size); + if (!n) + return -1; + output_blob = n; + + memcpy(&output_blob[output_size], src, size); + output_size += size; + return size; +} + +int xc_dom_decompress_unsafe( + decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size) +{ + int ret; + + unsafe_dom = dom; + output_blob = NULL; + output_size = 0; + + ret = fn(dom->kernel_blob, dom->kernel_size, NULL, unsafe_flush, NULL, NULL, unsafe_error); + + if (ret) + free(output_blob); + else { + *blob = output_blob; + *size = output_size; + } + + return ret; +} diff --git a/tools/libs/guest/xg_dom_decompress_unsafe.h b/tools/libs/guest/xg_dom_decompress_unsafe.h new file mode 100644 index 0000000000..fb84b6add8 --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe.h @@ -0,0 +1,20 @@ +#include "xenctrl_dom.h" + +typedef int decompress_fn(unsigned char *inbuf, unsigned int len, + int (*fill)(void*, unsigned int), + int (*flush)(void*, unsigned int), + unsigned char *outbuf, unsigned int *posp, + void (*error)(const char *x)); + +int xc_dom_decompress_unsafe( + decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size) + __attribute__((visibility("internal"))); + +int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size) + __attribute__((visibility("internal"))); +int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size) + __attribute__((visibility("internal"))); +int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size) + __attribute__((visibility("internal"))); +int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size) + __attribute__((visibility("internal"))); diff --git a/tools/libs/guest/xg_dom_decompress_unsafe_bzip2.c b/tools/libs/guest/xg_dom_decompress_unsafe_bzip2.c new file mode 100644 index 0000000000..9d3709e6cc --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe_bzip2.c @@ -0,0 +1,14 @@ +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress_unsafe.h" + +#include "../../xen/common/bunzip2.c" + +int xc_try_bzip2_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + return xc_dom_decompress_unsafe(bunzip2, dom, blob, size); +} diff --git a/tools/libs/guest/xg_dom_decompress_unsafe_lzma.c b/tools/libs/guest/xg_dom_decompress_unsafe_lzma.c new file mode 100644 index 0000000000..5d178f0c43 --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe_lzma.c @@ -0,0 +1,14 @@ +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress_unsafe.h" + +#include "../../xen/common/unlzma.c" + +int xc_try_lzma_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + return xc_dom_decompress_unsafe(unlzma, dom, blob, size); +} diff --git a/tools/libs/guest/xg_dom_decompress_unsafe_lzo1x.c 
b/tools/libs/guest/xg_dom_decompress_unsafe_lzo1x.c new file mode 100644 index 0000000000..a4f8ebd42d --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe_lzo1x.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress_unsafe.h" + +typedef uint8_t u8; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint64_t u64; + +#define likely(a) a +#define noinline +#define unlikely(a) a + +static inline u16 be16_to_cpup(const u16 *p) +{ + u16 v = *p; +#if BYTE_ORDER == LITTLE_ENDIAN + return (((v & 0x00ffU) << 8) | + ((v & 0xff00U) >> 8)); +#else + return v; +#endif +} + +static inline u32 be32_to_cpup(const u32 *p) +{ + u32 v = *p; +#if BYTE_ORDER == LITTLE_ENDIAN + return (((v & 0x000000ffUL) << 24) | + ((v & 0x0000ff00UL) << 8) | + ((v & 0x00ff0000UL) >> 8) | + ((v & 0xff000000UL) >> 24)); +#else + return v; +#endif +} + +#include "../../xen/common/lzo.c" +#include "../../xen/common/unlzo.c" + +int xc_try_lzo1x_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + return xc_dom_decompress_unsafe(unlzo, dom, blob, size); +} diff --git a/tools/libs/guest/xg_dom_decompress_unsafe_xz.c b/tools/libs/guest/xg_dom_decompress_unsafe_xz.c new file mode 100644 index 0000000000..ff6824b38d --- /dev/null +++ b/tools/libs/guest/xg_dom_decompress_unsafe_xz.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include +#include +#include + +#include "xg_private.h" +#include "xg_dom_decompress_unsafe.h" + +// TODO +#define XZ_DEC_X86 + +typedef char bool_t; +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint32_t __le32; + +static inline u32 cpu_to_le32(const u32 v) +{ +#if BYTE_ORDER == BIG_ENDIAN + return (((v & 0x000000ffUL) << 24) | + ((v & 0x0000ff00UL) << 8) | + ((v & 0x00ff0000UL) >> 8) | + ((v & 0xff000000UL) >> 24)); +#else + return v; +#endif +} + +static inline u32 le32_to_cpup(const u32 *p) +{ + return cpu_to_le32(*p); +} + +#define __force +#define always_inline + +#include "../../xen/common/unxz.c" + +int xc_try_xz_decode( + struct xc_dom_image *dom, void **blob, size_t *size) +{ + return xc_dom_decompress_unsafe(unxz, dom, blob, size); +} diff --git a/tools/libs/guest/xg_dom_elfloader.c b/tools/libs/guest/xg_dom_elfloader.c new file mode 100644 index 0000000000..7043c3bbba --- /dev/null +++ b/tools/libs/guest/xg_dom_elfloader.c @@ -0,0 +1,249 @@ +/* + * Xen domain builder -- ELF bits. + * + * Parse and load ELF kernel images. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * written 2006 by Gerd Hoffmann . 
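+ *
+ * An illustrative sketch, not part of the original file: kernel loaders
+ * plug into the domain builder by registering a probe/parser/loader
+ * triple, and xc_dom_find_loader() in xg_dom_core.c tries each probe
+ * in turn:
+ *
+ *   static struct xc_dom_loader example_loader = {
+ *       .name   = "example",
+ *       .probe  = example_probe,   (returns 0 if the image is ours)
+ *       .parser = example_parse,
+ *       .loader = example_load,
+ *   };
+ *   xc_dom_register_loader(&example_loader);  (from an __init constructor)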
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> + +#include "xg_private.h" +#include "xenctrl_dom.h" +#include "xc_bitops.h" + +#define XEN_VER "xen-3.0" + +/* ------------------------------------------------------------------------ */ + +static void log_callback(struct elf_binary *elf, void *caller_data, + bool iserr, const char *fmt, va_list al) { + xc_interface *xch = caller_data; + + xc_reportv(xch, + xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler, + iserr ? XTL_ERROR : XTL_DETAIL, + iserr ? XC_INVALID_KERNEL : XC_ERROR_NONE, + fmt, al); +} + +void xc_elf_set_logfile(xc_interface *xch, struct elf_binary *elf, + int verbose) { + elf_set_log(elf, log_callback, xch, verbose /* convert to bool */); +} + +/* ------------------------------------------------------------------------ */ + +static char *xc_dom_guest_type(struct xc_dom_image *dom, + struct elf_binary *elf) +{ + uint64_t machine = elf_uval(elf, elf->ehdr, e_machine); + + if ( dom->container_type == XC_DOM_HVM_CONTAINER && + dom->parms.phys_entry != UNSET_ADDR32 ) + return "hvm-3.0-x86_32"; + if ( dom->container_type == XC_DOM_HVM_CONTAINER ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s: image not capable of booting inside a HVM container", + __FUNCTION__); + return NULL; + } + + switch ( machine ) + { + case EM_386: + switch ( dom->parms.pae ) + { + case XEN_PAE_BIMODAL: + if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") ) + return "xen-3.0-x86_32p"; + return "xen-3.0-x86_32"; + case XEN_PAE_EXTCR3: + case XEN_PAE_YES: + return "xen-3.0-x86_32p"; + case XEN_PAE_NO: + default: + return "xen-3.0-x86_32"; + } + case EM_X86_64: + return "xen-3.0-x86_64"; + default: + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s: unknown image type %"PRIu64, + __FUNCTION__, machine); + return NULL; + } +} + +/* ------------------------------------------------------------------------ */ +/* parse elf binary */ + +static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose) +{ + if ( dom->kernel_blob == NULL ) + { + if ( verbose ) + xc_dom_panic(dom->xch, + XC_INTERNAL_ERROR, "%s: no kernel image loaded", + __FUNCTION__); + return -EINVAL; + } + + if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) ) + { + if ( verbose ) + xc_dom_panic(dom->xch, + XC_INVALID_KERNEL, "%s: kernel is not an ELF image", + __FUNCTION__); + return -EINVAL; + } + return 0; +} + +static elf_negerrnoval xc_dom_probe_elf_kernel(struct xc_dom_image *dom) +{ + struct elf_binary elf; + int rc; + + rc = check_elf_kernel(dom, 0); + if ( rc != 0 ) + return rc; + + rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size); + if ( rc != 0 ) + return rc; + + /* + * We need to check that it contains Xen ELFNOTES, + * or else we might be trying to load a plain ELF. + */ + elf_parse_binary(&elf); + rc = elf_xen_parse(&elf, &dom->parms); + if ( rc != 0 ) + return rc; + + return 0; +} + +static elf_negerrnoval xc_dom_parse_elf_kernel(struct xc_dom_image *dom) +{ + struct elf_binary *elf; + elf_negerrnoval rc; + + rc = check_elf_kernel(dom, 1); + if ( rc != 0 ) + return rc; + + elf = xc_dom_malloc(dom, sizeof(*elf)); + if ( elf == NULL ) + return -ENOMEM; + dom->private_loader = elf; + rc = elf_init(elf, dom->kernel_blob, dom->kernel_size) != 0 ?
-EINVAL : 0; + xc_elf_set_logfile(dom->xch, elf, 1); + if ( rc != 0 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image", + __FUNCTION__); + return rc; + } + + /* parse binary and get xen meta info */ + elf_parse_binary(elf); + if ( elf_xen_parse(elf, &dom->parms) != 0 ) + { + rc = -EINVAL; + goto out; + } + + if ( elf_xen_feature_get(XENFEAT_dom0, dom->parms.f_required) ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Kernel does not" + " support unprivileged (DomU) operation", __FUNCTION__); + rc = -EINVAL; + goto out; + } + + /* find kernel segment */ + dom->kernel_seg.vstart = dom->parms.virt_kstart; + dom->kernel_seg.vend = dom->parms.virt_kend; + + dom->guest_type = xc_dom_guest_type(dom, elf); + if ( dom->guest_type == NULL ) + return -EINVAL; + DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", + __FUNCTION__, dom->guest_type, + dom->kernel_seg.vstart, dom->kernel_seg.vend); + rc = 0; +out: + if ( elf_check_broken(elf) ) + DOMPRINTF("%s: ELF broken: %s", __FUNCTION__, + elf_check_broken(elf)); + + return rc; +} + +static elf_errorstatus xc_dom_load_elf_kernel(struct xc_dom_image *dom) +{ + struct elf_binary *elf = dom->private_loader; + elf_errorstatus rc; + xen_pfn_t pages; + + elf->dest_base = xc_dom_seg_to_ptr_pages(dom, &dom->kernel_seg, &pages); + if ( elf->dest_base == NULL ) + { + DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom,dom->kernel_seg)" + " => NULL", __FUNCTION__); + return -1; + } + elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom); + + rc = elf_load_binary(elf); + if ( rc < 0 ) + { + DOMPRINTF("%s: failed to load elf binary", __FUNCTION__); + return rc; + } + return 0; +} + +/* ------------------------------------------------------------------------ */ + +struct xc_dom_loader elf_loader = { + .name = "ELF-generic", + .probe = xc_dom_probe_elf_kernel, + .parser = xc_dom_parse_elf_kernel, + .loader = xc_dom_load_elf_kernel, +}; + +static void __init register_loader(void) +{ + xc_dom_register_loader(&elf_loader); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_hvmloader.c b/tools/libs/guest/xg_dom_hvmloader.c new file mode 100644 index 0000000000..995a0f3dc3 --- /dev/null +++ b/tools/libs/guest/xg_dom_hvmloader.c @@ -0,0 +1,264 @@ +/* + * Xen domain builder -- HVM specific bits. + * + * Parse and load ELF firmware images for HVM domains. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> +#include <assert.h> + +#include "xg_private.h" +#include "xenctrl_dom.h" +#include "xc_bitops.h" + +/* ------------------------------------------------------------------------ */ +/* parse elf binary */ + +static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose) +{ + if ( dom->kernel_blob == NULL ) + { + if ( verbose ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: no kernel image loaded", __func__); + return -EINVAL; + } + + if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) ) + { + if ( verbose ) + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, + "%s: kernel is not an ELF image", __func__); + return -EINVAL; + } + return 0; +} + +static elf_negerrnoval xc_dom_probe_hvm_kernel(struct xc_dom_image *dom) +{ + struct elf_binary elf; + int rc; + + /* This loader is designed for HVM guest firmware. */ + if ( dom->container_type != XC_DOM_HVM_CONTAINER ) + return -EINVAL; + + rc = check_elf_kernel(dom, 0); + if ( rc != 0 ) + return rc; + + rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size); + if ( rc != 0 ) + return rc; + + /* + * We need to check that there are no Xen ELFNOTES, or + * else we might be trying to load a PV kernel. + */ + elf_parse_binary(&elf); + rc = elf_xen_parse(&elf, &dom->parms); + if ( rc == 0 ) + return -EINVAL; + + return 0; +} + +static elf_errorstatus xc_dom_parse_hvm_kernel(struct xc_dom_image *dom) + /* + * This function sometimes returns -1 for error and sometimes + * an errno value. ?!?! + */ +{ + struct elf_binary *elf; + elf_errorstatus rc; + + rc = check_elf_kernel(dom, 1); + if ( rc != 0 ) + return rc; + + elf = xc_dom_malloc(dom, sizeof(*elf)); + if ( elf == NULL ) + return -1; + dom->private_loader = elf; + rc = elf_init(elf, dom->kernel_blob, dom->kernel_size); + xc_elf_set_logfile(dom->xch, elf, 1); + if ( rc != 0 ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image", + __func__); + return rc; + } + + if ( !elf_32bit(elf) ) + { + xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: ELF image is not 32bit", + __func__); + return -EINVAL; + } + + /* parse binary and get xen meta info */ + elf_parse_binary(elf); + + /* find kernel segment */ + dom->kernel_seg.vstart = elf->pstart; + dom->kernel_seg.vend = elf->pend; + + dom->guest_type = "hvm-3.0-x86_32"; + + if ( elf_check_broken(elf) ) + DOMPRINTF("%s: ELF broken: %s", __func__, elf_check_broken(elf)); + + return rc; +} + +static int module_init_one(struct xc_dom_image *dom, + struct xc_hvm_firmware_module *module, + char *name) +{ + struct xc_dom_seg seg; + void *dest; + + if ( module->length && !module->guest_addr_out ) + { + if ( xc_dom_alloc_segment(dom, &seg, name, 0, module->length) ) + goto err; + dest = xc_dom_seg_to_ptr(dom, &seg); + if ( dest == NULL ) + { + DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &seg) => NULL", + __FUNCTION__); + goto err; + } + memcpy(dest, module->data, module->length); + module->guest_addr_out = seg.vstart; + + assert(dom->mmio_start > 0 && dom->mmio_start < UINT32_MAX); + if ( module->guest_addr_out > dom->mmio_start || + module->guest_addr_out + module->length > dom->mmio_start ) + { + DOMPRINTF("%s: Module %s would be loaded above 4GB", + __FUNCTION__, name); + goto err; + } + } + + return 0; +err: + return -1; +} + +static int modules_init(struct xc_dom_image
*dom) +{ + int rc; + + rc = module_init_one(dom, &dom->system_firmware_module, + "System Firmware module"); + if ( rc ) goto err; + /* Only one module can be added */ + rc = module_init_one(dom, &dom->acpi_modules[0], "ACPI module"); + if ( rc ) goto err; + rc = module_init_one(dom, &dom->smbios_module, "SMBIOS module"); + if ( rc ) goto err; + + return 0; +err: + return -1; +} + +static elf_errorstatus xc_dom_load_hvm_kernel(struct xc_dom_image *dom) +{ + struct elf_binary *elf = dom->private_loader; + privcmd_mmap_entry_t *entries = NULL; + size_t pages = (elf->pend - elf->pstart + PAGE_SIZE - 1) >> PAGE_SHIFT; + elf_errorstatus rc; + int i; + + /* Map address space for initial elf image. */ + entries = calloc(pages, sizeof(privcmd_mmap_entry_t)); + if ( entries == NULL ) + return -ENOMEM; + + for ( i = 0; i < pages; i++ ) + entries[i].mfn = (elf->pstart >> PAGE_SHIFT) + i; + + elf->dest_base = xc_map_foreign_ranges( + dom->xch, dom->guest_domid, pages << PAGE_SHIFT, + PROT_READ | PROT_WRITE, 1 << PAGE_SHIFT, + entries, pages); + if ( elf->dest_base == NULL ) + { + DOMPRINTF("%s: unable to map guest memory space", __func__); + rc = -EFAULT; + goto error; + } + + elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom); + + rc = elf_load_binary(elf); + if ( rc < 0 ) + { + DOMPRINTF("%s: failed to load elf binary", __func__); + goto error; + } + + munmap(elf->dest_base, elf->dest_size); + + rc = modules_init(dom); + if ( rc != 0 ) + { + DOMPRINTF("%s: unable to load modules.", __func__); + goto error; + } + + dom->parms.phys_entry = elf_uval(elf, elf->ehdr, e_entry); + + free(entries); + return 0; + + error: + assert(rc != 0); + free(entries); + return rc; +} + +/* ------------------------------------------------------------------------ */ + +struct xc_dom_loader hvm_loader = { + .name = "HVM-generic", + .probe = xc_dom_probe_hvm_kernel, + .parser = xc_dom_parse_hvm_kernel, + .loader = xc_dom_load_hvm_kernel, +}; + +static void __init register_loader(void) +{ + xc_dom_register_loader(&hvm_loader); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_dom_x86.c b/tools/libs/guest/xg_dom_x86.c new file mode 100644 index 0000000000..842dbcccdd --- /dev/null +++ b/tools/libs/guest/xg_dom_x86.c @@ -0,0 +1,1945 @@ +/* + * Xen domain builder -- i386 and x86_64 bits. + * + * Most architecture-specific code for x86 goes here. + * - prepare page tables. + * - fill architecture-specific structs. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see <http://www.gnu.org/licenses/>. + * + * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> +#include <assert.h> + +#include <xen/xen.h> +#include <xen/foreign/x86_32.h> +#include <xen/foreign/x86_64.h> +#include <xen/hvm/hvm_info_table.h> +#include <xen/arch-x86/hvm/start_info.h> +#include <xen/io/protocols.h> + +#include <xen-tools/libs.h> + +#include "xg_private.h" +#include "xenctrl_dom.h" +#include "xenctrl.h" + +/* ------------------------------------------------------------------------ */ + +#define SUPERPAGE_BATCH_SIZE 512 + +#define SUPERPAGE_2MB_SHIFT 9 +#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) +#define SUPERPAGE_1GB_SHIFT 18 +#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) + +#define X86_CR0_PE 0x01 +#define X86_CR0_ET 0x10 + +#define X86_DR6_DEFAULT 0xffff0ff0u +#define X86_DR7_DEFAULT 0x00000400u + +#define MTRR_TYPE_WRBACK 6 +#define MTRR_DEF_TYPE_ENABLE (1u << 11) + +#define SPECIALPAGE_PAGING 0 +#define SPECIALPAGE_ACCESS 1 +#define SPECIALPAGE_SHARING 2 +#define SPECIALPAGE_BUFIOREQ 3 +#define SPECIALPAGE_XENSTORE 4 +#define SPECIALPAGE_IOREQ 5 +#define SPECIALPAGE_IDENT_PT 6 +#define SPECIALPAGE_CONSOLE 7 +#define special_pfn(x) \ + (X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x)) + +#define NR_IOREQ_SERVER_PAGES 8 +#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x)) + +#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1) +#define round_down(addr, mask) ((addr) & ~(mask)) +#define round_up(addr, mask) ((addr) | (mask)) +#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1)) + +#define HVMLOADER_MODULE_MAX_COUNT 2 +#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE + +struct xc_dom_params { + unsigned levels; + xen_vaddr_t vaddr_mask; + x86_pgentry_t lvl_prot[4]; +}; + +struct xc_dom_x86_mapping_lvl { + xen_vaddr_t from; + xen_vaddr_t to; + xen_pfn_t pfn; + unsigned int pgtables; +}; + +struct xc_dom_x86_mapping { + struct xc_dom_x86_mapping_lvl area; + struct xc_dom_x86_mapping_lvl lvls[4]; +}; + +struct xc_dom_image_x86 { + unsigned n_mappings; +#define MAPPING_MAX 2 + struct xc_dom_x86_mapping maps[MAPPING_MAX]; + const struct xc_dom_params *params; + + /* PV: Pointer to the in-guest P2M.
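A flat array with one entry per PFN; the entry width matches the guest (uint32_t for 32-bit PV, uint64_t for 64-bit PV) and it is filled from dom->pv_p2m by the setup_pgtables hooks.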
*/ + void *p2m_guest; +}; + +/* get guest IO ABI protocol */ +const char *xc_domain_get_native_protocol(xc_interface *xch, + uint32_t domid) +{ + int ret; + uint32_t guest_width; + const char *protocol; + + ret = xc_domain_get_guest_width(xch, domid, &guest_width); + + if ( ret ) + return NULL; + + switch (guest_width) { + case 4: /* 32 bit guest */ + protocol = XEN_IO_PROTO_ABI_X86_32; + break; + case 8: /* 64 bit guest */ + protocol = XEN_IO_PROTO_ABI_X86_64; + break; + default: + protocol = NULL; + } + + return protocol; +} + +static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from, + xen_vaddr_t to, xen_pfn_t pfn) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map, *map_cmp; + xen_pfn_t pfn_end; + xen_vaddr_t mask; + unsigned bits; + int l, m; + + if ( domx86->n_mappings == MAPPING_MAX ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "%s: too many mappings\n", __FUNCTION__); + return -ENOMEM; + } + map = domx86->maps + domx86->n_mappings; + + pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86); + if ( pfn_end >= dom->p2m_size ) + { + xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, + "%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")", + __FUNCTION__, pfn_end, dom->p2m_size); + return -ENOMEM; + } + for ( m = 0; m < domx86->n_mappings; m++ ) + { + map_cmp = domx86->maps + m; + if ( from < map_cmp->area.to && to > map_cmp->area.from ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: overlapping mappings\n", __FUNCTION__); + return -EINVAL; + } + } + + memset(map, 0, sizeof(*map)); + map->area.from = from & domx86->params->vaddr_mask; + map->area.to = to & domx86->params->vaddr_mask; + + for ( l = domx86->params->levels - 1; l >= 0; l-- ) + { + map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables; + if ( l == domx86->params->levels - 1 ) + { + /* Top level page table in first mapping only. 
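Later mappings reuse the top level table allocated for mapping 0 and only add the lower-level tables they are missing.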
*/ + if ( domx86->n_mappings == 0 ) + { + map->lvls[l].from = 0; + map->lvls[l].to = domx86->params->vaddr_mask; + map->lvls[l].pgtables = 1; + map->area.pgtables++; + } + continue; + } + + bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86; + mask = bits_to_mask(bits); + map->lvls[l].from = map->area.from & ~mask; + map->lvls[l].to = map->area.to | mask; + + if ( domx86->params->levels == PGTBL_LEVELS_I386 && + domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 ) + { + DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__); + map->lvls[l].to = domx86->params->vaddr_mask; + } + + for ( m = 0; m < domx86->n_mappings; m++ ) + { + map_cmp = domx86->maps + m; + if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to ) + continue; + if ( map->lvls[l].from >= map_cmp->lvls[l].from && + map->lvls[l].to <= map_cmp->lvls[l].to ) + { + map->lvls[l].from = 0; + map->lvls[l].to = 0; + break; + } + assert(map->lvls[l].from >= map_cmp->lvls[l].from || + map->lvls[l].to <= map_cmp->lvls[l].to); + if ( map->lvls[l].from >= map_cmp->lvls[l].from && + map->lvls[l].from <= map_cmp->lvls[l].to ) + map->lvls[l].from = map_cmp->lvls[l].to + 1; + if ( map->lvls[l].to >= map_cmp->lvls[l].from && + map->lvls[l].to <= map_cmp->lvls[l].to ) + map->lvls[l].to = map_cmp->lvls[l].from - 1; + } + if ( map->lvls[l].from < map->lvls[l].to ) + map->lvls[l].pgtables = + ((map->lvls[l].to - map->lvls[l].from) >> bits) + 1; + DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64 + ", %d table(s)", __FUNCTION__, mask, bits, + map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables); + map->area.pgtables += map->lvls[l].pgtables; + } + + return 0; +} + +static int alloc_pgtables_pv(struct xc_dom_image *dom) +{ + int pages, extra_pages; + xen_vaddr_t try_virt_end; + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings; + + extra_pages = dom->alloc_bootstack ? 1 : 0; + extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */ + pages = extra_pages; + for ( ; ; ) + { + try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86, + bits_to_mask(22)); /* 4MB alignment */ + + if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) ) + return -1; + + pages = map->area.pgtables + extra_pages; + if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 ) + break; + } + map->area.pfn = 0; + domx86->n_mappings++; + dom->virt_pgtab_end = try_virt_end + 1; + + return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0, + map->area.pgtables * PAGE_SIZE_X86); +} + +/* ------------------------------------------------------------------------ */ +/* i386 pagetables */ + +static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom) +{ + static const struct xc_dom_params x86_32_params = { + .levels = PGTBL_LEVELS_I386, + .vaddr_mask = bits_to_mask(VIRT_BITS_I386), + .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED, + /* + * 64bit Xen runs 32bit PV guests with the PAE entries in an L3 + * pagetable. They don't behave exactly like native PAE paging. + */ + .lvl_prot[1 ... 
2] = + _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER, + }; + struct xc_dom_image_x86 *domx86 = dom->arch_private; + + domx86->params = &x86_32_params; + + return alloc_pgtables_pv(dom); +} + +#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86) +#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86)) + +/* + * Move the l3 page table page below 4G for guests which do not + * support the extended-cr3 format. The l3 is currently empty so we + * do not need to preserve the current contents. + */ +static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom, + xen_pfn_t l3pfn, + xen_pfn_t l3mfn) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + uint32_t *p2m_guest = domx86->p2m_guest; + xen_pfn_t new_l3mfn; + struct xc_mmu *mmu; + void *l3tab; + + mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid); + if ( mmu == NULL ) + { + DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__); + return l3mfn; + } + + xc_dom_unmap_one(dom, l3pfn); + + new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn); + if ( !new_l3mfn ) + goto out; + + p2m_guest[l3pfn] = dom->pv_p2m[l3pfn] = new_l3mfn; + + if ( xc_add_mmu_update(dom->xch, mmu, + (((unsigned long long)new_l3mfn) + << XC_DOM_PAGE_SHIFT(dom)) | + MMU_MACHPHYS_UPDATE, l3pfn) ) + goto out; + + if ( xc_flush_mmu_updates(dom->xch, mmu) ) + goto out; + + /* + * This ensures that the entire pgtables_seg is mapped by a single + * mmap region. arch_setup_bootlate() relies on this to be able to + * unmap and pin the pagetables. + */ + if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL ) + goto out; + + l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1); + if ( l3tab == NULL ) + { + DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL", + __FUNCTION__); + goto out; /* our one call site will call xc_dom_panic and fail */ + } + memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom)); + + DOMPRINTF("%s: successfully relocated L3 below 4G. 
" + "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")", + __FUNCTION__, l3pfn, l3mfn, new_l3mfn); + + l3mfn = new_l3mfn; + + out: + free(mmu); + + return l3mfn; +} + +static x86_pgentry_t *get_pg_table(struct xc_dom_image *dom, int m, int l) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map; + x86_pgentry_t *pg; + + map = domx86->maps + m; + pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0); + if ( pg ) + return pg; + + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: xc_dom_pfn_to_ptr failed", __FUNCTION__); + return NULL; +} + +static x86_pgentry_t get_pg_prot(struct xc_dom_image *dom, int l, xen_pfn_t pfn) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map; + xen_pfn_t pfn_s, pfn_e; + x86_pgentry_t prot; + unsigned m; + + prot = domx86->params->lvl_prot[l]; + if ( l > 0 ) + return prot; + + for ( m = 0; m < domx86->n_mappings; m++ ) + { + map = domx86->maps + m; + pfn_s = map->lvls[domx86->params->levels - 1].pfn; + pfn_e = map->area.pgtables + pfn_s; + if ( pfn >= pfn_s && pfn < pfn_e ) + return prot & ~_PAGE_RW; + } + + return prot; +} + +static int setup_pgtables_pv(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map1, *map2; + struct xc_dom_x86_mapping_lvl *lvl; + xen_vaddr_t from, to; + xen_pfn_t pfn, p, p_s, p_e; + x86_pgentry_t *pg; + unsigned m1, m2; + int l; + + for ( l = domx86->params->levels - 1; l >= 0; l-- ) + for ( m1 = 0; m1 < domx86->n_mappings; m1++ ) + { + map1 = domx86->maps + m1; + from = map1->lvls[l].from; + to = map1->lvls[l].to; + pg = get_pg_table(dom, m1, l); + if ( !pg ) + return -1; + for ( m2 = 0; m2 < domx86->n_mappings; m2++ ) + { + map2 = domx86->maps + m2; + lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area; + if ( l > 0 && lvl->pgtables == 0 ) + continue; + if ( lvl->from >= to || lvl->to <= from ) + continue; + p_s = (max(from, lvl->from) - from) >> + (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86); + p_e = (min(to, lvl->to) - from) >> + (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86); + pfn = ((max(from, lvl->from) - lvl->from) >> + (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) + lvl->pfn; + for ( p = p_s; p <= p_e; p++ ) + { + pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) | + get_pg_prot(dom, l, pfn); + pfn++; + } + } + } + + return 0; +} + +static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + uint32_t *p2m_guest = domx86->p2m_guest; + xen_pfn_t l3mfn, l3pfn, i; + + /* Copy dom->pv_p2m[] into the guest. */ + for ( i = 0; i < dom->p2m_size; ++i ) + { + if ( dom->pv_p2m[i] != INVALID_PFN ) + p2m_guest[i] = dom->pv_p2m[i]; + else + p2m_guest[i] = -1; + } + + l3pfn = domx86->maps[0].lvls[2].pfn; + l3mfn = xc_dom_p2m(dom, l3pfn); + if ( dom->parms.pae == XEN_PAE_YES ) + { + if ( l3mfn >= 0x100000 ) + l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn); + + if ( l3mfn >= 0x100000 ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,"%s: cannot move L3" + " below 4G. extended-cr3 not supported by guest. 
" + "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn")", + __FUNCTION__, l3pfn, l3mfn); + return -EINVAL; + } + } + + return setup_pgtables_pv(dom); +} + +/* ------------------------------------------------------------------------ */ +/* x86_64 pagetables */ + +static int alloc_pgtables_x86_64(struct xc_dom_image *dom) +{ + const static struct xc_dom_params x86_64_params = { + .levels = PGTBL_LEVELS_X86_64, + .vaddr_mask = bits_to_mask(VIRT_BITS_X86_64), + .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED, + .lvl_prot[1 ... 3] = + _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER, + }; + struct xc_dom_image_x86 *domx86 = dom->arch_private; + + domx86->params = &x86_64_params; + + return alloc_pgtables_pv(dom); +} + +static int setup_pgtables_x86_64(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + uint64_t *p2m_guest = domx86->p2m_guest; + xen_pfn_t i; + + /* Copy dom->pv_p2m[] into the guest. */ + for ( i = 0; i < dom->p2m_size; ++i ) + { + if ( dom->pv_p2m[i] != INVALID_PFN ) + p2m_guest[i] = dom->pv_p2m[i]; + else + p2m_guest[i] = -1; + } + + return setup_pgtables_pv(dom); +} + +/* ------------------------------------------------------------------------ */ + +static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + + if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach", + 0, p2m_alloc_size) ) + return -1; + + domx86->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg); + if ( domx86->p2m_guest == NULL ) + return -1; + + return 0; +} + +static int alloc_p2m_list_x86_32(struct xc_dom_image *dom) +{ + size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn; + + p2m_alloc_size = round_pg_up(p2m_alloc_size); + return alloc_p2m_list(dom, p2m_alloc_size); +} + +static int alloc_p2m_list_x86_64(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings; + size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn; + xen_vaddr_t from, to; + unsigned lvl; + + p2m_alloc_size = round_pg_up(p2m_alloc_size); + if ( dom->parms.p2m_base != UNSET_ADDR ) + { + from = dom->parms.p2m_base; + to = from + p2m_alloc_size - 1; + if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) ) + return -1; + + map->area.pfn = dom->pfn_alloc_end; + for ( lvl = 0; lvl < 4; lvl++ ) + map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86; + domx86->n_mappings++; + p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86; + } + + return alloc_p2m_list(dom, p2m_alloc_size); +} + +/* ------------------------------------------------------------------------ */ + +static int alloc_magic_pages_pv(struct xc_dom_image *dom) +{ + dom->start_info_pfn = xc_dom_alloc_page(dom, "start info"); + if ( dom->start_info_pfn == INVALID_PFN ) + return -1; + + dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore"); + if ( dom->xenstore_pfn == INVALID_PFN ) + return -1; + xc_clear_domain_page(dom->xch, dom->guest_domid, + xc_dom_p2m(dom, dom->xenstore_pfn)); + + dom->console_pfn = xc_dom_alloc_page(dom, "console"); + if ( dom->console_pfn == INVALID_PFN ) + return -1; + xc_clear_domain_page(dom->xch, dom->guest_domid, + xc_dom_p2m(dom, dom->console_pfn)); + + dom->alloc_bootstack = 1; + + return 0; +} + +static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom) +{ + struct hvm_info_table *hvm_info = (struct hvm_info_table *) + (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET); + uint8_t sum; + int i; 
+ + memset(hvm_info_page, 0, PAGE_SIZE); + + /* Fill in the header. */ + memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature)); + hvm_info->length = sizeof(struct hvm_info_table); + + /* Sensible defaults: these can be overridden by the caller. */ + hvm_info->apic_mode = 1; + hvm_info->nr_vcpus = 1; + memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online)); + + /* Memory parameters. */ + hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT; + hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT; + hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0); + + /* Finish with the checksum. */ + for ( i = 0, sum = 0; i < hvm_info->length; i++ ) + sum += ((uint8_t *)hvm_info)[i]; + hvm_info->checksum = -sum; +} + +static int alloc_magic_pages_hvm(struct xc_dom_image *dom) +{ + unsigned long i; + uint32_t *ident_pt, domid = dom->guest_domid; + int rc; + xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES]; + xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES]; + xc_interface *xch = dom->xch; + size_t start_info_size = sizeof(struct hvm_start_info); + + /* Allocate and clear special pages. */ + for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ ) + special_array[i] = special_pfn(i); + + rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES, + 0, 0, special_array); + if ( rc != 0 ) + { + DOMPRINTF("Could not allocate special pages."); + goto error_out; + } + + if ( xc_clear_domain_pages(xch, domid, special_pfn(0), + X86_HVM_NR_SPECIAL_PAGES) ) + goto error_out; + + xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN, + special_pfn(SPECIALPAGE_XENSTORE)); + xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN, + special_pfn(SPECIALPAGE_BUFIOREQ)); + xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN, + special_pfn(SPECIALPAGE_IOREQ)); + xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN, + special_pfn(SPECIALPAGE_CONSOLE)); + xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN, + special_pfn(SPECIALPAGE_PAGING)); + xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN, + special_pfn(SPECIALPAGE_ACCESS)); + xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN, + special_pfn(SPECIALPAGE_SHARING)); + + start_info_size += + sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; + + start_info_size += + HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT; + + start_info_size += + dom->e820_entries * sizeof(struct hvm_memmap_table_entry); + + if ( !dom->device_model ) + { + if ( dom->cmdline ) + { + dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8); + start_info_size += dom->cmdline_size; + } + } + else + { + /* + * Allocate and clear additional ioreq server pages. The default + * server will use the IOREQ and BUFIOREQ special pages above. 
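+ * The extra pages sit directly below the special page region, see ioreq_server_pfn() above.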
+ */ + for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ ) + ioreq_server_array[i] = ioreq_server_pfn(i); + + rc = xc_domain_populate_physmap_exact(xch, domid, NR_IOREQ_SERVER_PAGES, 0, + 0, ioreq_server_array); + if ( rc != 0 ) + { + DOMPRINTF("Could not allocate ioreq server pages."); + goto error_out; + } + + if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0), + NR_IOREQ_SERVER_PAGES) ) + goto error_out; + + /* Tell the domain where the pages are and how many there are */ + xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN, + ioreq_server_pfn(0)); + xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES, + NR_IOREQ_SERVER_PAGES); + } + + rc = xc_dom_alloc_segment(dom, &dom->start_info_seg, + "HVM start info", 0, start_info_size); + if ( rc != 0 ) + { + DOMPRINTF("Unable to reserve memory for the start info"); + goto out; + } + + /* + * Identity-map page table is required for running with CR0.PG=0 when + * using Intel EPT. Create a 32-bit non-PAE page directory of superpages. + */ + if ( (ident_pt = xc_map_foreign_range( + xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE, + special_pfn(SPECIALPAGE_IDENT_PT))) == NULL ) + goto error_out; + for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ ) + ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + munmap(ident_pt, PAGE_SIZE); + xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT, + special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT); + + dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE); + xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn); + + dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE); + xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn); + + dom->parms.virt_hypercall = -1; + + rc = 0; + goto out; + error_out: + rc = -1; + out: + + return rc; +} + +/* ------------------------------------------------------------------------ */ + +static int start_info_x86_32(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + start_info_x86_32_t *start_info = + xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1); + xen_pfn_t shinfo = + xc_dom_translated(dom) ? 
dom->shared_info_pfn : dom->shared_info_mfn; + + DOMPRINTF_CALLED(dom->xch); + + if ( start_info == NULL ) + { + DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__); + return -1; /* our caller throws away our return value :-/ */ + } + + memset(start_info, 0, sizeof(*start_info)); + strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic)); + start_info->magic[sizeof(start_info->magic) - 1] = '\0'; + start_info->nr_pages = dom->total_pages; + start_info->shared_info = shinfo << PAGE_SHIFT_X86; + start_info->pt_base = dom->pgtables_seg.vstart; + start_info->nr_pt_frames = domx86->maps[0].area.pgtables; + start_info->mfn_list = dom->p2m_seg.vstart; + + start_info->flags = dom->flags; + start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn); + start_info->store_evtchn = dom->xenstore_evtchn; + start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); + start_info->console.domU.evtchn = dom->console_evtchn; + + if ( dom->modules[0].blob ) + { + start_info->mod_start = dom->initrd_start; + start_info->mod_len = dom->initrd_len; + } + + if ( dom->cmdline ) + { + strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE); + start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0'; + } + + return 0; +} + +static int start_info_x86_64(struct xc_dom_image *dom) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + start_info_x86_64_t *start_info = + xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1); + xen_pfn_t shinfo = + xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn; + + DOMPRINTF_CALLED(dom->xch); + + if ( start_info == NULL ) + { + DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__); + return -1; /* our caller throws away our return value :-/ */ + } + + memset(start_info, 0, sizeof(*start_info)); + strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic)); + start_info->magic[sizeof(start_info->magic) - 1] = '\0'; + start_info->nr_pages = dom->total_pages; + start_info->shared_info = shinfo << PAGE_SHIFT_X86; + start_info->pt_base = dom->pgtables_seg.vstart; + start_info->nr_pt_frames = domx86->maps[0].area.pgtables; + start_info->mfn_list = dom->p2m_seg.vstart; + if ( dom->parms.p2m_base != UNSET_ADDR ) + { + start_info->first_p2m_pfn = dom->p2m_seg.pfn; + start_info->nr_p2m_frames = dom->p2m_seg.pages; + } + + start_info->flags = dom->flags; + start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn); + start_info->store_evtchn = dom->xenstore_evtchn; + start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); + start_info->console.domU.evtchn = dom->console_evtchn; + + if ( dom->modules[0].blob ) + { + start_info->mod_start = dom->initrd_start; + start_info->mod_len = dom->initrd_len; + } + + if ( dom->cmdline ) + { + strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE); + start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0'; + } + + return 0; +} + +static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr) +{ + shared_info_x86_32_t *shared_info = ptr; + int i; + + DOMPRINTF_CALLED(dom->xch); + + memset(shared_info, 0, sizeof(*shared_info)); + for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) + shared_info->vcpu_info[i].evtchn_upcall_mask = 1; + return 0; +} + +static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr) +{ + shared_info_x86_64_t *shared_info = ptr; + int i; + + DOMPRINTF_CALLED(dom->xch); + + memset(shared_info, 0, sizeof(*shared_info)); + for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) + shared_info->vcpu_info[i].evtchn_upcall_mask = 1; + 
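/* Upcalls remain masked until the guest clears evtchn_upcall_mask per vCPU. */ +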
return 0; +} + +/* ------------------------------------------------------------------------ */ + +static int vcpu_x86_32(struct xc_dom_image *dom) +{ + vcpu_guest_context_any_t any_ctx; + vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32; + xen_pfn_t cr3_pfn; + int rc; + + DOMPRINTF_CALLED(dom->xch); + + /* clear everything */ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->user_regs.eip = dom->parms.virt_entry; + ctxt->user_regs.esp = + dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86; + ctxt->user_regs.esi = + dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86; + ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */ + + ctxt->debugreg[6] = X86_DR6_DEFAULT; + ctxt->debugreg[7] = X86_DR7_DEFAULT; + + ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32; + if ( dom->parms.pae == XEN_PAE_EXTCR3 || + dom->parms.pae == XEN_PAE_BIMODAL ) + ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3); + + cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn); + ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn); + DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "", + __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn); + + ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32; + ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32; + ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32; + ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32; + ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32; + ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32; + + ctxt->kernel_ss = ctxt->user_regs.ss; + ctxt->kernel_sp = ctxt->user_regs.esp; + + rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); + + return rc; +} + +static int vcpu_x86_64(struct xc_dom_image *dom) +{ + vcpu_guest_context_any_t any_ctx; + vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64; + xen_pfn_t cr3_pfn; + int rc; + + DOMPRINTF_CALLED(dom->xch); + + /* clear everything */ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->user_regs.rip = dom->parms.virt_entry; + ctxt->user_regs.rsp = + dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86; + ctxt->user_regs.rsi = + dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86; + ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */ + + ctxt->debugreg[6] = X86_DR6_DEFAULT; + ctxt->debugreg[7] = X86_DR7_DEFAULT; + + ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64; + cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn); + ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn); + DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "", + __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn); + + ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64; + ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64; + ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64; + ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64; + ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64; + ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64; + + ctxt->kernel_ss = ctxt->user_regs.ss; + ctxt->kernel_sp = ctxt->user_regs.esp; + + rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); + + return rc; +} + +const static void *hvm_get_save_record(const void *ctx, unsigned int type, + unsigned int instance) +{ + const struct hvm_save_descriptor *header; + + for ( header = ctx; + header->typecode != HVM_SAVE_CODE(END); + ctx += sizeof(*header) + header->length, header = ctx ) + if ( header->typecode == type && header->instance == instance ) + return ctx + 
sizeof(*header); + + return NULL; +} + +static int vcpu_hvm(struct xc_dom_image *dom) +{ + struct { + struct hvm_save_descriptor header_d; + HVM_SAVE_TYPE(HEADER) header; + struct hvm_save_descriptor cpu_d; + HVM_SAVE_TYPE(CPU) cpu; + struct hvm_save_descriptor end_d; + HVM_SAVE_TYPE(END) end; + } bsp_ctx; + uint8_t *full_ctx = NULL; + int rc; + + DOMPRINTF_CALLED(dom->xch); + + assert(dom->max_vcpus); + + /* + * Get the full HVM context in order to have the header, it is not + * possible to get the header with getcontext_partial, and crafting one + * from userspace is also not an option since cpuid is trapped and + * modified by Xen. + */ + + rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0); + if ( rc <= 0 ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: unable to fetch HVM context size (rc=%d)", + __func__, rc); + goto out; + } + + full_ctx = calloc(1, rc); + if ( full_ctx == NULL ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: unable to allocate memory for HVM context (rc=%d)", + __func__, rc); + rc = -ENOMEM; + goto out; + } + + rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc); + if ( rc <= 0 ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: unable to fetch HVM context (rc=%d)", + __func__, rc); + goto out; + } + + /* Copy the header to our partial context. */ + memset(&bsp_ctx, 0, sizeof(bsp_ctx)); + memcpy(&bsp_ctx, full_ctx, + sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER)); + + /* Set the CPU descriptor. */ + bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU); + bsp_ctx.cpu_d.instance = 0; + bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU); + + /* Set the cached part of the relevant segment registers. */ + bsp_ctx.cpu.cs_base = 0; + bsp_ctx.cpu.ds_base = 0; + bsp_ctx.cpu.es_base = 0; + bsp_ctx.cpu.ss_base = 0; + bsp_ctx.cpu.tr_base = 0; + bsp_ctx.cpu.cs_limit = ~0u; + bsp_ctx.cpu.ds_limit = ~0u; + bsp_ctx.cpu.es_limit = ~0u; + bsp_ctx.cpu.ss_limit = ~0u; + bsp_ctx.cpu.tr_limit = 0x67; + bsp_ctx.cpu.cs_arbytes = 0xc9b; + bsp_ctx.cpu.ds_arbytes = 0xc93; + bsp_ctx.cpu.es_arbytes = 0xc93; + bsp_ctx.cpu.ss_arbytes = 0xc93; + bsp_ctx.cpu.tr_arbytes = 0x8b; + + /* Set the control registers. */ + bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET; + + /* Set the IP. */ + bsp_ctx.cpu.rip = dom->parms.phys_entry; + + bsp_ctx.cpu.dr6 = X86_DR6_DEFAULT; + bsp_ctx.cpu.dr7 = X86_DR7_DEFAULT; + + if ( dom->start_info_seg.pfn ) + bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT; + + /* Set the end descriptor. */ + bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END); + bsp_ctx.end_d.instance = 0; + bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END); + + /* TODO: maybe this should be a firmware option instead? */ + if ( !dom->device_model ) + { + struct { + struct hvm_save_descriptor header_d; + HVM_SAVE_TYPE(HEADER) header; + struct hvm_save_descriptor mtrr_d; + HVM_SAVE_TYPE(MTRR) mtrr; + struct hvm_save_descriptor end_d; + HVM_SAVE_TYPE(END) end; + } mtrr = { + .header_d = bsp_ctx.header_d, + .header = bsp_ctx.header, + .mtrr_d.typecode = HVM_SAVE_CODE(MTRR), + .mtrr_d.length = HVM_SAVE_LENGTH(MTRR), + .end_d = bsp_ctx.end_d, + .end = bsp_ctx.end, + }; + const HVM_SAVE_TYPE(MTRR) *mtrr_record = + hvm_get_save_record(full_ctx, HVM_SAVE_CODE(MTRR), 0); + unsigned int i; + + if ( !mtrr_record ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: unable to get MTRR save record", __func__); + goto out; + } + + memcpy(&mtrr.mtrr, mtrr_record, sizeof(mtrr.mtrr)); + + /* + * Enable MTRR, set default type to WB. 
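+ * Without a valid MTRR state such a guest would otherwise start with all of its memory treated as uncacheable.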
+ * TODO: add MMIO areas as UC when passthrough is supported. + */ + mtrr.mtrr.msr_mtrr_def_type = MTRR_TYPE_WRBACK | MTRR_DEF_TYPE_ENABLE; + + for ( i = 0; i < dom->max_vcpus; i++ ) + { + mtrr.mtrr_d.instance = i; + rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid, + (uint8_t *)&mtrr, sizeof(mtrr)); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc); + } + } + + /* + * Loading the BSP context should be done in the last call to setcontext, + * since each setcontext call will put all vCPUs down. + */ + rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid, + (uint8_t *)&bsp_ctx, sizeof(bsp_ctx)); + if ( rc != 0 ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc); + + out: + free(full_ctx); + return rc; +} + +/* ------------------------------------------------------------------------ */ + +static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type) +{ + static const struct { + char *guest; + uint32_t size; + } types[] = { + { "xen-3.0-x86_32p", 32 }, + { "xen-3.0-x86_64", 64 }, + }; + DECLARE_DOMCTL; + int i,rc; + + memset(&domctl, 0, sizeof(domctl)); + domctl.domain = domid; + domctl.cmd = XEN_DOMCTL_set_address_size; + for ( i = 0; i < ARRAY_SIZE(types); i++ ) + if ( !strcmp(types[i].guest, guest_type) ) + domctl.u.address_size.size = types[i].size; + if ( domctl.u.address_size.size == 0 ) + /* nothing to do */ + return 0; + + xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__, + guest_type, domctl.u.address_size.size); + rc = do_domctl(xch, &domctl); + if ( rc != 0 ) + xc_dom_printf(xch, "%s: warning: failed (rc=%d)", + __FUNCTION__, rc); + return rc; +} + +static int meminit_pv(struct xc_dom_image *dom) +{ + int rc; + xen_pfn_t pfn, allocsz, mfn, total, pfn_base; + int i, j, k; + xen_vmemrange_t dummy_vmemrange[1]; + unsigned int dummy_vnode_to_pnode[1]; + xen_vmemrange_t *vmemranges; + unsigned int *vnode_to_pnode; + unsigned int nr_vmemranges, nr_vnodes; + + rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type); + if ( rc ) + return rc; + + /* try to claim pages for early warning of insufficient memory avail */ + if ( dom->claim_enabled ) + { + rc = xc_domain_claim_pages(dom->xch, dom->guest_domid, + dom->total_pages); + if ( rc ) + return rc; + } + + /* Setup dummy vNUMA information if it's not provided. Note + * that this is a valid state if libxl doesn't provide any + * vNUMA information. + * + * The dummy values make libxc allocate all pages from + * arbitrary physical nodes. This is the expected behaviour if + * no vNUMA configuration is provided to libxc. + * + * Note that the following hunk is just for the convenience of + * allocation code. No defaulting happens in libxc. 
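The dummy layout is a single vmemrange [0, total_pages << PAGE_SHIFT) on vnode 0, with the vnode mapped to XC_NUMA_NO_NODE.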
*/ + if ( dom->nr_vmemranges == 0 ) + { + nr_vmemranges = 1; + vmemranges = dummy_vmemrange; + vmemranges[0].start = 0; + vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT; + vmemranges[0].flags = 0; + vmemranges[0].nid = 0; + + nr_vnodes = 1; + vnode_to_pnode = dummy_vnode_to_pnode; + vnode_to_pnode[0] = XC_NUMA_NO_NODE; + } + else + { + nr_vmemranges = dom->nr_vmemranges; + nr_vnodes = dom->nr_vnodes; + vmemranges = dom->vmemranges; + vnode_to_pnode = dom->vnode_to_pnode; + } + + total = dom->p2m_size = 0; + for ( i = 0; i < nr_vmemranges; i++ ) + { + total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT); + dom->p2m_size = max(dom->p2m_size, + (xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT)); + } + if ( total != dom->total_pages ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")", + __func__, total, dom->total_pages); + return -EINVAL; + } + + dom->pv_p2m = xc_dom_malloc(dom, sizeof(*dom->pv_p2m) * dom->p2m_size); + if ( dom->pv_p2m == NULL ) + return -EINVAL; + for ( pfn = 0; pfn < dom->p2m_size; pfn++ ) + dom->pv_p2m[pfn] = INVALID_PFN; + + /* allocate guest memory */ + for ( i = 0; i < nr_vmemranges; i++ ) + { + unsigned int memflags; + uint64_t pages, super_pages; + unsigned int pnode = vnode_to_pnode[vmemranges[i].nid]; + xen_pfn_t extents[SUPERPAGE_BATCH_SIZE]; + xen_pfn_t pfn_base_idx; + + memflags = 0; + if ( pnode != XC_NUMA_NO_NODE ) + memflags |= XENMEMF_exact_node(pnode); + + pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT; + super_pages = pages >> SUPERPAGE_2MB_SHIFT; + pfn_base = vmemranges[i].start >> PAGE_SHIFT; + + for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ ) + dom->pv_p2m[pfn] = pfn; + + pfn_base_idx = pfn_base; + while ( super_pages ) { + uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE); + super_pages -= count; + + for ( pfn = pfn_base_idx, j = 0; + pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT); + pfn += SUPERPAGE_2MB_NR_PFNS, j++ ) + extents[j] = dom->pv_p2m[pfn]; + rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count, + SUPERPAGE_2MB_SHIFT, memflags, + extents); + if ( rc < 0 ) + return rc; + + /* Expand the returned mfns into the p2m array. */ + pfn = pfn_base_idx; + for ( j = 0; j < rc; j++ ) + { + mfn = extents[j]; + for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ ) + dom->pv_p2m[pfn] = mfn + k; + } + pfn_base_idx = pfn; + } + + for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz ) + { + allocsz = min_t(uint64_t, 1024 * 1024, pages - j); + rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid, + allocsz, 0, memflags, &dom->pv_p2m[pfn_base + j]); + + if ( rc ) + { + if ( pnode != XC_NUMA_NO_NODE ) + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)", + __func__, pages, i, pnode); + else + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: failed to allocate 0x%"PRIx64" pages", + __func__, pages); + return rc; + } + } + rc = 0; + } + + /* Ensure no unclaimed pages are left unused. + * OK to call even if the earlier claim call was not made. */ + xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */); + + return rc; +} + +/* + * Check whether an MMIO hole overlaps the specified memory range. + * Returns 1 if so, else 0.
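For example, a 1GB range starting at 0xc0000000 overlaps an MMIO hole at mmio_start 0xf0000000, so it cannot be backed by a single 1GB superpage.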
*/ +static int check_mmio_hole(uint64_t start, uint64_t memsize, + uint64_t mmio_start, uint64_t mmio_size) +{ + if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size ) + return 0; + else + return 1; +} + +static int meminit_hvm(struct xc_dom_image *dom) +{ + unsigned long i, vmemid, nr_pages = dom->total_pages; + unsigned long p2m_size; + unsigned long target_pages = dom->target_pages; + unsigned long cur_pages, cur_pfn; + int rc; + unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, + stat_1gb_pages = 0; + unsigned int memflags = 0; + int claim_enabled = dom->claim_enabled; + uint64_t total_pages; + xen_vmemrange_t dummy_vmemrange[2]; + unsigned int dummy_vnode_to_pnode[1]; + xen_vmemrange_t *vmemranges; + unsigned int *vnode_to_pnode; + unsigned int nr_vmemranges, nr_vnodes; + xc_interface *xch = dom->xch; + uint32_t domid = dom->guest_domid; + + if ( nr_pages > target_pages ) + memflags |= XENMEMF_populate_on_demand; + + if ( dom->nr_vmemranges == 0 ) + { + /* Build dummy vnode information + * + * Guest physical address space layout: + * [0, hole_start) [hole_start, 4G) [4G, highmem_end) + * + * Of course if there is no high memory, the second vmemrange + * has no effect on the actual result. + */ + + dummy_vmemrange[0].start = 0; + dummy_vmemrange[0].end = dom->lowmem_end; + dummy_vmemrange[0].flags = 0; + dummy_vmemrange[0].nid = 0; + nr_vmemranges = 1; + + if ( dom->highmem_end > (1ULL << 32) ) + { + dummy_vmemrange[1].start = 1ULL << 32; + dummy_vmemrange[1].end = dom->highmem_end; + dummy_vmemrange[1].flags = 0; + dummy_vmemrange[1].nid = 0; + + nr_vmemranges++; + } + + dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE; + nr_vnodes = 1; + vmemranges = dummy_vmemrange; + vnode_to_pnode = dummy_vnode_to_pnode; + } + else + { + if ( nr_pages > target_pages ) + { + DOMPRINTF("Cannot enable vNUMA and PoD at the same time"); + goto error_out; + } + + nr_vmemranges = dom->nr_vmemranges; + nr_vnodes = dom->nr_vnodes; + vmemranges = dom->vmemranges; + vnode_to_pnode = dom->vnode_to_pnode; + } + + total_pages = 0; + p2m_size = 0; + for ( i = 0; i < nr_vmemranges; i++ ) + { + DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64, vmemranges[i].start, vmemranges[i].end); + + total_pages += ((vmemranges[i].end - vmemranges[i].start) + >> PAGE_SHIFT); + p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ? + p2m_size : (vmemranges[i].end >> PAGE_SHIFT); + } + + if ( total_pages != nr_pages ) + { + DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)", + total_pages, nr_pages); + goto error_out; + } + + dom->p2m_size = p2m_size; + + /* + * Try to claim pages for early warning of insufficient memory available. + * This should go before xc_domain_set_pod_target, because that function + * actually allocates memory for the guest. Claiming after memory has been + * allocated is pointless. + */ + if ( claim_enabled ) { + rc = xc_domain_claim_pages(xch, domid, + target_pages - dom->vga_hole_size); + if ( rc != 0 ) + { + DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!"); + goto error_out; + } + } + + if ( memflags & XENMEMF_populate_on_demand ) + { + /* + * Subtract VGA_HOLE_SIZE from target_pages for the VGA + * "hole". Xen will adjust the PoD cache size so that domain + * tot_pages will be target_pages - VGA_HOLE_SIZE after + * this call.
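+ * Guest pages beyond the target are then populated on demand from the PoD cache on first access.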
+ */ + rc = xc_domain_set_pod_target(xch, domid, + target_pages - dom->vga_hole_size, + NULL, NULL, NULL); + if ( rc != 0 ) + { + DOMPRINTF("Could not set PoD target for HVM guest.\n"); + goto error_out; + } + } + + /* + * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. + * + * We attempt to allocate 1GB pages if possible. It falls back on 2MB + * pages if 1GB allocation fails. 4KB pages will be used eventually if + * both fail. + */ + if ( dom->device_model ) + { + xen_pfn_t extents[0xa0]; + + for ( i = 0; i < ARRAY_SIZE(extents); ++i ) + extents[i] = i; + + rc = xc_domain_populate_physmap_exact( + xch, domid, 0xa0, 0, memflags, extents); + if ( rc != 0 ) + { + DOMPRINTF("Could not populate low memory (< 0xA0).\n"); + goto error_out; + } + } + + stat_normal_pages = 0; + for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ ) + { + unsigned int new_memflags = memflags; + uint64_t end_pages; + unsigned int vnode = vmemranges[vmemid].nid; + unsigned int pnode = vnode_to_pnode[vnode]; + + if ( pnode != XC_NUMA_NO_NODE ) + new_memflags |= XENMEMF_exact_node(pnode); + + end_pages = vmemranges[vmemid].end >> PAGE_SHIFT; + /* + * Consider the VGA hole to belong to the vmemrange that covers + * 0xA0000-0xC0000. Note that 0x00000-0xA0000 is populated just + * before this loop. + */ + if ( vmemranges[vmemid].start == 0 && dom->device_model ) + { + cur_pages = 0xc0; + stat_normal_pages += 0xc0; + } + else + cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT; + + rc = 0; + while ( (rc == 0) && (end_pages > cur_pages) ) + { + /* Clip count to maximum 1GB extent. */ + unsigned long count = end_pages - cur_pages; + unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS; + + if ( count > max_pages ) + count = max_pages; + + cur_pfn = cur_pages; + + /* Take care of the corner cases of super page tails */ + if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) && + (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) ) + count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1); + else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) && + (count > SUPERPAGE_1GB_NR_PFNS) ) + count &= ~(SUPERPAGE_1GB_NR_PFNS - 1); + + /* Attempt to allocate 1GB super page. Because in each pass + * we only allocate at most 1GB, we don't have to clip + * super page boundaries. + */ + if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 && + /* Check if there exists MMIO hole in the 1GB memory + * range */ + !check_mmio_hole(cur_pfn << PAGE_SHIFT, + SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT, + dom->mmio_start, dom->mmio_size) ) + { + long done; + unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT; + xen_pfn_t sp_extents[nr_extents]; + + for ( i = 0; i < nr_extents; i++ ) + sp_extents[i] = cur_pages + (i << SUPERPAGE_1GB_SHIFT); + + done = xc_domain_populate_physmap(xch, domid, nr_extents, + SUPERPAGE_1GB_SHIFT, + new_memflags, sp_extents); + + if ( done > 0 ) + { + stat_1gb_pages += done; + done <<= SUPERPAGE_1GB_SHIFT; + cur_pages += done; + count -= done; + } + } + + if ( count != 0 ) + { + /* Clip count to maximum 8MB extent. */ + max_pages = SUPERPAGE_2MB_NR_PFNS * 4; + if ( count > max_pages ) + count = max_pages; + + /* Clip partial superpage extents to superpage + * boundaries. */ + if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) && + (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) ) + count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1); + else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) && + (count > SUPERPAGE_2MB_NR_PFNS) ) + count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */ + + /* Attempt to allocate superpage extents.
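This path is only taken when cur_pfn and count are both 2MB aligned; any unaligned remainder falls through to the 4kB case below.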
*/ + if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 ) + { + long done; + unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT; + xen_pfn_t sp_extents[nr_extents]; + + for ( i = 0; i < nr_extents; i++ ) + sp_extents[i] = cur_pages + (i << SUPERPAGE_2MB_SHIFT); + + done = xc_domain_populate_physmap(xch, domid, nr_extents, + SUPERPAGE_2MB_SHIFT, + new_memflags, sp_extents); + + if ( done > 0 ) + { + stat_2mb_pages += done; + done <<= SUPERPAGE_2MB_SHIFT; + cur_pages += done; + count -= done; + } + } + } + + /* Fall back to 4kB extents. */ + if ( count != 0 ) + { + xen_pfn_t extents[count]; + + for ( i = 0; i < count; ++i ) + extents[i] = cur_pages + i; + + rc = xc_domain_populate_physmap_exact( + xch, domid, count, 0, new_memflags, extents); + cur_pages += count; + stat_normal_pages += count; + } + } + + if ( rc != 0 ) + { + DOMPRINTF("Could not allocate memory for HVM guest."); + goto error_out; + } + } + + DPRINTF("PHYSICAL MEMORY ALLOCATION:\n"); + DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages); + DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages); + DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages); + + rc = 0; + goto out; + error_out: + rc = -1; + out: + + /* ensure no unclaimed pages are left unused */ + xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */); + + return rc; +} + +/* ------------------------------------------------------------------------ */ + +static int bootearly(struct xc_dom_image *dom) +{ + if ( dom->container_type == XC_DOM_PV_CONTAINER && + elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active) ) + { + DOMPRINTF("PV Autotranslate guests no longer supported"); + errno = EOPNOTSUPP; + return -1; + } + + return 0; +} + +static int bootlate_pv(struct xc_dom_image *dom) +{ + static const struct { + char *guest; + unsigned long pgd_type; + } types[] = { + { "xen-3.0-x86_32", MMUEXT_PIN_L2_TABLE}, + { "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE}, + { "xen-3.0-x86_64", MMUEXT_PIN_L4_TABLE}, + }; + unsigned long pgd_type = 0; + shared_info_t *shared_info; + xen_pfn_t shinfo; + int i, rc; + + for ( i = 0; i < ARRAY_SIZE(types); i++ ) + if ( !strcmp(types[i].guest, dom->guest_type) ) + pgd_type = types[i].pgd_type; + + /* Drop references to all initial page tables before pinning. 
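+     * Pinning a frame as a page table fails while writable mappings of
+     * it remain, which is why the builder's own mappings of the page
+     * table and P2M segments are torn down first.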
*/ + xc_dom_unmap_one(dom, dom->pgtables_seg.pfn); + xc_dom_unmap_one(dom, dom->p2m_seg.pfn); + rc = pin_table(dom->xch, pgd_type, + xc_dom_p2m(dom, dom->pgtables_seg.pfn), + dom->guest_domid); + if ( rc != 0 ) + { + xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, + "%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)", + __FUNCTION__, dom->pgtables_seg.pfn, rc); + return rc; + } + shinfo = dom->shared_info_mfn; + + /* setup shared_info page */ + DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "", + __FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn); + shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid, + PAGE_SIZE_X86, + PROT_READ | PROT_WRITE, + shinfo); + if ( shared_info == NULL ) + return -1; + dom->arch_hooks->shared_info(dom, shared_info); + munmap(shared_info, PAGE_SIZE_X86); + + return 0; +} + +/* + * The memory layout of the start_info page and the modules, and where the + * addresses are stored: + * + * /----------------------------------\ + * | struct hvm_start_info | + * +----------------------------------+ <- start_info->modlist_paddr + * | struct hvm_modlist_entry[0] | + * +----------------------------------+ + * | struct hvm_modlist_entry[1] | + * +----------------------------------+ <- modlist[0].cmdline_paddr + * | cmdline of module 0 | + * | char[HVMLOADER_MODULE_NAME_SIZE] | + * +----------------------------------+ <- modlist[1].cmdline_paddr + * | cmdline of module 1 | + * +----------------------------------+ + */ +static void add_module_to_list(struct xc_dom_image *dom, + struct xc_hvm_firmware_module *module, + const char *cmdline, + struct hvm_modlist_entry *modlist, + struct hvm_start_info *start_info) +{ + uint32_t index = start_info->nr_modules; + void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT; + uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + + ((uintptr_t)modlist - (uintptr_t)start_info); + uint64_t modules_cmdline_paddr = modlist_paddr + + sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; + + if ( module->length == 0 ) + return; + + assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT); + + modlist[index].paddr = module->guest_addr_out; + modlist[index].size = module->length; + + if ( cmdline ) + { + assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE) + < HVMLOADER_MODULE_CMDLINE_SIZE); + strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index, + cmdline, HVMLOADER_MODULE_CMDLINE_SIZE); + modlist[index].cmdline_paddr = modules_cmdline_paddr + + HVMLOADER_MODULE_CMDLINE_SIZE * index; + } + + start_info->nr_modules++; +} + +static int bootlate_hvm(struct xc_dom_image *dom) +{ + uint32_t domid = dom->guest_domid; + xc_interface *xch = dom->xch; + struct hvm_start_info *start_info; + size_t modsize; + struct hvm_modlist_entry *modlist; + struct hvm_memmap_table_entry *memmap; + unsigned int i; + + start_info = xc_map_foreign_range(xch, domid, dom->start_info_seg.pages << + XC_DOM_PAGE_SHIFT(dom), + PROT_READ | PROT_WRITE, + dom->start_info_seg.pfn); + if ( start_info == NULL ) + { + DOMPRINTF("Unable to map HVM start info page"); + return -1; + } + + modlist = (void*)(start_info + 1) + dom->cmdline_size; + + if ( !dom->device_model ) + { + if ( dom->cmdline ) + { + char *cmdline = (void*)(start_info + 1); + + strncpy(cmdline, dom->cmdline, dom->cmdline_size); + start_info->cmdline_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + + ((uintptr_t)cmdline - (uintptr_t)start_info); + } + + /* ACPI module 0 is the RSDP */ + start_info->rsdp_paddr = 
dom->acpi_modules[0].guest_addr_out ? : 0; + } + else + { + add_module_to_list(dom, &dom->system_firmware_module, "firmware", + modlist, start_info); + } + + for ( i = 0; i < dom->num_modules; i++ ) + { + struct xc_hvm_firmware_module mod; + uint64_t base = dom->parms.virt_base != UNSET_ADDR ? + dom->parms.virt_base : 0; + + mod.guest_addr_out = + dom->modules[i].seg.vstart - base; + mod.length = + dom->modules[i].seg.vend - dom->modules[i].seg.vstart; + + DOMPRINTF("Adding module %u guest_addr %"PRIx64" len %u", + i, mod.guest_addr_out, mod.length); + + add_module_to_list(dom, &mod, dom->modules[i].cmdline, + modlist, start_info); + } + + if ( start_info->nr_modules ) + { + start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + + ((uintptr_t)modlist - (uintptr_t)start_info); + } + + /* + * Check a couple of XEN_HVM_MEMMAP_TYPEs to verify consistency with + * their corresponding e820 numerical values. + */ + BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_RAM != E820_RAM); + BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_ACPI != E820_ACPI); + + modsize = HVMLOADER_MODULE_MAX_COUNT * + (sizeof(*modlist) + HVMLOADER_MODULE_CMDLINE_SIZE); + memmap = (void*)modlist + modsize; + + start_info->memmap_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + + ((uintptr_t)modlist - (uintptr_t)start_info) + modsize; + start_info->memmap_entries = dom->e820_entries; + for ( i = 0; i < dom->e820_entries; i++ ) + { + memmap[i].addr = dom->e820[i].addr; + memmap[i].size = dom->e820[i].size; + memmap[i].type = dom->e820[i].type; + } + + start_info->magic = XEN_HVM_START_MAGIC_VALUE; + start_info->version = 1; + + munmap(start_info, dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom)); + + if ( dom->device_model ) + { + void *hvm_info_page; + + if ( (hvm_info_page = xc_map_foreign_range( + xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE, + HVM_INFO_PFN)) == NULL ) + return -1; + build_hvm_info(hvm_info_page, dom); + munmap(hvm_info_page, PAGE_SIZE); + } + + return 0; +} + +bool xc_dom_translated(const struct xc_dom_image *dom) +{ + /* HVM guests are translated. PV guests are not. 
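+     * "Translated" means Xen maintains the guest-physical to machine
+     * mapping on the guest's behalf, so callers here deal in gfns; PV
+     * toolstacks see raw machine frames.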
*/ + return dom->container_type == XC_DOM_HVM_CONTAINER; +} + +/* ------------------------------------------------------------------------ */ + +static struct xc_dom_arch xc_dom_32_pae = { + .guest_type = "xen-3.0-x86_32p", + .native_protocol = XEN_IO_PROTO_ABI_X86_32, + .page_shift = PAGE_SHIFT_X86, + .sizeof_pfn = 4, + .p2m_base_supported = 0, + .arch_private_size = sizeof(struct xc_dom_image_x86), + .alloc_magic_pages = alloc_magic_pages_pv, + .alloc_pgtables = alloc_pgtables_x86_32_pae, + .alloc_p2m_list = alloc_p2m_list_x86_32, + .setup_pgtables = setup_pgtables_x86_32_pae, + .start_info = start_info_x86_32, + .shared_info = shared_info_x86_32, + .vcpu = vcpu_x86_32, + .meminit = meminit_pv, + .bootearly = bootearly, + .bootlate = bootlate_pv, +}; + +static struct xc_dom_arch xc_dom_64 = { + .guest_type = "xen-3.0-x86_64", + .native_protocol = XEN_IO_PROTO_ABI_X86_64, + .page_shift = PAGE_SHIFT_X86, + .sizeof_pfn = 8, + .p2m_base_supported = 1, + .arch_private_size = sizeof(struct xc_dom_image_x86), + .alloc_magic_pages = alloc_magic_pages_pv, + .alloc_pgtables = alloc_pgtables_x86_64, + .alloc_p2m_list = alloc_p2m_list_x86_64, + .setup_pgtables = setup_pgtables_x86_64, + .start_info = start_info_x86_64, + .shared_info = shared_info_x86_64, + .vcpu = vcpu_x86_64, + .meminit = meminit_pv, + .bootearly = bootearly, + .bootlate = bootlate_pv, +}; + +static struct xc_dom_arch xc_hvm_32 = { + .guest_type = "hvm-3.0-x86_32", + .native_protocol = XEN_IO_PROTO_ABI_X86_32, + .page_shift = PAGE_SHIFT_X86, + .sizeof_pfn = 4, + .alloc_magic_pages = alloc_magic_pages_hvm, + .vcpu = vcpu_hvm, + .meminit = meminit_hvm, + .bootearly = bootearly, + .bootlate = bootlate_hvm, +}; + +static void __init register_arch_hooks(void) +{ + xc_dom_register_arch_hooks(&xc_dom_32_pae); + xc_dom_register_arch_hooks(&xc_dom_64); + xc_dom_register_arch_hooks(&xc_hvm_32); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_domain.c b/tools/libs/guest/xg_domain.c new file mode 100644 index 0000000000..58713cd35d --- /dev/null +++ b/tools/libs/guest/xg_domain.c @@ -0,0 +1,149 @@ +/****************************************************************************** + * xg_domain.c + * + * API for manipulating and obtaining information on domains. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + * + * Copyright (c) 2003, K A Fraser. 
+ */
+
+#include "xg_private.h"
+#include "xc_core.h"
+
+int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *minfo)
+{
+    struct domain_info_context _di = { .guest_width = minfo->guest_width,
+                                       .p2m_size = minfo->p2m_size};
+    struct domain_info_context *dinfo = &_di;
+
+    free(minfo->pfn_type);
+    if ( minfo->p2m_table )
+        munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
+    minfo->p2m_table = NULL;
+
+    return 0;
+}
+
+int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid,
+                          struct xc_domain_meminfo *minfo)
+{
+    struct domain_info_context _di;
+    struct domain_info_context *dinfo = &_di;
+
+    xc_dominfo_t info;
+    shared_info_any_t *live_shinfo;
+    xen_capabilities_info_t xen_caps = "";
+    int i;
+
+    /* Must only be initialized once */
+    if ( minfo->pfn_type || minfo->p2m_table )
+    {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
+    {
+        PERROR("Could not get domain info");
+        return -1;
+    }
+
+    if ( xc_domain_get_guest_width(xch, domid, &minfo->guest_width) )
+    {
+        PERROR("Could not get domain address size");
+        return -1;
+    }
+    _di.guest_width = minfo->guest_width;
+
+    /* Get page table levels (see get_platform_info() in xg_save_restore.h) */
+    if ( xc_version(xch, XENVER_capabilities, &xen_caps) )
+    {
+        PERROR("Could not get Xen capabilities (for page table levels)");
+        return -1;
+    }
+    if ( strstr(xen_caps, "xen-3.0-x86_64") )
+        /* Depends on whether it's a compat 32-on-64 guest */
+        minfo->pt_levels = ( (minfo->guest_width == 8) ? 4 : 3 );
+    else if ( strstr(xen_caps, "xen-3.0-x86_32p") )
+        minfo->pt_levels = 3;
+    else if ( strstr(xen_caps, "xen-3.0-x86_32") )
+        minfo->pt_levels = 2;
+    else
+    {
+        errno = EFAULT;
+        return -1;
+    }
+
+    /* We need the shared info page for mapping the P2M */
+    live_shinfo = xc_map_foreign_range(xch, domid, PAGE_SIZE, PROT_READ,
+                                       info.shared_info_frame);
+    if ( !live_shinfo )
+    {
+        PERROR("Could not map the shared info frame (MFN 0x%lx)",
+               info.shared_info_frame);
+        return -1;
+    }
+
+    if ( xc_core_arch_map_p2m_writable(xch, minfo->guest_width, &info,
+                                       live_shinfo, &minfo->p2m_table,
+                                       &minfo->p2m_size) )
+    {
+        PERROR("Could not map the P2M table");
+        munmap(live_shinfo, PAGE_SIZE);
+        return -1;
+    }
+    munmap(live_shinfo, PAGE_SIZE);
+    _di.p2m_size = minfo->p2m_size;
+
+    /* Make space and prepare for getting the PFN types */
+    minfo->pfn_type = calloc(sizeof(*minfo->pfn_type), minfo->p2m_size);
+    if ( !minfo->pfn_type )
+    {
+        PERROR("Could not allocate memory for the PFN types");
+        goto failed;
+    }
+    for ( i = 0; i < minfo->p2m_size; i++ )
+        minfo->pfn_type[i] = xc_pfn_to_mfn(i, minfo->p2m_table,
+                                           minfo->guest_width);
+
+    /* Retrieve PFN types in batches */
+    for ( i = 0; i < minfo->p2m_size ; i+=1024 )
+    {
+        int count = ((minfo->p2m_size - i ) > 1024 ) ?
+            1024: (minfo->p2m_size - i);
+
+        if ( xc_get_pfn_type_batch(xch, domid, count, minfo->pfn_type + i) )
+        {
+            PERROR("Could not get %d-th batch of PFN types", (i+1)/1024);
+            goto failed;
+        }
+    }
+
+    return 0;
+
+failed:
+    if ( minfo->pfn_type )
+    {
+        free(minfo->pfn_type);
+        minfo->pfn_type = NULL;
+    }
+    if ( minfo->p2m_table )
+    {
+        munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
+        minfo->p2m_table = NULL;
+    }
+
+    return -1;
+}
diff --git a/tools/libs/guest/xg_nomigrate.c b/tools/libs/guest/xg_nomigrate.c
new file mode 100644
index 0000000000..6795c62ddc
--- /dev/null
+++ b/tools/libs/guest/xg_nomigrate.c
@@ -0,0 +1,50 @@
+/******************************************************************************
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see .
+ *
+ * Copyright (c) 2011, Citrix Systems
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t flags,
+                   struct save_callbacks *callbacks,
+                   xc_stream_type_t stream_type, int recv_fd)
+{
+    errno = ENOSYS;
+    return -1;
+}
+
+int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      uint32_t store_domid, unsigned int console_evtchn,
+                      unsigned long *console_mfn, uint32_t console_domid,
+                      xc_stream_type_t stream_type,
+                      struct restore_callbacks *callbacks, int send_back_fd)
+{
+    errno = ENOSYS;
+    return -1;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_offline_page.c b/tools/libs/guest/xg_offline_page.c
new file mode 100644
index 0000000000..77e8889b11
--- /dev/null
+++ b/tools/libs/guest/xg_offline_page.c
@@ -0,0 +1,708 @@
+/******************************************************************************
+ * xc_offline_page.c
+ *
+ * Helper functions to offline/online one page
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see .
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "xc_private.h"
+#include "xenctrl_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+struct pte_backup_entry
+{
+    xen_pfn_t table_mfn;
+    int offset;
+};
+
+#define DEFAULT_BACKUP_COUNT 1024
+struct pte_backup
+{
+    struct pte_backup_entry *entries;
+    int max;
+    int cur;
+};
+
+static struct domain_info_context _dinfo;
+static struct domain_info_context *dinfo = &_dinfo;
+
+int xc_mark_page_online(xc_interface *xch, unsigned long start,
+                        unsigned long end, uint32_t *status)
+{
+    DECLARE_SYSCTL;
+    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    int ret = -1;
+
+    if ( !status || (end < start) )
+    {
+        errno = EINVAL;
+        return -1;
+    }
+    if ( xc_hypercall_bounce_pre(xch, status) )
+    {
+        ERROR("Could not bounce memory for xc_mark_page_online\n");
+        return -1;
+    }
+
+    sysctl.cmd = XEN_SYSCTL_page_offline_op;
+    sysctl.u.page_offline.start = start;
+    sysctl.u.page_offline.cmd = sysctl_page_online;
+    sysctl.u.page_offline.end = end;
+    set_xen_guest_handle(sysctl.u.page_offline.status, status);
+    ret = xc_sysctl(xch, &sysctl);
+
+    xc_hypercall_bounce_post(xch, status);
+
+    return ret;
+}
+
+int xc_mark_page_offline(xc_interface *xch, unsigned long start,
+                         unsigned long end, uint32_t *status)
+{
+    DECLARE_SYSCTL;
+    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    int ret = -1;
+
+    if ( !status || (end < start) )
+    {
+        errno = EINVAL;
+        return -1;
+    }
+    if ( xc_hypercall_bounce_pre(xch, status) )
+    {
+        ERROR("Could not bounce memory for xc_mark_page_offline");
+        return -1;
+    }
+
+    sysctl.cmd = XEN_SYSCTL_page_offline_op;
+    sysctl.u.page_offline.start = start;
+    sysctl.u.page_offline.cmd = sysctl_page_offline;
+    sysctl.u.page_offline.end = end;
+    set_xen_guest_handle(sysctl.u.page_offline.status, status);
+    ret = xc_sysctl(xch, &sysctl);
+
+    xc_hypercall_bounce_post(xch, status);
+
+    return ret;
+}
+
+int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
+                                 unsigned long end, uint32_t *status)
+{
+    DECLARE_SYSCTL;
+    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    int ret = -1;
+
+    if ( !status || (end < start) )
+    {
+        errno = EINVAL;
+        return -1;
+    }
+    if ( xc_hypercall_bounce_pre(xch, status) )
+    {
+        ERROR("Could not bounce memory for xc_query_page_offline_status\n");
+        return -1;
+    }
+
+    sysctl.cmd = XEN_SYSCTL_page_offline_op;
+    sysctl.u.page_offline.start = start;
+    sysctl.u.page_offline.cmd = sysctl_query_page_offline;
+    sysctl.u.page_offline.end = end;
+    set_xen_guest_handle(sysctl.u.page_offline.status, status);
+    ret = xc_sysctl(xch, &sysctl);
+
+    xc_hypercall_bounce_post(xch, status);
+
+    return ret;
+}
+
+/*
+ * There should be no updates to the grant table while the domain is paused.
+ */
+static int xc_is_page_granted_v1(xc_interface *xch, xen_pfn_t gpfn,
+                                 grant_entry_v1_t *gnttab, int gnt_num)
+{
+    int i = 0;
+
+    if (!gnttab)
+        return 0;
+
+    for (i = 0; i < gnt_num; i++)
+        if ( ((gnttab[i].flags & GTF_type_mask) != GTF_invalid) &&
+             (gnttab[i].frame == gpfn) )
+            break;
+
+    return (i != gnt_num);
+}
+
+static int xc_is_page_granted_v2(xc_interface *xch, xen_pfn_t gpfn,
+                                 grant_entry_v2_t *gnttab, int gnt_num)
+{
+    int i = 0;
+
+    if (!gnttab)
+        return 0;
+
+    for (i = 0; i < gnt_num; i++)
+        if ( ((gnttab[i].hdr.flags & GTF_type_mask) != GTF_invalid) &&
+             (gnttab[i].full_page.frame == gpfn) )
+            break;
+
+    return (i != gnt_num);
+}
+
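The backup_ptes() helper below grows its entry array by doubling it with realloc(). As written it assigns realloc()'s result straight back to backup->entries, so on allocation failure the original pointer is lost and the old buffer can no longer be freed by the caller. A commonly used hardened variant of the pattern is sketched here against the pte_backup structure above; grow_backup() is an illustrative helper, not part of this patch:

    #include <stdlib.h>

    /* Minimal sketch: grow a pte_backup array without losing the old
     * buffer if realloc() fails.  Keeping realloc()'s result in a
     * temporary leaves backup->entries valid (and freeable) on error. */
    static int grow_backup(struct pte_backup *backup)
    {
        int new_max = backup->max * 2;
        struct pte_backup_entry *tmp =
            realloc(backup->entries, new_max * sizeof(*tmp));

        if ( tmp == NULL )
            return -1;   /* backup->entries unchanged; caller may free it */

        backup->entries = tmp;
        backup->max = new_max;
        return 0;
    }
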
+static int backup_ptes(xen_pfn_t table_mfn, int offset,
+                       struct pte_backup *backup)
+{
+    if (!backup)
+        return -EINVAL;
+
+    if (backup->max == backup->cur)
+    {
+        backup->entries = realloc(backup->entries,
+                                  backup->max * 2 * sizeof(struct pte_backup_entry));
+        if (backup->entries == NULL)
+            return -1;
+        else
+            backup->max *= 2;
+    }
+
+    backup->entries[backup->cur].table_mfn = table_mfn;
+    backup->entries[backup->cur++].offset = offset;
+
+    return 0;
+}
+
+/*
+ * return:
+ * 1 when an MMU update is required
+ * 0 when no changes are needed
+ * <0 when an error happens
+ */
+typedef int (*pte_func)(xc_interface *xch,
+                        uint64_t pte, uint64_t *new_pte,
+                        unsigned long table_mfn, int table_offset,
+                        struct pte_backup *backup,
+                        unsigned long no_use);
+
+static int __clear_pte(xc_interface *xch,
+                       uint64_t pte, uint64_t *new_pte,
+                       unsigned long table_mfn, int table_offset,
+                       struct pte_backup *backup,
+                       unsigned long mfn)
+{
+    /* If no new_pte pointer, same as no changes needed */
+    if (!new_pte || !backup)
+        return -EINVAL;
+
+    if ( !(pte & _PAGE_PRESENT))
+        return 0;
+
+    /* XXX Check for PSE bit here */
+    /* Hit one entry */
+    if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
+    {
+        *new_pte = pte & ~_PAGE_PRESENT;
+        if (!backup_ptes(table_mfn, table_offset, backup))
+            return 1;
+    }
+
+    return 0;
+}
+
+static int __update_pte(xc_interface *xch,
+                        uint64_t pte, uint64_t *new_pte,
+                        unsigned long table_mfn, int table_offset,
+                        struct pte_backup *backup,
+                        unsigned long new_mfn)
+{
+    int index;
+
+    if (!new_pte)
+        return 0;
+
+    for (index = 0; index < backup->cur; index++)
+        if ( (backup->entries[index].table_mfn == table_mfn) &&
+             (backup->entries[index].offset == table_offset) )
+            break;
+
+    if (index != backup->cur)
+    {
+        if (pte & _PAGE_PRESENT)
+            ERROR("Page present while in backup ptes\n");
+        pte &= ~MFN_MASK_X86;
+        pte |= (new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
+        *new_pte = pte;
+        return 1;
+    }
+
+    return 0;
+}
+
+static int change_pte(xc_interface *xch, uint32_t domid,
+                      struct xc_domain_meminfo *minfo,
+                      struct pte_backup *backup,
+                      struct xc_mmu *mmu,
+                      pte_func func,
+                      unsigned long data)
+{
+    int pte_num, rc;
+    uint64_t i;
+    void *content = NULL;
+
+    pte_num = PAGE_SIZE / ((minfo->pt_levels == 2) ? 4 : 8);
+
+    for (i = 0; i < minfo->p2m_size; i++)
+    {
+        xen_pfn_t table_mfn = xc_pfn_to_mfn(i, minfo->p2m_table,
+                                            minfo->guest_width);
+        uint64_t pte, new_pte;
+        int j;
+
+        if ( (table_mfn == INVALID_PFN) ||
+             ((minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
+              XEN_DOMCTL_PFINFO_XTAB) )
+            continue;
+
+        if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+            content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                           PROT_READ, table_mfn);
+            if (!content)
+                goto failed;
+
+            for (j = 0; j < pte_num; j++)
+            {
+                if ( minfo->pt_levels == 2 )
+                    pte = ((const uint32_t*)content)[j];
+                else
+                    pte = ((const uint64_t*)content)[j];
+
+                rc = func(xch, pte, &new_pte, table_mfn, j, backup, data);
+
+                switch (rc)
+                {
+                case 1:
+                    if ( xc_add_mmu_update(xch, mmu,
+                                           table_mfn << PAGE_SHIFT |
+                                           j * ( (minfo->pt_levels == 2) ?
+                                                 sizeof(uint32_t): sizeof(uint64_t)) |
+                                           MMU_PT_UPDATE_PRESERVE_AD,
+                                           new_pte) )
+                        goto failed;
+                    break;
+
+                case 0:
+                    break;
+
+                default:
+                    goto failed;
+                }
+            }
+
+            munmap(content, PAGE_SIZE);
+            content = NULL;
+        }
+    }
+
+    if ( xc_flush_mmu_updates(xch, mmu) )
+        goto failed;
+
+    return 0;
+failed:
+    /* XXX Shall we take action if we fail to swap?
*/ + if (content) + munmap(content, PAGE_SIZE); + + return -1; +} + +static int update_pte(xc_interface *xch, uint32_t domid, + struct xc_domain_meminfo *minfo, + struct pte_backup *backup, + struct xc_mmu *mmu, + unsigned long new_mfn) +{ + return change_pte(xch, domid, minfo, backup, mmu, + __update_pte, new_mfn); +} + +static int clear_pte(xc_interface *xch, uint32_t domid, + struct xc_domain_meminfo *minfo, + struct pte_backup *backup, + struct xc_mmu *mmu, + xen_pfn_t mfn) +{ + return change_pte(xch, domid, minfo, backup, mmu, + __clear_pte, mfn); +} + +/* + * Check if a page can be exchanged successfully + */ + +static int is_page_exchangable(xc_interface *xch, uint32_t domid, xen_pfn_t mfn, + xc_dominfo_t *info) +{ + uint32_t status; + int rc; + + /* domain checking */ + if ( !domid || (domid > DOMID_FIRST_RESERVED) ) + { + DPRINTF("Dom0's page can't be LM"); + return 0; + } + if (info->hvm) + { + DPRINTF("Currently we can only live change PV guest's page\n"); + return 0; + } + + /* Check if pages are offline pending or not */ + rc = xc_query_page_offline_status(xch, mfn, mfn, &status); + + if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) ) + { + ERROR("Page %lx is not offline pending %x\n", + mfn, status); + return 0; + } + + return 1; +} + +xen_pfn_t *xc_map_m2p(xc_interface *xch, + unsigned long max_mfn, + int prot, + unsigned long *mfn0) +{ + privcmd_mmap_entry_t *entries; + unsigned long m2p_chunks, m2p_size; + xen_pfn_t *m2p; + xen_pfn_t *extent_start; + int i; + + m2p = NULL; + m2p_size = M2P_SIZE(max_mfn); + m2p_chunks = M2P_CHUNKS(max_mfn); + + extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t)); + if ( !extent_start ) + { + ERROR("failed to allocate space for m2p mfns"); + goto err0; + } + + if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) ) + { + PERROR("xc_get_m2p_mfns"); + goto err1; + } + + entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t)); + if (entries == NULL) + { + ERROR("failed to allocate space for mmap entries"); + goto err1; + } + + for ( i = 0; i < m2p_chunks; i++ ) + entries[i].mfn = extent_start[i]; + + m2p = xc_map_foreign_ranges(xch, DOMID_XEN, + m2p_size, prot, M2P_CHUNK_SIZE, + entries, m2p_chunks); + if (m2p == NULL) + { + PERROR("xc_mmap_foreign_ranges failed"); + goto err2; + } + + if (mfn0) + *mfn0 = entries[0].mfn; + +err2: + free(entries); +err1: + free(extent_start); + +err0: + return m2p; +} + +/* The domain should be suspended when called here */ +int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn) +{ + xc_dominfo_t info; + struct xc_domain_meminfo minfo; + struct xc_mmu *mmu = NULL; + struct pte_backup old_ptes = {NULL, 0, 0}; + grant_entry_v1_t *gnttab_v1 = NULL; + grant_entry_v2_t *gnttab_v2 = NULL; + struct mmuext_op mops; + int gnt_num, unpined = 0; + void *old_p, *backup = NULL; + int rc, result = -1; + uint32_t status; + xen_pfn_t new_mfn, gpfn; + xen_pfn_t *m2p_table; + unsigned long max_mfn; + + if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ) + { + ERROR("Could not get domain info"); + return -1; + } + + if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend) + { + errno = EINVAL; + ERROR("Can't exchange page unless domain is suspended\n"); + return -1; + } + if (!is_page_exchangable(xch, domid, mfn, &info)) + { + ERROR("Could not exchange page\n"); + return -1; + } + + /* Map M2P and obtain gpfn */ + rc = xc_maximum_ram_page(xch, &max_mfn); + if ( rc || !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) ) + { + PERROR("Failed to map live M2P table"); + return -1; + } + gpfn 
= m2p_table[mfn];
+
+    /* Map domain's memory information */
+    memset(&minfo, 0, sizeof(minfo));
+    if ( xc_map_domain_meminfo(xch, domid, &minfo) )
+    {
+        PERROR("Could not map domain's memory information\n");
+        goto failed;
+    }
+
+    /* For translation macros */
+    dinfo->guest_width = minfo.guest_width;
+    dinfo->p2m_size = minfo.p2m_size;
+
+    /* Don't exchange CR3 for PAE guest in PAE host environment */
+    if (minfo.guest_width > sizeof(long))
+    {
+        if ( (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
+             XEN_DOMCTL_PFINFO_L3TAB )
+            goto failed;
+    }
+
+    gnttab_v2 = xc_gnttab_map_table_v2(xch, domid, &gnt_num);
+    if (!gnttab_v2)
+    {
+        gnttab_v1 = xc_gnttab_map_table_v1(xch, domid, &gnt_num);
+        if (!gnttab_v1)
+        {
+            ERROR("Failed to map grant table\n");
+            goto failed;
+        }
+    }
+
+    if (gnttab_v1
+        ? xc_is_page_granted_v1(xch, mfn, gnttab_v1, gnt_num)
+        : xc_is_page_granted_v2(xch, mfn, gnttab_v2, gnt_num))
+    {
+        ERROR("Page %lx is granted now\n", mfn);
+        goto failed;
+    }
+
+    /* allocate required data structures */
+    backup = malloc(PAGE_SIZE);
+    if (!backup)
+    {
+        ERROR("Failed to allocate backup pages pointer\n");
+        goto failed;
+    }
+
+    old_ptes.max = DEFAULT_BACKUP_COUNT;
+    old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
+                              DEFAULT_BACKUP_COUNT);
+
+    if (!old_ptes.entries)
+    {
+        ERROR("Failed to allocate backup\n");
+        goto failed;
+    }
+    old_ptes.cur = 0;
+
+    /* Unpin the page if it is pinned */
+    if (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB)
+    {
+        mops.cmd = MMUEXT_UNPIN_TABLE;
+        mops.arg1.mfn = mfn;
+
+        if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
+        {
+            ERROR("Failed to unpin page %lx", mfn);
+            goto failed;
+        }
+        mops.arg1.mfn = mfn;
+        unpined = 1;
+    }
+
+    /* Back up the contents */
+    old_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                 PROT_READ, mfn);
+    if (!old_p)
+    {
+        ERROR("Failed to map foreign page %lx\n", mfn);
+        goto failed;
+    }
+
+    memcpy(backup, old_p, PAGE_SIZE);
+    munmap(old_p, PAGE_SIZE);
+
+    mmu = xc_alloc_mmu_updates(xch, domid);
+    if ( mmu == NULL )
+    {
+        ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
+        goto failed;
+    }
+
+    /* First update all the ptes to be invalid, to remove the references */
+    rc = clear_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
+
+    if (rc)
+    {
+        ERROR("clear pte failed\n");
+        goto failed;
+    }
+
+    rc = xc_domain_memory_exchange_pages(xch, domid,
+                                         1, 0, &mfn,
+                                         1, 0, &new_mfn);
+
+    if (rc)
+    {
+        ERROR("Exchanging the page failed\n");
+        /* An exchange failure means there are still references to the page */
+        rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
+        if (rc)
+            result = -2;
+        goto failed;
+    }
+
+    rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, new_mfn);
+
+    if (rc)
+    {
+        ERROR("update pte failed; guest may be broken now\n");
+        /* No recovery action now for swap failure */
+        result = -2;
+        goto failed;
+    }
+
+    /* Check if the page is offlined already */
+    rc = xc_query_page_offline_status(xch, mfn, mfn,
+                                      &status);
+
+    if (rc)
+    {
+        ERROR("Failed to query offline status\n");
+    }
+    else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
+    {
+        ERROR("page is still online or pending\n");
+        goto failed;
+    }
+    else
+    {
+        void *new_p;
+        IPRINTF("Now page %lx is offlined\n", mfn);
+        /* Update the p2m table */
+        minfo.p2m_table[gpfn] = new_mfn;
+
+        new_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                     PROT_READ|PROT_WRITE, new_mfn);
+        if ( new_p == NULL )
+        {
+            ERROR("failed to map new_p for copy, guest may be broken?");
+            goto failed;
+        }
+        memcpy(new_p, backup, PAGE_SIZE);
+        munmap(new_p, PAGE_SIZE);
+        mops.arg1.mfn = new_mfn;
+        result = 0;
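+        /* Success: the MFN was exchanged, the P2M updated, and the
+         * original contents copied into the replacement frame. */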
+    }
+
+failed:
+
+    if (unpined && (minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LPINTAB))
+    {
+        switch ( minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+        case XEN_DOMCTL_PFINFO_L1TAB:
+            mops.cmd = MMUEXT_PIN_L1_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L2TAB:
+            mops.cmd = MMUEXT_PIN_L2_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L3TAB:
+            mops.cmd = MMUEXT_PIN_L3_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L4TAB:
+            mops.cmd = MMUEXT_PIN_L4_TABLE;
+            break;
+
+        default:
+            ERROR("Unpinned a non-page-table page\n");
+            break;
+        }
+
+        if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
+        {
+            ERROR("failed to pin the mfn again\n");
+            result = -2;
+        }
+    }
+
+    free(mmu);
+
+    free(old_ptes.entries);
+
+    free(backup);
+
+    if (gnttab_v1)
+        munmap(gnttab_v1, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v1_t)));
+    if (gnttab_v2)
+        munmap(gnttab_v2, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v2_t)));
+
+    xc_unmap_domain_meminfo(xch, &minfo);
+    munmap(m2p_table, M2P_SIZE(max_mfn));
+
+    return result;
+}
diff --git a/tools/libs/guest/xg_private.c b/tools/libs/guest/xg_private.c
new file mode 100644
index 0000000000..2073dba2ef
--- /dev/null
+++ b/tools/libs/guest/xg_private.c
@@ -0,0 +1,198 @@
+/******************************************************************************
+ * xg_private.c
+ *
+ * Helper functions for the rest of the library.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see .
+ */
+
+#include 
+#include 
+#include 
+
+#include "xg_private.h"
+
+char *xc_read_image(xc_interface *xch,
+                    const char *filename, unsigned long *size)
+{
+    int kernel_fd = -1;
+    gzFile kernel_gfd = NULL;
+    char *image = NULL, *tmp;
+    unsigned int bytes;
+
+    if ( (filename == NULL) || (size == NULL) )
+        return NULL;
+
+    if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
+    {
+        PERROR("Could not open kernel image '%s'", filename);
+        goto out;
+    }
+
+    if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
+    {
+        PERROR("Could not allocate decompression state for state file");
+        goto out;
+    }
+
+    *size = 0;
+
+#define CHUNK 1*1024*1024
+    while(1)
+    {
+        if ( (tmp = realloc(image, *size + CHUNK)) == NULL )
+        {
+            PERROR("Could not allocate memory for kernel image");
+            free(image);
+            image = NULL;
+            goto out;
+        }
+        image = tmp;
+
+        bytes = gzread(kernel_gfd, image + *size, CHUNK);
+        switch (bytes)
+        {
+        case -1:
+            PERROR("Error reading kernel image");
+            free(image);
+            image = NULL;
+            goto out;
+        case 0: /* EOF */
+            if ( *size == 0 )
+            {
+                PERROR("Could not read kernel image");
+                free(image);
+                image = NULL;
+            }
+            goto out;
+        default:
+            *size += bytes;
+            break;
+        }
+    }
+#undef CHUNK
+
+ out:
+    if ( image )
+    {
+        /* Shrink allocation to fit image.
*/ + tmp = realloc(image, *size); + if ( tmp ) + image = tmp; + } + + if ( kernel_gfd != NULL ) + gzclose(kernel_gfd); + else if ( kernel_fd >= 0 ) + close(kernel_fd); + return image; +} + +char *xc_inflate_buffer(xc_interface *xch, + const char *in_buf, unsigned long in_size, + unsigned long *out_size) +{ + int sts; + z_stream zStream; + unsigned long out_len; + char *out_buf; + + /* Not compressed? Then return the original buffer. */ + if ( ((unsigned char)in_buf[0] != 0x1F) || + ((unsigned char)in_buf[1] != 0x8B) ) + { + if ( out_size != NULL ) + *out_size = in_size; + return (char *)in_buf; + } + + out_len = (unsigned char)in_buf[in_size-4] + + (256 * ((unsigned char)in_buf[in_size-3] + + (256 * ((unsigned char)in_buf[in_size-2] + + (256 * (unsigned char)in_buf[in_size-1]))))); + + memset(&zStream, 0, sizeof(zStream)); + out_buf = malloc(out_len + 16); /* Leave a little extra space */ + if ( out_buf == NULL ) + { + ERROR("Error mallocing buffer\n"); + return NULL; + } + + zStream.next_in = (unsigned char *)in_buf; + zStream.avail_in = in_size; + zStream.next_out = (unsigned char *)out_buf; + zStream.avail_out = out_len+16; + sts = inflateInit2(&zStream, (MAX_WBITS+32)); /* +32 means "handle gzip" */ + if ( sts != Z_OK ) + { + ERROR("inflateInit failed, sts %d\n", sts); + free(out_buf); + return NULL; + } + + /* Inflate in one pass/call */ + sts = inflate(&zStream, Z_FINISH); + inflateEnd(&zStream); + if ( sts != Z_STREAM_END ) + { + ERROR("inflate failed, sts %d\n", sts); + free(out_buf); + return NULL; + } + + if ( out_size != NULL ) + *out_size = out_len; + + return out_buf; +} + +/*******************/ + +int pin_table( + xc_interface *xch, unsigned int type, unsigned long mfn, uint32_t dom) +{ + struct mmuext_op op; + + op.cmd = type; + op.arg1.mfn = mfn; + + if ( xc_mmuext_op(xch, &op, 1, dom) < 0 ) + return 1; + + return 0; +} + +/* This is shared between save and restore, and may generally be useful. */ +unsigned long csum_page(void *page) +{ + int i; + unsigned long *p = page; + unsigned long long sum=0; + + for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ ) + sum += p[i]; + + return sum ^ (sum>>32); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h new file mode 100644 index 0000000000..0000b2b9b6 --- /dev/null +++ b/tools/libs/guest/xg_private.h @@ -0,0 +1,124 @@ +/* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . 
+ */ + +#ifndef XG_PRIVATE_H +#define XG_PRIVATE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xc_private.h" +#include "xenguest.h" + +#include +#include + +#ifndef ELFSIZE +#include +#if UINT_MAX == ULONG_MAX +#define ELFSIZE 32 +#else +#define ELFSIZE 64 +#endif +#endif + +char *xc_read_image(xc_interface *xch, + const char *filename, unsigned long *size); +char *xc_inflate_buffer(xc_interface *xch, + const char *in_buf, + unsigned long in_size, + unsigned long *out_size); + +unsigned long csum_page (void * page); + +#define _PAGE_PRESENT 0x001 +#define _PAGE_RW 0x002 +#define _PAGE_USER 0x004 +#define _PAGE_PWT 0x008 +#define _PAGE_PCD 0x010 +#define _PAGE_ACCESSED 0x020 +#define _PAGE_DIRTY 0x040 +#define _PAGE_PAT 0x080 +#define _PAGE_PSE 0x080 +#define _PAGE_GLOBAL 0x100 + +#define VIRT_BITS_I386 32 +#define VIRT_BITS_X86_64 48 + +#define PGTBL_LEVELS_I386 3 +#define PGTBL_LEVELS_X86_64 4 + +#define PGTBL_LEVEL_SHIFT_X86 9 + +#define L1_PAGETABLE_SHIFT_PAE 12 +#define L2_PAGETABLE_SHIFT_PAE 21 +#define L3_PAGETABLE_SHIFT_PAE 30 +#define L1_PAGETABLE_ENTRIES_PAE 512 +#define L2_PAGETABLE_ENTRIES_PAE 512 +#define L3_PAGETABLE_ENTRIES_PAE 4 + +#define L1_PAGETABLE_SHIFT_X86_64 12 +#define L2_PAGETABLE_SHIFT_X86_64 21 +#define L3_PAGETABLE_SHIFT_X86_64 30 +#define L4_PAGETABLE_SHIFT_X86_64 39 +#define L1_PAGETABLE_ENTRIES_X86_64 512 +#define L2_PAGETABLE_ENTRIES_X86_64 512 +#define L3_PAGETABLE_ENTRIES_X86_64 512 +#define L4_PAGETABLE_ENTRIES_X86_64 512 + +typedef uint64_t x86_pgentry_t; + +#define PAGE_SHIFT_ARM 12 +#define PAGE_SIZE_ARM (1UL << PAGE_SHIFT_ARM) +#define PAGE_MASK_ARM (~(PAGE_SIZE_ARM-1)) + +#define PAGE_SHIFT_X86 12 +#define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86) +#define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1)) + +#define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT) + +static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m, + unsigned gwidth) +{ + if ( gwidth == sizeof(uint64_t) ) + /* 64 bit guest. Need to truncate their pfns for 32 bit toolstacks. */ + return ((uint64_t *)p2m)[pfn]; + else + { + /* 32 bit guest. Need to expand INVALID_MFN for 64 bit toolstacks. */ + uint32_t mfn = ((uint32_t *)p2m)[pfn]; + + return mfn == ~0U ? INVALID_MFN : mfn; + } +} + + +/* Masks for PTE<->PFN conversions */ +#define MADDR_BITS_X86 ((dinfo->guest_width == 8) ? 52 : 44) +#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1) +#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86) + +int pin_table(xc_interface *xch, unsigned int type, unsigned long mfn, + uint32_t dom); + +#endif /* XG_PRIVATE_H */ diff --git a/tools/libs/guest/xg_save_restore.h b/tools/libs/guest/xg_save_restore.h new file mode 100644 index 0000000000..88120eb54b --- /dev/null +++ b/tools/libs/guest/xg_save_restore.h @@ -0,0 +1,134 @@ +/* + * Definitions and utilities for save / restore. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . 
+ */
+
+#include "xc_private.h"
+
+#include 
+#include 
+
+/*
+** We process save/restore/migrate in batches of pages; the below
+** determines how many pages we (at maximum) deal with in each batch.
+*/
+#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
+
+/* When pinning page tables at the end of restore, we also use batching. */
+#define MAX_PIN_BATCH  1024
+
+/*
+** Determine various platform information required for save/restore, in
+** particular:
+**
+**    - the maximum MFN on this machine, used to compute the size of
+**      the M2P table;
+**
+**    - the starting virtual address of the hypervisor; we use this
+**      to determine which parts of guest address space(s) do and don't
+**      require canonicalization during save/restore; and
+**
+**    - the number of page-table levels for save/restore. This should
+**      be a property of the domain, but for the moment we just read it
+**      from the hypervisor.
+**
+**    - The width of a guest word (unsigned long), in bytes.
+**
+** Returns 1 on success, 0 on failure.
*/
+static inline int get_platform_info(xc_interface *xch, uint32_t dom,
+                                    /* OUT */ unsigned long *max_mfn,
+                                    /* OUT */ unsigned long *hvirt_start,
+                                    /* OUT */ unsigned int *pt_levels,
+                                    /* OUT */ unsigned int *guest_width)
+{
+    xen_capabilities_info_t xen_caps = "";
+    xen_platform_parameters_t xen_params;
+
+    if (xc_version(xch, XENVER_platform_parameters, &xen_params) != 0)
+        return 0;
+
+    if (xc_version(xch, XENVER_capabilities, &xen_caps) != 0)
+        return 0;
+
+    if (xc_maximum_ram_page(xch, max_mfn))
+        return 0;
+
+    *hvirt_start = xen_params.virt_start;
+
+    if ( xc_domain_get_guest_width(xch, dom, guest_width) != 0)
+        return 0;
+
+    /* 64-bit tools will see the 64-bit hvirt_start, but 32-bit guests
+     * will be using the compat one. */
+    if ( *guest_width < sizeof (unsigned long) )
+        /* XXX need to fix up a way of extracting this value from Xen if
+         * XXX it becomes variable for domU */
+        *hvirt_start = 0xf5800000;
+
+    if (strstr(xen_caps, "xen-3.0-x86_64"))
+        /* Depends on whether it's a compat 32-on-64 guest */
+        *pt_levels = ( (*guest_width == 8) ? 4 : 3 );
+    else if (strstr(xen_caps, "xen-3.0-x86_32p"))
+        *pt_levels = 3;
+    else
+        return 0;
+
+    return 1;
+}
+
+
+/*
+** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
+** The M2P simply holds the corresponding PFN, while the top bit of a P2M
+** entry tells us whether or not the PFN is currently mapped.
+*/
+
+#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))
+
+
+/*
+** The M2P is made up of some number of 'chunks' of at least 2MB in size.
+** The below definitions and utility function(s) deal with mapping the M2P
+** regardless of the underlying machine memory size or architecture.
*/
+#define M2P_SHIFT       L2_PAGETABLE_SHIFT_PAE
+#define M2P_CHUNK_SIZE  (1 << M2P_SHIFT)
+#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
+#define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)
+
+#define UNFOLD_CR3(_c)                                                \
+  ((uint64_t)((dinfo->guest_width == 8)                               \
+              ? ((_c) >> 12)                                          \
+              : (((uint32_t)(_c) >> 12) | ((uint32_t)(_c) << 20))))
+
+#define FOLD_CR3(_c)                                                  \
+  ((uint64_t)((dinfo->guest_width == 8)                               \
+              ?
((uint64_t)(_c)) << 12 \ + : (((uint32_t)(_c) << 12) | ((uint32_t)(_c) >> 20)))) + +#define MEMCPY_FIELD(_d, _s, _f, _w) do { \ + if ((_w) == 8) \ + memcpy(&(_d)->x64._f, &(_s)->x64._f,sizeof((_d)->x64._f)); \ + else \ + memcpy(&(_d)->x32._f, &(_s)->x32._f,sizeof((_d)->x32._f)); \ +} while (0) + +#define MEMSET_ARRAY_FIELD(_p, _f, _v, _w) do { \ + if ((_w) == 8) \ + memset(&(_p)->x64._f[0], (_v), sizeof((_p)->x64._f)); \ + else \ + memset(&(_p)->x32._f[0], (_v), sizeof((_p)->x32._f)); \ +} while (0) diff --git a/tools/libs/guest/xg_sr_common.c b/tools/libs/guest/xg_sr_common.c new file mode 100644 index 0000000000..17567ab133 --- /dev/null +++ b/tools/libs/guest/xg_sr_common.c @@ -0,0 +1,167 @@ +#include + +#include "xg_sr_common.h" + +#include + +static const char *const dhdr_types[] = +{ + [DHDR_TYPE_X86_PV] = "x86 PV", + [DHDR_TYPE_X86_HVM] = "x86 HVM", +}; + +const char *dhdr_type_to_str(uint32_t type) +{ + if ( type < ARRAY_SIZE(dhdr_types) && dhdr_types[type] ) + return dhdr_types[type]; + + return "Reserved"; +} + +static const char *const mandatory_rec_types[] = +{ + [REC_TYPE_END] = "End", + [REC_TYPE_PAGE_DATA] = "Page data", + [REC_TYPE_X86_PV_INFO] = "x86 PV info", + [REC_TYPE_X86_PV_P2M_FRAMES] = "x86 PV P2M frames", + [REC_TYPE_X86_PV_VCPU_BASIC] = "x86 PV vcpu basic", + [REC_TYPE_X86_PV_VCPU_EXTENDED] = "x86 PV vcpu extended", + [REC_TYPE_X86_PV_VCPU_XSAVE] = "x86 PV vcpu xsave", + [REC_TYPE_SHARED_INFO] = "Shared info", + [REC_TYPE_X86_TSC_INFO] = "x86 TSC info", + [REC_TYPE_HVM_CONTEXT] = "HVM context", + [REC_TYPE_HVM_PARAMS] = "HVM params", + [REC_TYPE_TOOLSTACK] = "Toolstack", + [REC_TYPE_X86_PV_VCPU_MSRS] = "x86 PV vcpu msrs", + [REC_TYPE_VERIFY] = "Verify", + [REC_TYPE_CHECKPOINT] = "Checkpoint", + [REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST] = "Checkpoint dirty pfn list", + [REC_TYPE_STATIC_DATA_END] = "Static data end", + [REC_TYPE_X86_CPUID_POLICY] = "x86 CPUID policy", + [REC_TYPE_X86_MSR_POLICY] = "x86 MSR policy", +}; + +const char *rec_type_to_str(uint32_t type) +{ + if ( !(type & REC_TYPE_OPTIONAL) ) + { + if ( (type < ARRAY_SIZE(mandatory_rec_types)) && + (mandatory_rec_types[type]) ) + return mandatory_rec_types[type]; + } + + return "Reserved"; +} + +int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec, + void *buf, size_t sz) +{ + static const char zeroes[(1u << REC_ALIGN_ORDER) - 1] = { 0 }; + + xc_interface *xch = ctx->xch; + typeof(rec->length) combined_length = rec->length + sz; + size_t record_length = ROUNDUP(combined_length, REC_ALIGN_ORDER); + struct iovec parts[] = { + { &rec->type, sizeof(rec->type) }, + { &combined_length, sizeof(combined_length) }, + { rec->data, rec->length }, + { buf, sz }, + { (void *)zeroes, record_length - combined_length }, + }; + + if ( record_length > REC_LENGTH_MAX ) + { + ERROR("Record (0x%08x, %s) length %#zx exceeds max (%#x)", rec->type, + rec_type_to_str(rec->type), record_length, REC_LENGTH_MAX); + return -1; + } + + if ( rec->length ) + assert(rec->data); + if ( sz ) + assert(buf); + + if ( writev_exact(ctx->fd, parts, ARRAY_SIZE(parts)) ) + goto err; + + return 0; + + err: + PERROR("Unable to write record to stream"); + return -1; +} + +int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rhdr rhdr; + size_t datasz; + + if ( read_exact(fd, &rhdr, sizeof(rhdr)) ) + { + PERROR("Failed to read Record Header from stream"); + return -1; + } + + if ( rhdr.length > REC_LENGTH_MAX ) + { + ERROR("Record (0x%08x, %s) length %#x 
exceeds max (%#x)", rhdr.type, + rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX); + return -1; + } + + datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER); + + if ( datasz ) + { + rec->data = malloc(datasz); + + if ( !rec->data ) + { + ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)", + datasz, rhdr.type, rec_type_to_str(rhdr.type)); + return -1; + } + + if ( read_exact(fd, rec->data, datasz) ) + { + free(rec->data); + rec->data = NULL; + PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)", + datasz, rhdr.type, rec_type_to_str(rhdr.type)); + return -1; + } + } + else + rec->data = NULL; + + rec->type = rhdr.type; + rec->length = rhdr.length; + + return 0; +}; + +static void __attribute__((unused)) build_assertions(void) +{ + BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24); + BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16); + BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8); + + BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header) != 8); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info) != 8); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr) != 8); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_tsc_info) != 24); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry) != 16); + BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h new file mode 100644 index 0000000000..13fcc47420 --- /dev/null +++ b/tools/libs/guest/xg_sr_common.h @@ -0,0 +1,468 @@ +#ifndef __COMMON__H +#define __COMMON__H + +#include + +#include "xg_private.h" +#include "xg_save_restore.h" +#include "xenctrl_dom.h" +#include "xc_bitops.h" + +#include "xg_sr_stream_format.h" + +/* String representation of Domain Header types. */ +const char *dhdr_type_to_str(uint32_t type); + +/* String representation of Record types. */ +const char *rec_type_to_str(uint32_t type); + +struct xc_sr_context; +struct xc_sr_record; + +/** + * Save operations. To be implemented for each type of guest, for use by the + * common save algorithm. + * + * Every function must be implemented, even if only with a no-op stub. + */ +struct xc_sr_save_ops +{ + /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ + xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn); + + /** + * Optionally transform the contents of a page from being specific to the + * sending environment, to being generic for the stream. + * + * The page of data at the end of 'page' may be a read-only mapping of a + * running guest; it must not be modified. If no transformation is + * required, the callee should leave '*pages' untouched. + * + * If a transformation is required, the callee should allocate themselves + * a local page using malloc() and return it via '*page'. + * + * The caller shall free() '*page' in all cases. In the case that the + * callee encounters an error, it should *NOT* free() the memory it + * allocated for '*page'. + * + * It is valid to fail with EAGAIN if the transformation is not able to be + * completed at this point. The page shall be retried later. + * + * @returns 0 for success, -1 for failure, with errno appropriately set. + */ + int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type, + void **page); + + /** + * Set up local environment to save a domain. 
(Typically querying + * running domain state, setting up mappings etc.) + * + * This is called once before any common setup has occurred, allowing for + * guest-specific adjustments to be made to common state. + */ + int (*setup)(struct xc_sr_context *ctx); + + /** + * Send static records at the head of the stream. This is called once, + * after the Image and Domain headers are written. + */ + int (*static_data)(struct xc_sr_context *ctx); + + /** + * Send dynamic records which need to be at the start of the stream. This + * is called after the STATIC_DATA_END record is written. + */ + int (*start_of_stream)(struct xc_sr_context *ctx); + + /** + * Send records which need to be at the start of a checkpoint. This is + * called once, or once per checkpoint in a checkpointed stream, and is + * ahead of memory data. + */ + int (*start_of_checkpoint)(struct xc_sr_context *ctx); + + /** + * Send records which need to be at the end of the checkpoint. This is + * called once, or once per checkpoint in a checkpointed stream, and is + * after the memory data. + */ + int (*end_of_checkpoint)(struct xc_sr_context *ctx); + + /** + * Check state of guest to decide whether it makes sense to continue + * migration. This is called in each iteration or checkpoint to check + * whether all criteria for the migration are still met. If that's not + * the case either migration is cancelled via a bad rc or the situation + * is handled, e.g. by sending appropriate records. + */ + int (*check_vm_state)(struct xc_sr_context *ctx); + + /** + * Clean up the local environment. Will be called exactly once, either + * after a successful save, or upon encountering an error. + */ + int (*cleanup)(struct xc_sr_context *ctx); +}; + + +/** + * Restore operations. To be implemented for each type of guest, for use by + * the common restore algorithm. + * + * Every function must be implemented, even if only with a no-op stub. + */ +struct xc_sr_restore_ops +{ + /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ + xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn); + + /* Check to see whether a PFN is valid. */ + bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn); + + /* Set the GFN of a PFN. */ + void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn); + + /* Set the type of a PFN. */ + void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn, + xen_pfn_t type); + + /** + * Optionally transform the contents of a page from being generic in the + * stream, to being specific to the restoring environment. + * + * 'page' is expected to be modified in-place if a transformation is + * required. + * + * @returns 0 for success, -1 for failure, with errno appropriately set. + */ + int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page); + + /** + * Set up local environment to restore a domain. + * + * This is called once before any common setup has occurred, allowing for + * guest-specific adjustments to be made to common state. + */ + int (*setup)(struct xc_sr_context *ctx); + + /** + * Process an individual record from the stream. The caller shall take + * care of processing common records (e.g. END, PAGE_DATA). + * + * @return 0 for success, -1 for failure, or the following sentinels: + * - RECORD_NOT_PROCESSED + * - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and + * a failover is needed. 
+ */
+#define RECORD_NOT_PROCESSED 1
+#define BROKEN_CHANNEL 2
+    int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+    /**
+     * Perform any actions required after the static data has arrived. Called
+     * when the STATIC_DATA_COMPLETE record has been received/inferred.
+     * 'missing' should be filled in for any data item the higher level
+     * toolstack needs to provide compatibility for.
+     */
+    int (*static_data_complete)(struct xc_sr_context *ctx,
+                                unsigned int *missing);
+
+    /**
+     * Perform any actions required after the stream has been finished. Called
+     * after the END record has been received.
+     */
+    int (*stream_complete)(struct xc_sr_context *ctx);
+
+    /**
+     * Clean up the local environment. Will be called exactly once, either
+     * after a successful restore, or upon encountering an error.
+     */
+    int (*cleanup)(struct xc_sr_context *ctx);
+};
+
+/* Wrapper for blobs of data heading Xen-wards. */
+struct xc_sr_blob
+{
+    void *ptr;
+    size_t size;
+};
+
+/*
+ * Update a blob. Duplicate src/size, freeing the old blob if necessary. May
+ * fail due to memory allocation.
+ */
+static inline int update_blob(struct xc_sr_blob *blob,
+                              const void *src, size_t size)
+{
+    void *ptr;
+
+    if ( !src || !size )
+    {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if ( (ptr = malloc(size)) == NULL )
+        return -1;
+
+    free(blob->ptr);
+    blob->ptr = memcpy(ptr, src, size);
+    blob->size = size;
+
+    return 0;
+}
+
+struct xc_sr_context
+{
+    xc_interface *xch;
+    uint32_t domid;
+    int fd;
+
+    /* Plain VM, or checkpoints over time. */
+    xc_stream_type_t stream_type;
+
+    xc_dominfo_t dominfo;
+
+    union /* Common save or restore data. */
+    {
+        struct /* Save data. */
+        {
+            int recv_fd;
+
+            struct xc_sr_save_ops ops;
+            struct save_callbacks *callbacks;
+
+            /* Live migrate vs non live suspend. */
+            bool live;
+
+            /* Further debugging information in the stream. */
+            bool debug;
+
+            unsigned long p2m_size;
+
+            struct precopy_stats stats;
+
+            xen_pfn_t *batch_pfns;
+            unsigned int nr_batch_pfns;
+            unsigned long *deferred_pages;
+            unsigned long nr_deferred_pages;
+            xc_hypercall_buffer_t dirty_bitmap_hbuf;
+        } save;
+
+        struct /* Restore data. */
+        {
+            struct xc_sr_restore_ops ops;
+            struct restore_callbacks *callbacks;
+
+            int send_back_fd;
+            unsigned long p2m_size;
+            xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
+            /* From Image Header. */
+            uint32_t format_version;
+
+            /* From Domain Header. */
+            uint32_t guest_type;
+            uint32_t guest_page_size;
+
+            /* Currently buffering records between checkpoints. */
+            bool buffer_all_records;
+
+            /* Whether a STATIC_DATA_END record has been seen/inferred. */
+            bool seen_static_data_end;
+
+/*
+ * With Remus/COLO, we buffer the records sent by the primary at a checkpoint,
+ * so that if the primary fails we can recover from the last
+ * checkpoint state.
+ * This should be enough for most cases, because the primary only sends
+ * dirty pages at each checkpoint.
+ */
+#define DEFAULT_BUF_RECORDS 1024
+            struct xc_sr_record *buffered_records;
+            unsigned int allocated_rec_num;
+            unsigned int buffered_rec_num;
+
+            /*
+             * Xenstore and Console parameters.
+             * INPUT:  evtchn & domid
+             * OUTPUT: gfn
+             */
+            xen_pfn_t xenstore_gfn, console_gfn;
+            unsigned int xenstore_evtchn, console_evtchn;
+            uint32_t xenstore_domid, console_domid;
+
+            /* Bitmap of currently populated PFNs during restore. */
+            unsigned long *populated_pfns;
+            xen_pfn_t max_populated_pfn;
+
+            /* Sender has invoked verify mode on the stream. */
+            bool verify;
+        } restore;
+    };
+
+    union /* Guest-arch specific data.
*/ + { + struct /* x86 */ + { + /* Common save/restore data. */ + union + { + struct + { + /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */ + struct xc_sr_blob cpuid, msr; + } restore; + }; + + struct /* x86 PV guest. */ + { + /* 4 or 8; 32 or 64 bit domain */ + unsigned int width; + /* 3 or 4 pagetable levels */ + unsigned int levels; + + /* Maximum Xen frame */ + xen_pfn_t max_mfn; + /* Read-only machine to phys map */ + xen_pfn_t *m2p; + /* first mfn of the compat m2p (Only needed for 32bit PV guests) */ + xen_pfn_t compat_m2p_mfn0; + /* Number of m2p frames mapped */ + unsigned long nr_m2p_frames; + + /* Maximum guest frame */ + xen_pfn_t max_pfn; + + /* Number of frames making up the p2m */ + unsigned int p2m_frames; + /* Guest's phys to machine map. Mapped read-only (save) or + * allocated locally (restore). Uses guest unsigned longs. */ + void *p2m; + /* The guest pfns containing the p2m leaves */ + xen_pfn_t *p2m_pfns; + + /* Read-only mapping of the guest's shared info page */ + shared_info_any_t *shinfo; + + /* p2m generation count for verifying validity of local p2m. */ + uint64_t p2m_generation; + + union + { + struct + { + /* State machine for the order of received records. */ + bool seen_pv_info; + + /* Types for each page (bounded by max_pfn). */ + uint32_t *pfn_types; + + /* x86 PV per-vcpu storage structure for blobs. */ + struct xc_sr_x86_pv_restore_vcpu + { + struct xc_sr_blob basic, extd, xsave, msr; + } *vcpus; + unsigned int nr_vcpus; + } restore; + }; + } pv; + + struct /* x86 HVM guest. */ + { + union + { + struct + { + /* Whether qemu enabled logdirty mode, and we should + * disable on cleanup. */ + bool qemu_enabled_logdirty; + } save; + + struct + { + /* HVM context blob. */ + struct xc_sr_blob context; + } restore; + }; + } hvm; + + } x86; + }; +}; + +extern struct xc_sr_save_ops save_ops_x86_pv; +extern struct xc_sr_save_ops save_ops_x86_hvm; + +extern struct xc_sr_restore_ops restore_ops_x86_pv; +extern struct xc_sr_restore_ops restore_ops_x86_hvm; + +struct xc_sr_record +{ + uint32_t type; + uint32_t length; + void *data; +}; + +/* + * Writes a split record to the stream, applying correct padding where + * appropriate. It is common when sending records containing blobs from Xen + * that the header and blob data are separate. This function accepts a second + * buffer and length, and will merge it with the main record when sending. + * + * Records with a non-zero length must provide a valid data field; records + * with a 0 length shall have their data field ignored. + * + * Returns 0 on success and non-0 on failure. + */ +int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec, + void *buf, size_t sz); + +/* + * Writes a record to the stream, applying correct padding where appropriate. + * Records with a non-zero length must provide a valid data field; records + * with a 0 length shall have their data field ignored. + * + * Returns 0 on success and non-0 on failure. + */ +static inline int write_record(struct xc_sr_context *ctx, + struct xc_sr_record *rec) +{ + return write_split_record(ctx, rec, NULL, 0); +} + +/* + * Reads a record from the stream, and fills in the record structure. + * + * Returns 0 on success and non-0 on failure. + * + * On success, the record's type and size shall be valid. + * - If size is 0, data shall be NULL. + * - If size is non-0, data shall be a buffer allocated by malloc() which must + * be passed to free() by the caller. + * + * On failure, the contents of the record structure are undefined.
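+ * + * A minimal caller sketch (illustrative only): + * + * struct xc_sr_record rec; + * + * if ( read_record(ctx, ctx->fd, &rec) == 0 ) + * { + * ... act on rec.type and rec.data ... + * free(rec.data); + * }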
+ */ +int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec); + +/* + * This would ideally be private in restore.c, but is needed by + * x86_pv_localise_page() if we receive pagetable frames ahead of the + * contents of the frames they point at. + */ +int populate_pfns(struct xc_sr_context *ctx, unsigned int count, + const xen_pfn_t *original_pfns, const uint32_t *types); + +/* Handle a STATIC_DATA_END record. */ +int handle_static_data_end(struct xc_sr_context *ctx); + +#endif +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_common_x86.c b/tools/libs/guest/xg_sr_common_x86.c new file mode 100644 index 0000000000..6f12483907 --- /dev/null +++ b/tools/libs/guest/xg_sr_common_x86.c @@ -0,0 +1,173 @@ +#include "xg_sr_common_x86.h" + +int write_x86_tsc_info(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rec_x86_tsc_info tsc = {}; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_TSC_INFO, + .length = sizeof(tsc), + .data = &tsc, + }; + + if ( xc_domain_get_tsc_info(xch, ctx->domid, &tsc.mode, + &tsc.nsec, &tsc.khz, &tsc.incarnation) < 0 ) + { + PERROR("Unable to obtain TSC information"); + return -1; + } + + return write_record(ctx, &rec); +} + +int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rec_x86_tsc_info *tsc = rec->data; + + if ( rec->length != sizeof(*tsc) ) + { + ERROR("X86_TSC_INFO record wrong size: length %u, expected %zu", + rec->length, sizeof(*tsc)); + return -1; + } + + if ( xc_domain_set_tsc_info(xch, ctx->domid, tsc->mode, + tsc->nsec, tsc->khz, tsc->incarnation) ) + { + PERROR("Unable to set TSC information"); + return -1; + } + + return 0; +} + +int write_x86_cpu_policy_records(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_record cpuid = { .type = REC_TYPE_X86_CPUID_POLICY, }; + struct xc_sr_record msrs = { .type = REC_TYPE_X86_MSR_POLICY, }; + uint32_t nr_leaves = 0, nr_msrs = 0; + int rc = 0; + + if ( xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs) < 0 ) + { + PERROR("Unable to get CPU Policy size"); + return -1; + } + + cpuid.data = malloc(nr_leaves * sizeof(xen_cpuid_leaf_t)); + msrs.data = malloc(nr_msrs * sizeof(xen_msr_entry_t)); + if ( !cpuid.data || !msrs.data ) + { + ERROR("Cannot allocate memory for CPU Policy"); + rc = -1; + goto out; + } + + if ( xc_get_domain_cpu_policy(xch, ctx->domid, &nr_leaves, cpuid.data, + &nr_msrs, msrs.data) ) + { + PERROR("Unable to get d%d CPU Policy", ctx->domid); + rc = -1; + goto out; + } + + cpuid.length = nr_leaves * sizeof(xen_cpuid_leaf_t); + if ( cpuid.length ) + { + rc = write_record(ctx, &cpuid); + if ( rc ) + goto out; + } + + msrs.length = nr_msrs * sizeof(xen_msr_entry_t); + if ( msrs.length ) + rc = write_record(ctx, &msrs); + + out: + free(cpuid.data); + free(msrs.data); + + return rc; +} + +int handle_x86_cpuid_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + int rc; + + if ( rec->length == 0 || + rec->length % sizeof(xen_cpuid_leaf_t) != 0 ) + { + ERROR("X86_CPUID_POLICY size %u should be multiple of %zu", + rec->length, sizeof(xen_cpuid_leaf_t)); + return -1; + } + + rc = update_blob(&ctx->x86.restore.cpuid, rec->data, rec->length); + if ( rc ) + ERROR("Unable to allocate %u bytes for X86_CPUID_POLICY", rec->length); + + return rc; +} + +int handle_x86_msr_policy(struct xc_sr_context
*ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + int rc; + + if ( rec->length == 0 || + rec->length % sizeof(xen_msr_entry_t) != 0 ) + { + ERROR("X86_MSR_POLICY size %u should be multiple of %zu", + rec->length, sizeof(xen_msr_entry_t)); + return -1; + } + + rc = update_blob(&ctx->x86.restore.msr, rec->data, rec->length); + if ( rc ) + ERROR("Unable to allocate %u bytes for X86_MSR_POLICY", rec->length); + + return rc; +} + +int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing) +{ + xc_interface *xch = ctx->xch; + uint32_t nr_leaves = 0, nr_msrs = 0; + uint32_t err_l = ~0, err_s = ~0, err_m = ~0; + + if ( ctx->x86.restore.cpuid.ptr ) + nr_leaves = ctx->x86.restore.cpuid.size / sizeof(xen_cpuid_leaf_t); + else + *missing |= XGR_SDD_MISSING_CPUID; + + if ( ctx->x86.restore.msr.ptr ) + nr_msrs = ctx->x86.restore.msr.size / sizeof(xen_msr_entry_t); + else + *missing |= XGR_SDD_MISSING_MSR; + + if ( (nr_leaves || nr_msrs) && + xc_set_domain_cpu_policy(xch, ctx->domid, + nr_leaves, ctx->x86.restore.cpuid.ptr, + nr_msrs, ctx->x86.restore.msr.ptr, + &err_l, &err_s, &err_m) ) + { + PERROR("Failed to set CPU policy: leaf %08x, subleaf %08x, msr %08x", + err_l, err_s, err_m); + return -1; + } + + return 0; +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_common_x86.h b/tools/libs/guest/xg_sr_common_x86.h new file mode 100644 index 0000000000..b55758c96d --- /dev/null +++ b/tools/libs/guest/xg_sr_common_x86.h @@ -0,0 +1,51 @@ +#ifndef __COMMON_X86__H +#define __COMMON_X86__H + +#include "xg_sr_common.h" + +/* + * Obtains a domain's TSC information from Xen and writes an X86_TSC_INFO + * record into the stream. + */ +int write_x86_tsc_info(struct xc_sr_context *ctx); + +/* + * Parses an X86_TSC_INFO record and applies the result to the domain. + */ +int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec); + +/* + * Obtains a domain's CPU Policy from Xen, and writes X86_{CPUID,MSR}_POLICY + * records into the stream. + */ +int write_x86_cpu_policy_records(struct xc_sr_context *ctx); + +/* + * Parses an X86_CPUID_POLICY record and stashes the content for application + * when a STATIC_DATA_END record is encountered. + */ +int handle_x86_cpuid_policy(struct xc_sr_context *ctx, + struct xc_sr_record *rec); + +/* + * Parses an X86_MSR_POLICY record and stashes the content for application + * when a STATIC_DATA_END record is encountered. + */ +int handle_x86_msr_policy(struct xc_sr_context *ctx, + struct xc_sr_record *rec); + +/* + * Perform common x86 actions required after the static data has arrived.
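+ * + * Sets XGR_SDD_MISSING_CPUID and/or XGR_SDD_MISSING_MSR in *missing for + * policy records which were absent from the stream, allowing the higher + * level toolstack to compensate.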
+ */ +int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing); + +#endif +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_common_x86_pv.c b/tools/libs/guest/xg_sr_common_x86_pv.c new file mode 100644 index 0000000000..cd33406aab --- /dev/null +++ b/tools/libs/guest/xg_sr_common_x86_pv.c @@ -0,0 +1,193 @@ +#include <assert.h> + +#include "xg_sr_common_x86_pv.h" + +xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn) +{ + assert(mfn <= ctx->x86.pv.max_mfn); + return ctx->x86.pv.m2p[mfn]; +} + +bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn) +{ + return ((mfn <= ctx->x86.pv.max_mfn) && + (mfn_to_pfn(ctx, mfn) <= ctx->x86.pv.max_pfn) && + (xc_pfn_to_mfn(mfn_to_pfn(ctx, mfn), ctx->x86.pv.p2m, + ctx->x86.pv.width) == mfn)); +} + +void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t pfn = ~0UL; + + ERROR("mfn %#lx, max %#lx", mfn, ctx->x86.pv.max_mfn); + + if ( (mfn != ~0UL) && (mfn <= ctx->x86.pv.max_mfn) ) + { + pfn = ctx->x86.pv.m2p[mfn]; + ERROR(" m2p[%#lx] = %#lx, max_pfn %#lx", + mfn, pfn, ctx->x86.pv.max_pfn); + } + + if ( (pfn != ~0UL) && (pfn <= ctx->x86.pv.max_pfn) ) + ERROR(" p2m[%#lx] = %#lx", + pfn, xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width)); +} + +xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3) +{ + if ( ctx->x86.pv.width == 8 ) + return cr3 >> 12; + else + { + /* 32bit guests can't represent mfns wider than 32 bits */ + if ( cr3 & 0xffffffff00000000UL ) + return ~0UL; + else + return (uint32_t)((cr3 >> 12) | (cr3 << 20)); + } +} + +uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t _mfn) +{ + uint64_t mfn = _mfn; + + if ( ctx->x86.pv.width == 8 ) + return mfn << 12; + else + { + /* 32bit guests can't represent mfns wider than 32 bits */ + if ( mfn & 0xffffffff00000000UL ) + return ~0UL; + else + return (uint32_t)((mfn << 12) | (mfn >> 20)); + } +} + +int x86_pv_domain_info(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + unsigned int guest_width, guest_levels; + + /* Get the domain width */ + if ( xc_domain_get_guest_width(xch, ctx->domid, &guest_width) ) + { + PERROR("Unable to determine dom%d's width", ctx->domid); + return -1; + } + + if ( guest_width == 4 ) + guest_levels = 3; + else if ( guest_width == 8 ) + guest_levels = 4; + else + { + ERROR("Invalid guest width %d.
Expected 32 or 64", guest_width * 8); + return -1; + } + ctx->x86.pv.width = guest_width; + ctx->x86.pv.levels = guest_levels; + + DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels); + + return 0; +} + +int x86_pv_map_m2p(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t m2p_chunks, m2p_size, max_page; + privcmd_mmap_entry_t *entries = NULL; + xen_pfn_t *extents_start = NULL; + int rc = -1, i; + + if ( xc_maximum_ram_page(xch, &max_page) < 0 ) + { + PERROR("Failed to get maximum ram page"); + goto err; + } + + ctx->x86.pv.max_mfn = max_page; + m2p_size = M2P_SIZE(ctx->x86.pv.max_mfn); + m2p_chunks = M2P_CHUNKS(ctx->x86.pv.max_mfn); + + extents_start = malloc(m2p_chunks * sizeof(xen_pfn_t)); + if ( !extents_start ) + { + ERROR("Unable to allocate %lu bytes for m2p mfns", + m2p_chunks * sizeof(xen_pfn_t)); + goto err; + } + + if ( xc_machphys_mfn_list(xch, m2p_chunks, extents_start) ) + { + PERROR("Failed to get m2p mfn list"); + goto err; + } + + entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)); + if ( !entries ) + { + ERROR("Unable to allocate %lu bytes for m2p mapping mfns", + m2p_chunks * sizeof(privcmd_mmap_entry_t)); + goto err; + } + + for ( i = 0; i < m2p_chunks; ++i ) + entries[i].mfn = extents_start[i]; + + ctx->x86.pv.m2p = xc_map_foreign_ranges( + xch, DOMID_XEN, m2p_size, PROT_READ, + M2P_CHUNK_SIZE, entries, m2p_chunks); + + if ( !ctx->x86.pv.m2p ) + { + PERROR("Failed to mmap() m2p ranges"); + goto err; + } + + ctx->x86.pv.nr_m2p_frames = (M2P_CHUNK_SIZE >> PAGE_SHIFT) * m2p_chunks; + +#ifdef __i386__ + /* 32 bit toolstacks automatically get the compat m2p */ + ctx->x86.pv.compat_m2p_mfn0 = entries[0].mfn; +#else + /* 64 bit toolstacks need to ask Xen specially for it */ + { + struct xen_machphys_mfn_list xmml = { + .max_extents = 1, + .extent_start = { &ctx->x86.pv.compat_m2p_mfn0 }, + }; + + rc = do_memory_op(xch, XENMEM_machphys_compat_mfn_list, + &xmml, sizeof(xmml)); + if ( rc || xmml.nr_extents != 1 ) + { + PERROR("Failed to get compat mfn list from Xen"); + rc = -1; + goto err; + } + } +#endif + + /* All Done */ + rc = 0; + DPRINTF("max_mfn %#lx", ctx->x86.pv.max_mfn); + + err: + free(entries); + free(extents_start); + + return rc; +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_common_x86_pv.h b/tools/libs/guest/xg_sr_common_x86_pv.h new file mode 100644 index 0000000000..953b5bfb8d --- /dev/null +++ b/tools/libs/guest/xg_sr_common_x86_pv.h @@ -0,0 +1,109 @@ +#ifndef __COMMON_X86_PV_H +#define __COMMON_X86_PV_H + +#include "xg_sr_common_x86.h" + +/* Virtual address ranges reserved for hypervisor. */ +#define HYPERVISOR_VIRT_START_X86_64 0xFFFF800000000000ULL +#define HYPERVISOR_VIRT_END_X86_64 0xFFFF87FFFFFFFFFFULL + +#define HYPERVISOR_VIRT_START_X86_32 0x00000000F5800000ULL +#define HYPERVISOR_VIRT_END_X86_32 0x00000000FFFFFFFFULL + +/* + * Convert an mfn to a pfn, given Xen's m2p table. + * + * Caller must ensure that the requested mfn is in range. + */ +xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn); + +/* + * Query whether a particular mfn is valid in the physmap of a guest. + */ +bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn); + +/* + * Debug a particular mfn by walking the p2m and m2p. + */ +void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn); + +/* + * Convert a PV cr3 field to an mfn. 
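+ * + * For illustration (32bit guest): cr3 0x23456001 unpacks to mfn 0x123456; + * cr3 bits 31:12 hold mfn bits 19:0, and cr3 bits 11:0 hold mfn bits 31:20.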
+ * + * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into + * a 32bit architectural cr3. + */ +xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3); + +/* + * Convert an mfn to a PV cr3 field. + * + * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into + * a 32bit architectural cr3. + */ +uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t mfn); + +/* Bits 12 through 51 of a PTE point at the frame */ +#define PTE_FRAME_MASK 0x000ffffffffff000ULL + +/* + * Extract an mfn from a Pagetable Entry. May return INVALID_MFN if the pte + * would overflow a 32bit xen_pfn_t. + */ +static inline xen_pfn_t pte_to_frame(uint64_t pte) +{ + uint64_t frame = (pte & PTE_FRAME_MASK) >> PAGE_SHIFT; + +#ifdef __i386__ + if ( frame >= INVALID_MFN ) + return INVALID_MFN; +#endif + + return frame; +} + +/* + * Change the frame in a Pagetable Entry while leaving the flags alone. + */ +static inline uint64_t merge_pte(uint64_t pte, xen_pfn_t mfn) +{ + return (pte & ~PTE_FRAME_MASK) | ((uint64_t)mfn << PAGE_SHIFT); +} + +/* + * Get current domain information. + * + * Fills ctx->x86.pv + * - .width + * - .levels + * + * Used by the save side to create the X86_PV_INFO record, and by the restore + * side to verify the incoming stream. + * + * Returns 0 on success and non-zero on error. + */ +int x86_pv_domain_info(struct xc_sr_context *ctx); + +/* + * Maps the Xen M2P. + * + * Fills ctx->x86.pv. + * - .max_mfn + * - .m2p + * + * Returns 0 on success and non-zero on error. + */ +int x86_pv_map_m2p(struct xc_sr_context *ctx); + +#endif +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c new file mode 100644 index 0000000000..b57a787519 --- /dev/null +++ b/tools/libs/guest/xg_sr_restore.c @@ -0,0 +1,986 @@ +#include <arpa/inet.h> + +#include <assert.h> + +#include "xg_sr_common.h" + +/* + * Read and validate the Image and Domain headers.
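+ * + * (The Image Header's id/version/options fields travel in network byte + * order, hence the ntohl()/ntohs() fixups below; the marker field is + * chosen to be endian-invariant.)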
+ */ +static int read_headers(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_ihdr ihdr; + struct xc_sr_dhdr dhdr; + + if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) ) + { + PERROR("Failed to read Image Header from stream"); + return -1; + } + + ihdr.id = ntohl(ihdr.id); + ihdr.version = ntohl(ihdr.version); + ihdr.options = ntohs(ihdr.options); + + if ( ihdr.marker != IHDR_MARKER ) + { + ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker); + return -1; + } + + if ( ihdr.id != IHDR_ID ) + { + ERROR("Invalid ID: Expected 0x%08x, Got 0x%08x", IHDR_ID, ihdr.id); + return -1; + } + + if ( ihdr.version < 2 || ihdr.version > 3 ) + { + ERROR("Invalid Version: Expected 2 <= ver <= 3, Got %d", + ihdr.version); + return -1; + } + + if ( ihdr.options & IHDR_OPT_BIG_ENDIAN ) + { + ERROR("Unable to handle big endian streams"); + return -1; + } + + ctx->restore.format_version = ihdr.version; + + if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) ) + { + PERROR("Failed to read Domain Header from stream"); + return -1; + } + + ctx->restore.guest_type = dhdr.type; + ctx->restore.guest_page_size = (1U << dhdr.page_shift); + + if ( dhdr.xen_major == 0 ) + { + IPRINTF("Found %s domain, converted from legacy stream format", + dhdr_type_to_str(dhdr.type)); + DPRINTF(" Legacy conversion script version %u", dhdr.xen_minor); + } + else + IPRINTF("Found %s domain from Xen %u.%u", + dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor); + return 0; +} + +/* + * Is a pfn populated? + */ +static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + if ( pfn > ctx->restore.max_populated_pfn ) + return false; + return test_bit(pfn, ctx->restore.populated_pfns); +} + +/* + * Set a pfn as populated, expanding the tracking structures if needed. To + * avoid realloc()ing too excessively, the size is increased to the nearest + * power of two large enough to contain the required pfn. + */ +static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xc_interface *xch = ctx->xch; + + if ( pfn > ctx->restore.max_populated_pfn ) + { + xen_pfn_t new_max; + size_t old_sz, new_sz; + unsigned long *p; + + /* Round up to the nearest power of two larger than pfn, less 1. */ + new_max = pfn; + new_max |= new_max >> 1; + new_max |= new_max >> 2; + new_max |= new_max >> 4; + new_max |= new_max >> 8; + new_max |= new_max >> 16; +#ifdef __x86_64__ + new_max |= new_max >> 32; +#endif + + old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1); + new_sz = bitmap_size(new_max + 1); + p = realloc(ctx->restore.populated_pfns, new_sz); + if ( !p ) + { + ERROR("Failed to realloc populated bitmap"); + errno = ENOMEM; + return -1; + } + + memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz); + + ctx->restore.populated_pfns = p; + ctx->restore.max_populated_pfn = new_max; + } + + assert(!test_bit(pfn, ctx->restore.populated_pfns)); + set_bit(pfn, ctx->restore.populated_pfns); + + return 0; +} + +/* + * Given a set of pfns, obtain memory from Xen to fill the physmap for the + * unpopulated subset. If types is NULL, no page type checking is performed + * and all unpopulated pfns are populated.
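+ * When types are provided, pfns with type XEN_DOMCTL_PFINFO_XTAB or + * XEN_DOMCTL_PFINFO_BROKEN have no backing memory and are skipped.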
+ */ +int populate_pfns(struct xc_sr_context *ctx, unsigned int count, + const xen_pfn_t *original_pfns, const uint32_t *types) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t *mfns = malloc(count * sizeof(*mfns)), + *pfns = malloc(count * sizeof(*pfns)); + unsigned int i, nr_pfns = 0; + int rc = -1; + + if ( !mfns || !pfns ) + { + ERROR("Failed to allocate %zu bytes for populating the physmap", + 2 * count * sizeof(*mfns)); + goto err; + } + + for ( i = 0; i < count; ++i ) + { + if ( (!types || + (types[i] != XEN_DOMCTL_PFINFO_XTAB && + types[i] != XEN_DOMCTL_PFINFO_BROKEN)) && + !pfn_is_populated(ctx, original_pfns[i]) ) + { + rc = pfn_set_populated(ctx, original_pfns[i]); + if ( rc ) + goto err; + pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i]; + ++nr_pfns; + } + } + + if ( nr_pfns ) + { + rc = xc_domain_populate_physmap_exact( + xch, ctx->domid, nr_pfns, 0, 0, mfns); + if ( rc ) + { + PERROR("Failed to populate physmap"); + goto err; + } + + for ( i = 0; i < nr_pfns; ++i ) + { + if ( mfns[i] == INVALID_MFN ) + { + ERROR("Populate physmap failed for pfn %#"PRIpfn" (index %u)", + pfns[i], i); + rc = -1; + goto err; + } + + ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]); + } + } + + rc = 0; + + err: + free(pfns); + free(mfns); + + return rc; +} + +/* + * Given a list of pfns, their types, and a block of page data from the + * stream, populate and record their types, map the relevant subset and copy + * the data into the guest. + */ +static int process_page_data(struct xc_sr_context *ctx, unsigned int count, + xen_pfn_t *pfns, uint32_t *types, void *page_data) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t *mfns = malloc(count * sizeof(*mfns)); + int *map_errs = malloc(count * sizeof(*map_errs)); + int rc; + void *mapping = NULL, *guest_page = NULL; + unsigned int i, /* i indexes the pfns from the record. */ + j, /* j indexes the subset of pfns we decide to map. */ + nr_pages = 0; + + if ( !mfns || !map_errs ) + { + rc = -1; + ERROR("Failed to allocate %zu bytes to process page data", + count * (sizeof(*mfns) + sizeof(*map_errs))); + goto err; + } + + rc = populate_pfns(ctx, count, pfns, types); + if ( rc ) + { + ERROR("Failed to populate pfns for batch of %u pages", count); + goto err; + } + + for ( i = 0; i < count; ++i ) + { + ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]); + + switch ( types[i] ) + { + case XEN_DOMCTL_PFINFO_NOTAB: + + case XEN_DOMCTL_PFINFO_L1TAB: + case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB: + + case XEN_DOMCTL_PFINFO_L2TAB: + case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB: + + case XEN_DOMCTL_PFINFO_L3TAB: + case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB: + + case XEN_DOMCTL_PFINFO_L4TAB: + case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB: + + mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]); + break; + } + } + + /* Nothing to do? */ + if ( nr_pages == 0 ) + goto done; + + mapping = guest_page = xenforeignmemory_map( + xch->fmem, ctx->domid, PROT_READ | PROT_WRITE, + nr_pages, mfns, map_errs); + if ( !mapping ) + { + rc = -1; + PERROR("Unable to map %u mfns for %u pages of data", + nr_pages, count); + goto err; + } + + for ( i = 0, j = 0; i < count; ++i ) + { + switch ( types[i] ) + { + case XEN_DOMCTL_PFINFO_XTAB: + case XEN_DOMCTL_PFINFO_BROKEN: + case XEN_DOMCTL_PFINFO_XALLOC: + /* No page data to deal with.
*/ + continue; + } + + if ( map_errs[j] ) + { + rc = -1; + ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d", + pfns[i], mfns[j], types[i], map_errs[j]); + goto err; + } + + /* Undo page normalisation done by the saver. */ + rc = ctx->restore.ops.localise_page(ctx, types[i], page_data); + if ( rc ) + { + ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")", + pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT); + goto err; + } + + if ( ctx->restore.verify ) + { + /* Verify mode - compare incoming data to what we already have. */ + if ( memcmp(guest_page, page_data, PAGE_SIZE) ) + ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")", + pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT); + } + else + { + /* Regular mode - copy incoming data into place. */ + memcpy(guest_page, page_data, PAGE_SIZE); + } + + ++j; + guest_page += PAGE_SIZE; + page_data += PAGE_SIZE; + } + + done: + rc = 0; + + err: + if ( mapping ) + xenforeignmemory_unmap(xch->fmem, mapping, nr_pages); + + free(map_errs); + free(mfns); + + return rc; +} + +/* + * Validate a PAGE_DATA record from the stream, and pass the results to + * process_page_data() to actually perform the legwork. + */ +static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rec_page_data_header *pages = rec->data; + unsigned int i, pages_of_data = 0; + int rc = -1; + + xen_pfn_t *pfns = NULL, pfn; + uint32_t *types = NULL, type; + + /* + * v2 compatibility only exists for x86 streams. This is a bit of a + * bodge, but it is less bad than duplicating handle_page_data() between + * different architectures. + */ +#if defined(__i386__) || defined(__x86_64__) + /* v2 compat. Infer the position of STATIC_DATA_END. */ + if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end ) + { + rc = handle_static_data_end(ctx); + if ( rc ) + { + ERROR("Inferred STATIC_DATA_END record failed"); + goto err; + } + rc = -1; + } + + if ( !ctx->restore.seen_static_data_end ) + { + ERROR("No STATIC_DATA_END seen"); + goto err; + } +#endif + + if ( rec->length < sizeof(*pages) ) + { + ERROR("PAGE_DATA record truncated: length %u, min %zu", + rec->length, sizeof(*pages)); + goto err; + } + + if ( pages->count < 1 ) + { + ERROR("Expected at least 1 pfn in PAGE_DATA record"); + goto err; + } + + if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) ) + { + ERROR("PAGE_DATA record (length %u) too short to contain %u" + " pfns worth of information", rec->length, pages->count); + goto err; + } + + pfns = malloc(pages->count * sizeof(*pfns)); + types = malloc(pages->count * sizeof(*types)); + if ( !pfns || !types ) + { + ERROR("Unable to allocate enough memory for %u pfns", + pages->count); + goto err; + } + + for ( i = 0; i < pages->count; ++i ) + { + pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK; + if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) ) + { + ERROR("pfn %#"PRIpfn" (index %u) outside domain maximum", pfn, i); + goto err; + } + + type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32; + if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) && + ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) ) + { + ERROR("Invalid type %#"PRIx32" for pfn %#"PRIpfn" (index %u)", + type, pfn, i); + goto err; + } + + if ( type < XEN_DOMCTL_PFINFO_BROKEN ) + /* NOTAB and all L1 through L4 tables (including pinned) should + * have a page worth of data in the record. 
*/ + pages_of_data++; + + pfns[i] = pfn; + types[i] = type; + } + + if ( rec->length != (sizeof(*pages) + + (sizeof(uint64_t) * pages->count) + + (PAGE_SIZE * pages_of_data)) ) + { + ERROR("PAGE_DATA record wrong size: length %u, expected " + "%zu + %zu + %lu", rec->length, sizeof(*pages), + (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data)); + goto err; + } + + rc = process_page_data(ctx, pages->count, pfns, types, + &pages->pfn[pages->count]); + err: + free(types); + free(pfns); + + return rc; +} + +/* + * Send checkpoint dirty pfn list to primary. + */ +static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc = -1; + unsigned int count, written; + uint64_t i, *pfns = NULL; + struct iovec *iov = NULL; + xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size }; + struct xc_sr_record rec = { + .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST, + }; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->restore.dirty_bitmap_hbuf); + + if ( xc_shadow_control( + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, + HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size, + NULL, 0, &stats) != ctx->restore.p2m_size ) + { + PERROR("Failed to retrieve logdirty bitmap"); + goto err; + } + + for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ ) + { + if ( test_bit(i, dirty_bitmap) ) + count++; + } + + pfns = malloc(count * sizeof(*pfns)); + if ( !pfns ) + { + ERROR("Unable to allocate %zu bytes of memory for dirty pfn list", + count * sizeof(*pfns)); + goto err; + } + + for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i ) + { + if ( !test_bit(i, dirty_bitmap) ) + continue; + + if ( written >= count ) + { + ERROR("Dirty pfn list exceeds expected count"); + goto err; + } + + pfns[written++] = i; + } + + /* iovec[] for writev().
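+ * + * The record is framed on the wire like any other: a uint32_t type, a + * uint32_t length, then the payload (here the pfn list). No explicit + * padding is needed as the payload is already a multiple of 8 bytes.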
*/ + iov = malloc(3 * sizeof(*iov)); + if ( !iov ) + { + ERROR("Unable to allocate memory for sending dirty bitmap"); + goto err; + } + + rec.length = count * sizeof(*pfns); + + iov[0].iov_base = &rec.type; + iov[0].iov_len = sizeof(rec.type); + + iov[1].iov_base = &rec.length; + iov[1].iov_len = sizeof(rec.length); + + iov[2].iov_base = pfns; + iov[2].iov_len = count * sizeof(*pfns); + + if ( writev_exact(ctx->restore.send_back_fd, iov, 3) ) + { + PERROR("Failed to write dirty bitmap to stream"); + goto err; + } + + rc = 0; + err: + free(pfns); + free(iov); + return rc; +} + +static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec); +static int handle_checkpoint(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc = 0, ret; + unsigned int i; + + if ( ctx->stream_type == XC_STREAM_PLAIN ) + { + ERROR("Found checkpoint in non-checkpointed stream"); + rc = -1; + goto err; + } + + ret = ctx->restore.callbacks->checkpoint(ctx->restore.callbacks->data); + switch ( ret ) + { + case XGR_CHECKPOINT_SUCCESS: + break; + + case XGR_CHECKPOINT_FAILOVER: + if ( ctx->restore.buffer_all_records ) + rc = BROKEN_CHANNEL; + else + /* We don't have a consistent state */ + rc = -1; + goto err; + + default: /* Other fatal error */ + rc = -1; + goto err; + } + + if ( ctx->restore.buffer_all_records ) + { + IPRINTF("All records buffered"); + + for ( i = 0; i < ctx->restore.buffered_rec_num; i++ ) + { + rc = process_record(ctx, &ctx->restore.buffered_records[i]); + if ( rc ) + goto err; + } + ctx->restore.buffered_rec_num = 0; + IPRINTF("All records processed"); + } + else + ctx->restore.buffer_all_records = true; + + if ( ctx->stream_type == XC_STREAM_COLO ) + { +#define HANDLE_CALLBACK_RETURN_VALUE(ret) \ + do { \ + if ( ret == 1 ) \ + rc = 0; /* Success */ \ + else \ + { \ + if ( ret == 2 ) \ + rc = BROKEN_CHANNEL; \ + else \ + rc = -1; /* Some unspecified error */ \ + goto err; \ + } \ + } while (0) + + /* COLO */ + + /* We need to resume guest */ + rc = ctx->restore.ops.stream_complete(ctx); + if ( rc ) + goto err; + + ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn, + ctx->restore.console_gfn, + ctx->restore.callbacks->data); + + /* Resume secondary vm */ + ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + + /* Wait for a new checkpoint */ + ret = ctx->restore.callbacks->wait_checkpoint( + ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + + /* suspend secondary vm */ + ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data); + HANDLE_CALLBACK_RETURN_VALUE(ret); + +#undef HANDLE_CALLBACK_RETURN_VALUE + + rc = send_checkpoint_dirty_pfn_list(ctx); + if ( rc ) + goto err; + } + + err: + return rc; +} + +static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + unsigned int new_alloc_num; + struct xc_sr_record *p; + + if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num ) + { + new_alloc_num = ctx->restore.allocated_rec_num + DEFAULT_BUF_RECORDS; + p = realloc(ctx->restore.buffered_records, + new_alloc_num * sizeof(struct xc_sr_record)); + if ( !p ) + { + ERROR("Failed to realloc memory for buffered records"); + return -1; + } + + ctx->restore.buffered_records = p; + ctx->restore.allocated_rec_num = new_alloc_num; + } + + memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++], + rec, sizeof(*rec)); + + return 0; +} + +int handle_static_data_end(struct xc_sr_context *ctx) +{ + 
xc_interface *xch = ctx->xch; + unsigned int missing = 0; + int rc = 0; + + if ( ctx->restore.seen_static_data_end ) + { + ERROR("Multiple STATIC_DATA_END records found"); + return -1; + } + + ctx->restore.seen_static_data_end = true; + + rc = ctx->restore.ops.static_data_complete(ctx, &missing); + if ( rc ) + return rc; + + if ( ctx->restore.callbacks->static_data_done && + ((rc = ctx->restore.callbacks->static_data_done( + missing, ctx->restore.callbacks->data)) != 0) ) + ERROR("static_data_done() callback failed: %d", rc); + + return rc; +} + +static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + int rc = 0; + + switch ( rec->type ) + { + case REC_TYPE_END: + break; + + case REC_TYPE_PAGE_DATA: + rc = handle_page_data(ctx, rec); + break; + + case REC_TYPE_VERIFY: + DPRINTF("Verify mode enabled"); + ctx->restore.verify = true; + break; + + case REC_TYPE_CHECKPOINT: + rc = handle_checkpoint(ctx); + break; + + case REC_TYPE_STATIC_DATA_END: + rc = handle_static_data_end(ctx); + break; + + default: + rc = ctx->restore.ops.process_record(ctx, rec); + break; + } + + free(rec->data); + rec->data = NULL; + + return rc; +} + +static int setup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->restore.dirty_bitmap_hbuf); + + if ( ctx->stream_type == XC_STREAM_COLO ) + { + dirty_bitmap = xc_hypercall_buffer_alloc_pages( + xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size))); + + if ( !dirty_bitmap ) + { + ERROR("Unable to allocate memory for dirty bitmap"); + rc = -1; + goto err; + } + } + + rc = ctx->restore.ops.setup(ctx); + if ( rc ) + goto err; + + ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1; + ctx->restore.populated_pfns = bitmap_alloc( + ctx->restore.max_populated_pfn + 1); + if ( !ctx->restore.populated_pfns ) + { + ERROR("Unable to allocate memory for populated_pfns bitmap"); + rc = -1; + goto err; + } + + ctx->restore.buffered_records = malloc( + DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record)); + if ( !ctx->restore.buffered_records ) + { + ERROR("Unable to allocate memory for buffered records"); + rc = -1; + goto err; + } + ctx->restore.allocated_rec_num = DEFAULT_BUF_RECORDS; + + err: + return rc; +} + +static void cleanup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + unsigned int i; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->restore.dirty_bitmap_hbuf); + + for ( i = 0; i < ctx->restore.buffered_rec_num; i++ ) + free(ctx->restore.buffered_records[i].data); + + if ( ctx->stream_type == XC_STREAM_COLO ) + xc_hypercall_buffer_free_pages( + xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size))); + + free(ctx->restore.buffered_records); + free(ctx->restore.populated_pfns); + + if ( ctx->restore.ops.cleanup(ctx) ) + PERROR("Failed to clean up"); +} + +/* + * Restore a domain.
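+ * + * (Overall flow: read records one at a time; in a checkpointed stream, + * buffer them and replay at each CHECKPOINT record, otherwise process + * them immediately; stop at the END record and finish off via the + * guest-type ops' stream_complete().)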
+ */ +static int restore(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_record rec; + int rc, saved_rc = 0, saved_errno = 0; + + IPRINTF("Restoring domain"); + + rc = setup(ctx); + if ( rc ) + goto err; + + do + { + rc = read_record(ctx, ctx->fd, &rec); + if ( rc ) + { + if ( ctx->restore.buffer_all_records ) + goto remus_failover; + else + goto err; + } + + if ( ctx->restore.buffer_all_records && + rec.type != REC_TYPE_END && + rec.type != REC_TYPE_CHECKPOINT ) + { + rc = buffer_record(ctx, &rec); + if ( rc ) + goto err; + } + else + { + rc = process_record(ctx, &rec); + if ( rc == RECORD_NOT_PROCESSED ) + { + if ( rec.type & REC_TYPE_OPTIONAL ) + DPRINTF("Ignoring optional record %#x (%s)", + rec.type, rec_type_to_str(rec.type)); + else + { + ERROR("Mandatory record %#x (%s) not handled", + rec.type, rec_type_to_str(rec.type)); + rc = -1; + goto err; + } + } + else if ( rc == BROKEN_CHANNEL ) + goto remus_failover; + else if ( rc ) + goto err; + } + + } while ( rec.type != REC_TYPE_END ); + + remus_failover: + if ( ctx->stream_type == XC_STREAM_COLO ) + { + /* With COLO, we have already called stream_complete */ + rc = 0; + IPRINTF("COLO Failover"); + goto done; + } + + /* + * With Remus, if we reach here, there must be some error on the primary; + * fail over from the last checkpoint state. + */ + rc = ctx->restore.ops.stream_complete(ctx); + if ( rc ) + goto err; + + IPRINTF("Restore successful"); + goto done; + + err: + saved_errno = errno; + saved_rc = rc; + PERROR("Restore failed"); + + done: + cleanup(ctx); + + if ( saved_rc ) + { + rc = saved_rc; + errno = saved_errno; + } + + return rc; +} + +int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, + unsigned int store_evtchn, unsigned long *store_mfn, + uint32_t store_domid, unsigned int console_evtchn, + unsigned long *console_gfn, uint32_t console_domid, + xc_stream_type_t stream_type, + struct restore_callbacks *callbacks, int send_back_fd) +{ + xen_pfn_t nr_pfns; + struct xc_sr_context ctx = { + .xch = xch, + .fd = io_fd, + .stream_type = stream_type, + }; + + /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */ + ctx.restore.console_evtchn = console_evtchn; + ctx.restore.console_domid = console_domid; + ctx.restore.xenstore_evtchn = store_evtchn; + ctx.restore.xenstore_domid = store_domid; + ctx.restore.callbacks = callbacks; + ctx.restore.send_back_fd = send_back_fd; + + /* Sanity check stream_type-related parameters */ + switch ( stream_type ) + { + case XC_STREAM_COLO: + assert(callbacks->suspend && + callbacks->postcopy && + callbacks->wait_checkpoint && + callbacks->restore_results); + /* Fallthrough */ + case XC_STREAM_REMUS: + assert(callbacks->checkpoint); + /* Fallthrough */ + case XC_STREAM_PLAIN: + break; + + default: + assert(!"Bad stream_type"); + break; + } + + if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 ) + { + PERROR("Failed to get domain info"); + return -1; + } + + if ( ctx.dominfo.domid != dom ) + { + ERROR("Domain %u does not exist", dom); + return -1; + } + + DPRINTF("fd %d, dom %u, hvm %u, stream_type %d", + io_fd, dom, ctx.dominfo.hvm, stream_type); + + ctx.domid = dom; + + if ( read_headers(&ctx) ) + return -1; + + if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 ) + { + PERROR("Unable to obtain the guest p2m size"); + return -1; + } + + ctx.restore.p2m_size = nr_pfns; + ctx.restore.ops = ctx.dominfo.hvm + ?
restore_ops_x86_hvm : restore_ops_x86_pv; + + if ( restore(&ctx) ) + return -1; + + IPRINTF("XenStore: mfn %#"PRIpfn", dom %d, evt %u", + ctx.restore.xenstore_gfn, + ctx.restore.xenstore_domid, + ctx.restore.xenstore_evtchn); + + IPRINTF("Console: mfn %#"PRIpfn", dom %d, evt %u", + ctx.restore.console_gfn, + ctx.restore.console_domid, + ctx.restore.console_evtchn); + + *console_gfn = ctx.restore.console_gfn; + *store_mfn = ctx.restore.xenstore_gfn; + + return 0; +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c new file mode 100644 index 0000000000..d6ea6f3012 --- /dev/null +++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c @@ -0,0 +1,274 @@ +#include <assert.h> +#include <arpa/inet.h> + +#include "xg_sr_common_x86.h" + +/* + * Process an HVM_CONTEXT record from the stream. + */ +static int handle_hvm_context(struct xc_sr_context *ctx, + struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + int rc = update_blob(&ctx->x86.hvm.restore.context, rec->data, rec->length); + + if ( rc ) + ERROR("Unable to allocate %u bytes for hvm context", rec->length); + + return rc; +} + +/* + * Process an HVM_PARAMS record from the stream. + */ +static int handle_hvm_params(struct xc_sr_context *ctx, + struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rec_hvm_params *hdr = rec->data; + struct xc_sr_rec_hvm_params_entry *entry = hdr->param; + unsigned int i; + int rc; + + if ( rec->length < sizeof(*hdr) ) + { + ERROR("HVM_PARAMS record truncated: length %u, header size %zu", + rec->length, sizeof(*hdr)); + return -1; + } + + if ( rec->length != (sizeof(*hdr) + hdr->count * sizeof(*entry)) ) + { + ERROR("HVM_PARAMS record truncated: header %zu, count %u, " + "expected len %zu, got %u", + sizeof(*hdr), hdr->count, hdr->count * sizeof(*entry), + rec->length); + return -1; + } + + /* + * Tolerate empty records. Older sending sides used to accidentally + * generate them. + */ + if ( hdr->count == 0 ) + { + DBGPRINTF("Skipping empty HVM_PARAMS record\n"); + return 0; + } + + for ( i = 0; i < hdr->count; i++, entry++ ) + { + switch ( entry->index ) + { + case HVM_PARAM_CONSOLE_PFN: + ctx->restore.console_gfn = entry->value; + xc_clear_domain_page(xch, ctx->domid, entry->value); + break; + case HVM_PARAM_STORE_PFN: + ctx->restore.xenstore_gfn = entry->value; + xc_clear_domain_page(xch, ctx->domid, entry->value); + break; + case HVM_PARAM_IOREQ_PFN: + case HVM_PARAM_BUFIOREQ_PFN: + xc_clear_domain_page(xch, ctx->domid, entry->value); + break; + + case HVM_PARAM_PAE_ENABLED: + /* + * This HVM_PARAM only ever existed to pass data into + * xc_cpuid_apply_policy(). The function has now been updated to + * use a normal calling convention, making the param obsolete. + * + * Discard if we find it in an old migration stream. + */ + continue; + } + + rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value); + if ( rc < 0 ) + { + PERROR("set HVM param %"PRId64" = 0x%016"PRIx64, + entry->index, entry->value); + return rc; + } + } + return 0; +} + +/* restore_ops function. */ +static bool x86_hvm_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + return true; +} + +/* restore_ops function. */ +static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx, + xen_pfn_t pfn) +{ + return pfn; +} + +/* restore_ops function.
*/ +static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn, + xen_pfn_t gfn) +{ + /* no op */ +} + +/* restore_ops function. */ +static void x86_hvm_set_page_type(struct xc_sr_context *ctx, + xen_pfn_t pfn, xen_pfn_t type) +{ + /* no-op */ +} + +/* restore_ops function. */ +static int x86_hvm_localise_page(struct xc_sr_context *ctx, + uint32_t type, void *page) +{ + /* no-op */ + return 0; +} + +/* + * restore_ops function. Confirms the stream matches the domain. + */ +static int x86_hvm_setup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + + if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM ) + { + ERROR("Unable to restore %s domain into an x86 HVM domain", + dhdr_type_to_str(ctx->restore.guest_type)); + return -1; + } + + if ( ctx->restore.guest_page_size != PAGE_SIZE ) + { + ERROR("Invalid page size %u for x86 HVM domains", + ctx->restore.guest_page_size); + return -1; + } + +#ifdef __i386__ + /* Very large domains (> 1TB) will exhaust virtual address space. */ + if ( ctx->restore.p2m_size > 0x0fffffff ) + { + errno = E2BIG; + PERROR("Cannot restore this big a guest"); + return -1; + } +#endif + + return 0; +} + +/* + * restore_ops function. + */ +static int x86_hvm_process_record(struct xc_sr_context *ctx, + struct xc_sr_record *rec) +{ + switch ( rec->type ) + { + case REC_TYPE_X86_TSC_INFO: + return handle_x86_tsc_info(ctx, rec); + + case REC_TYPE_HVM_CONTEXT: + return handle_hvm_context(ctx, rec); + + case REC_TYPE_HVM_PARAMS: + return handle_hvm_params(ctx, rec); + + case REC_TYPE_X86_CPUID_POLICY: + return handle_x86_cpuid_policy(ctx, rec); + + case REC_TYPE_X86_MSR_POLICY: + return handle_x86_msr_policy(ctx, rec); + + default: + return RECORD_NOT_PROCESSED; + } +} + +/* + * restore_ops function. Sets extra hvm parameters and seeds the grant table. 
+ */ +static int x86_hvm_stream_complete(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + + rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_STORE_EVTCHN, + ctx->restore.xenstore_evtchn); + if ( rc ) + { + PERROR("Failed to set HVM_PARAM_STORE_EVTCHN"); + return rc; + } + + rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_CONSOLE_EVTCHN, + ctx->restore.console_evtchn); + if ( rc ) + { + PERROR("Failed to set HVM_PARAM_CONSOLE_EVTCHN"); + return rc; + } + + rc = xc_domain_hvm_setcontext(xch, ctx->domid, + ctx->x86.hvm.restore.context.ptr, + ctx->x86.hvm.restore.context.size); + if ( rc < 0 ) + { + PERROR("Unable to restore HVM context"); + return rc; + } + + rc = xc_dom_gnttab_seed(xch, ctx->domid, true, + ctx->restore.console_gfn, + ctx->restore.xenstore_gfn, + ctx->restore.console_domid, + ctx->restore.xenstore_domid); + if ( rc ) + { + PERROR("Failed to seed grant table"); + return rc; + } + + return rc; +} + +static int x86_hvm_cleanup(struct xc_sr_context *ctx) +{ + free(ctx->x86.hvm.restore.context.ptr); + + free(ctx->x86.restore.cpuid.ptr); + free(ctx->x86.restore.msr.ptr); + + return 0; +} + +struct xc_sr_restore_ops restore_ops_x86_hvm = +{ + .pfn_is_valid = x86_hvm_pfn_is_valid, + .pfn_to_gfn = x86_hvm_pfn_to_gfn, + .set_gfn = x86_hvm_set_gfn, + .set_page_type = x86_hvm_set_page_type, + .localise_page = x86_hvm_localise_page, + .setup = x86_hvm_setup, + .process_record = x86_hvm_process_record, + .static_data_complete = x86_static_data_complete, + .stream_complete = x86_hvm_stream_complete, + .cleanup = x86_hvm_cleanup, +}; + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c new file mode 100644 index 0000000000..dc50b0f5a8 --- /dev/null +++ b/tools/libs/guest/xg_sr_restore_x86_pv.c @@ -0,0 +1,1210 @@ +#include <assert.h> + +#include "xg_sr_common_x86_pv.h" + +static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + assert(pfn <= ctx->x86.pv.max_pfn); + + return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width); +} + +/* + * Expand our local tracking information for the p2m table and the domain's + * maximum size. Normally this will be called once to expand from 0 to + * max_pfn, but is liable to expand multiple times if the domain grows on the + * sending side after migration has started.
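+ * + * (For illustration: with an 8 byte guest width, one 4096 byte frame holds + * fpp = 4096/8 = 512 p2m entries, so a max_pfn of 0x7ff fits in + * end_frame = (0x7ff / 512) + 1 = 4 frames.)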
+ */ +static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn) +{ + xc_interface *xch = ctx->xch; + unsigned long old_max = ctx->x86.pv.max_pfn, i; + unsigned int fpp = PAGE_SIZE / ctx->x86.pv.width; + unsigned long end_frame = (max_pfn / fpp) + 1; + unsigned long old_end_frame = (old_max / fpp) + 1; + xen_pfn_t *p2m = NULL, *p2m_pfns = NULL; + uint32_t *pfn_types = NULL; + size_t p2msz, p2m_pfnsz, pfn_typesz; + + assert(max_pfn > old_max); + + p2msz = (max_pfn + 1) * ctx->x86.pv.width; + p2m = realloc(ctx->x86.pv.p2m, p2msz); + if ( !p2m ) + { + ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz); + return -1; + } + ctx->x86.pv.p2m = p2m; + + pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types); + pfn_types = realloc(ctx->x86.pv.restore.pfn_types, pfn_typesz); + if ( !pfn_types ) + { + ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz); + return -1; + } + ctx->x86.pv.restore.pfn_types = pfn_types; + + p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns); + p2m_pfns = realloc(ctx->x86.pv.p2m_pfns, p2m_pfnsz); + if ( !p2m_pfns ) + { + ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz); + return -1; + } + ctx->x86.pv.p2m_frames = end_frame; + ctx->x86.pv.p2m_pfns = p2m_pfns; + + ctx->x86.pv.max_pfn = max_pfn; + for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i ) + { + ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN); + ctx->restore.ops.set_page_type(ctx, i, 0); + } + + for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i ) + ctx->x86.pv.p2m_pfns[i] = INVALID_MFN; + + DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn); + return 0; +} + +/* + * Pin all of the pagetables. + */ +static int pin_pagetables(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + unsigned long i, nr_pins; + struct mmuext_op pin[MAX_PIN_BATCH]; + + for ( i = nr_pins = 0; i <= ctx->x86.pv.max_pfn; ++i ) + { + if ( (ctx->x86.pv.restore.pfn_types[i] & + XEN_DOMCTL_PFINFO_LPINTAB) == 0 ) + continue; + + switch ( (ctx->x86.pv.restore.pfn_types[i] & + XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ) + { + case XEN_DOMCTL_PFINFO_L1TAB: + pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE; + break; + case XEN_DOMCTL_PFINFO_L2TAB: + pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE; + break; + case XEN_DOMCTL_PFINFO_L3TAB: + pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE; + break; + case XEN_DOMCTL_PFINFO_L4TAB: + pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE; + break; + default: + continue; + } + + pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i); + nr_pins++; + + if ( nr_pins == MAX_PIN_BATCH ) + { + if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 ) + { + PERROR("Failed to pin batch of pagetables"); + return -1; + } + nr_pins = 0; + } + } + + if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) ) + { + PERROR("Failed to pin batch of pagetables"); + return -1; + } + + return 0; +} + +/* + * Update details in a guest's start_info structure.
+ */ +static int process_start_info(struct xc_sr_context *ctx, + vcpu_guest_context_any_t *vcpu) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t pfn, mfn; + start_info_any_t *guest_start_info = NULL; + int rc = -1; + + pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86.pv.width); + + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("Start Info pfn %#lx out of range", pfn); + goto err; + } + + if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB ) + { + ERROR("Start Info pfn %#lx has bad type %u", pfn, + (ctx->x86.pv.restore.pfn_types[pfn] >> + XEN_DOMCTL_PFINFO_LTAB_SHIFT)); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Start Info has bad mfn"); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86.pv.width); + guest_start_info = xc_map_foreign_range( + xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); + if ( !guest_start_info ) + { + PERROR("Failed to map Start Info at mfn %#lx", mfn); + goto err; + } + + /* Deal with xenstore stuff */ + pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86.pv.width); + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("XenStore pfn %#lx out of range", pfn); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("XenStore pfn has bad mfn"); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + ctx->restore.xenstore_gfn = mfn; + SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86.pv.width); + SET_FIELD(guest_start_info, store_evtchn, + ctx->restore.xenstore_evtchn, ctx->x86.pv.width); + + /* Deal with console stuff */ + pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86.pv.width); + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("Console pfn %#lx out of range", pfn); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Console pfn has bad mfn"); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + ctx->restore.console_gfn = mfn; + SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86.pv.width); + SET_FIELD(guest_start_info, console.domU.evtchn, + ctx->restore.console_evtchn, ctx->x86.pv.width); + + /* Set other information */ + SET_FIELD(guest_start_info, nr_pages, + ctx->x86.pv.max_pfn + 1, ctx->x86.pv.width); + SET_FIELD(guest_start_info, shared_info, + ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86.pv.width); + SET_FIELD(guest_start_info, flags, 0, ctx->x86.pv.width); + + rc = 0; + + err: + if ( guest_start_info ) + munmap(guest_start_info, PAGE_SIZE); + + return rc; +} + +/* + * Process one stashed vcpu worth of basic state and send to Xen. + */ +static int process_vcpu_basic(struct xc_sr_context *ctx, + unsigned int vcpuid) +{ + xc_interface *xch = ctx->xch; + vcpu_guest_context_any_t *vcpu = ctx->x86.pv.restore.vcpus[vcpuid].basic.ptr; + xen_pfn_t pfn, mfn; + unsigned int i, gdt_count; + int rc = -1; + + /* Vcpu 0 is special: Convert the suspend record to an mfn. */ + if ( vcpuid == 0 ) + { + rc = process_start_info(ctx, vcpu); + if ( rc ) + return rc; + rc = -1; + } + + SET_FIELD(vcpu, flags, + GET_FIELD(vcpu, flags, ctx->x86.pv.width) | VGCF_online, + ctx->x86.pv.width); + + gdt_count = GET_FIELD(vcpu, gdt_ents, ctx->x86.pv.width); + if ( gdt_count > FIRST_RESERVED_GDT_ENTRY ) + { + ERROR("GDT entry count (%u) out of range (max %u)", + gdt_count, FIRST_RESERVED_GDT_ENTRY); + errno = ERANGE; + goto err; + } + gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. 
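+ (512 8-byte descriptors fill one 4096-byte frame)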
*/ + + /* Convert GDT frames to mfns. */ + for ( i = 0; i < gdt_count; ++i ) + { + pfn = GET_FIELD(vcpu, gdt_frames[i], ctx->x86.pv.width); + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn); + goto err; + } + + if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) + { + ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn, + (ctx->x86.pv.restore.pfn_types[pfn] >> + XEN_DOMCTL_PFINFO_LTAB_SHIFT)); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("GDT frame %u has bad mfn", i); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + SET_FIELD(vcpu, gdt_frames[i], mfn, ctx->x86.pv.width); + } + + /* Convert CR3 to an mfn. */ + pfn = cr3_to_mfn(ctx, GET_FIELD(vcpu, ctrlreg[3], ctx->x86.pv.width)); + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("cr3 (pfn %#lx) out of range", pfn); + goto err; + } + + if ( (ctx->x86.pv.restore.pfn_types[pfn] & + XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != + (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) + { + ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn, + (ctx->x86.pv.restore.pfn_types[pfn] >> + XEN_DOMCTL_PFINFO_LTAB_SHIFT), + ctx->x86.pv.levels); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("cr3 has bad mfn"); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + SET_FIELD(vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86.pv.width); + + /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */ + if ( ctx->x86.pv.levels == 4 && (vcpu->x64.ctrlreg[1] & 1) ) + { + pfn = vcpu->x64.ctrlreg[1] >> PAGE_SHIFT; + + if ( pfn > ctx->x86.pv.max_pfn ) + { + ERROR("cr1 (pfn %#lx) out of range", pfn); + goto err; + } + + if ( (ctx->x86.pv.restore.pfn_types[pfn] & + XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != + (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) + { + ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn, + (ctx->x86.pv.restore.pfn_types[pfn] >> + XEN_DOMCTL_PFINFO_LTAB_SHIFT), + ctx->x86.pv.levels); + goto err; + } + + mfn = pfn_to_mfn(ctx, pfn); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("cr1 has bad mfn"); + dump_bad_pseudophysmap_entry(ctx, mfn); + goto err; + } + + vcpu->x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT; + } + + if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, vcpu) ) + { + PERROR("Failed to set vcpu%u's basic info", vcpuid); + goto err; + } + + rc = 0; + + err: + return rc; +} + +/* + * Process one stashed vcpu worth of extended state and send to Xen. + */ +static int process_vcpu_extended(struct xc_sr_context *ctx, + unsigned int vcpuid) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_x86_pv_restore_vcpu *vcpu = + &ctx->x86.pv.restore.vcpus[vcpuid]; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext; + domctl.domain = ctx->domid; + memcpy(&domctl.u.ext_vcpucontext, vcpu->extd.ptr, vcpu->extd.size); + + if ( xc_domctl(xch, &domctl) != 0 ) + { + PERROR("Failed to set vcpu%u's extended info", vcpuid); + return -1; + } + + return 0; +} + +/* + * Process one stashed vcpu worth of xsave state and send to Xen. 
+ */ +static int process_vcpu_xsave(struct xc_sr_context *ctx, + unsigned int vcpuid) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_x86_pv_restore_vcpu *vcpu = + &ctx->x86.pv.restore.vcpus[vcpuid]; + int rc; + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(void, buffer); + + buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsave.size); + if ( !buffer ) + { + ERROR("Unable to allocate %zu bytes for xsave hypercall buffer", + vcpu->xsave.size); + return -1; + } + + domctl.cmd = XEN_DOMCTL_setvcpuextstate; + domctl.domain = ctx->domid; + domctl.u.vcpuextstate.vcpu = vcpuid; + domctl.u.vcpuextstate.size = vcpu->xsave.size; + set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); + + memcpy(buffer, vcpu->xsave.ptr, vcpu->xsave.size); + + rc = xc_domctl(xch, &domctl); + if ( rc ) + PERROR("Failed to set vcpu%u's xsave info", vcpuid); + + xc_hypercall_buffer_free(xch, buffer); + + return rc; +} + +/* + * Process one stashed vcpu worth of msr state and send to Xen. + */ +static int process_vcpu_msrs(struct xc_sr_context *ctx, + unsigned int vcpuid) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_x86_pv_restore_vcpu *vcpu = + &ctx->x86.pv.restore.vcpus[vcpuid]; + int rc; + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(void, buffer); + + buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msr.size); + if ( !buffer ) + { + ERROR("Unable to allocate %zu bytes for msr hypercall buffer", + vcpu->msr.size); + return -1; + } + + domctl.cmd = XEN_DOMCTL_set_vcpu_msrs; + domctl.domain = ctx->domid; + domctl.u.vcpu_msrs.vcpu = vcpuid; + domctl.u.vcpu_msrs.msr_count = vcpu->msr.size / sizeof(xen_domctl_vcpu_msr_t); + set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer); + + memcpy(buffer, vcpu->msr.ptr, vcpu->msr.size); + + rc = xc_domctl(xch, &domctl); + if ( rc ) + PERROR("Failed to set vcpu%u's msrs", vcpuid); + + xc_hypercall_buffer_free(xch, buffer); + + return rc; +} + +/* + * Process all stashed vcpu context and send to Xen. + */ +static int update_vcpu_context(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_x86_pv_restore_vcpu *vcpu; + unsigned int i; + int rc = 0; + + for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i ) + { + vcpu = &ctx->x86.pv.restore.vcpus[i]; + + if ( vcpu->basic.ptr ) + { + rc = process_vcpu_basic(ctx, i); + if ( rc ) + return rc; + } + else if ( i == 0 ) + { + ERROR("Sender didn't send vcpu0's basic state"); + return -1; + } + + if ( vcpu->extd.ptr ) + { + rc = process_vcpu_extended(ctx, i); + if ( rc ) + return rc; + } + + if ( vcpu->xsave.ptr ) + { + rc = process_vcpu_xsave(ctx, i); + if ( rc ) + return rc; + } + + if ( vcpu->msr.ptr ) + { + rc = process_vcpu_msrs(ctx, i); + if ( rc ) + return rc; + } + } + + return rc; +} + +/* + * Copy the p2m which has been constructed locally as memory has been + * allocated, over the p2m in guest, so the guest can find its memory again on + * resume. 
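process_vcpu_xsave() and process_vcpu_msrs() above share one bounce-buffer idiom. Here is that pattern condensed into a sketch; it assumes the same private libxc helpers the functions above use (DECLARE_DOMCTL, xc_hypercall_buffer_alloc/free, xc_domctl), so it only compiles inside the libxenctrl tree, and the per-command guest-handle wiring is deliberately elided:

/* Condensed bounce-buffer pattern (sketch, not a drop-in helper):
 * allocate hypercall-safe memory, copy the payload in, point the
 * domctl at it, issue the call, then free the buffer. */
static int demo_set_blob(xc_interface *xch, uint32_t domid,
                         uint32_t cmd, const void *src, size_t size)
{
    DECLARE_DOMCTL;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
    int rc;

    buffer = xc_hypercall_buffer_alloc(xch, buffer, size);
    if ( !buffer )
        return -1;

    memcpy(buffer, src, size);

    domctl.cmd = cmd;
    domctl.domain = domid;
    /* ... set the relevant guest handle to 'buffer' here ... */

    rc = xc_domctl(xch, &domctl);

    xc_hypercall_buffer_free(xch, buffer);
    return rc;
}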
+ */
+static int update_guest_p2m(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t mfn, pfn, *guest_p2m = NULL;
+    unsigned int i;
+    int rc = -1;
+
+    for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
+    {
+        pfn = ctx->x86.pv.p2m_pfns[i];
+
+        if ( pfn > ctx->x86.pv.max_pfn )
+        {
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
+                  pfn, i);
+            goto err;
+        }
+
+        if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
+        {
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
+                  (ctx->x86.pv.restore.pfn_types[pfn] >>
+                   XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+            goto err;
+        }
+
+        mfn = pfn_to_mfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("p2m_frame_list[%u] has bad mfn", i);
+            dump_bad_pseudophysmap_entry(ctx, mfn);
+            goto err;
+        }
+
+        ctx->x86.pv.p2m_pfns[i] = mfn;
+    }
+
+    guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
+                                     ctx->x86.pv.p2m_pfns,
+                                     ctx->x86.pv.p2m_frames);
+    if ( !guest_p2m )
+    {
+        PERROR("Failed to map p2m frames");
+        goto err;
+    }
+
+    memcpy(guest_p2m, ctx->x86.pv.p2m,
+           (ctx->x86.pv.max_pfn + 1) * ctx->x86.pv.width);
+    rc = 0;
+
+ err:
+    if ( guest_p2m )
+        munmap(guest_p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * The valid width/pt_levels values in X86_PV_INFO are inextricably linked.
+ * Cross-check the legitimate combinations.
+ */
+static bool valid_x86_pv_info_combination(
+    const struct xc_sr_rec_x86_pv_info *info)
+{
+    switch ( info->guest_width )
+    {
+    case 4:  return info->pt_levels == 3;
+    case 8:  return info->pt_levels == 4;
+    default: return false;
+    }
+}
+
+/*
+ * Process an X86_PV_INFO record.
+ */
+static int handle_x86_pv_info(struct xc_sr_context *ctx,
+                              struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_x86_pv_info *info = rec->data;
+
+    if ( ctx->x86.pv.restore.seen_pv_info )
+    {
+        ERROR("Already received X86_PV_INFO record");
+        return -1;
+    }
+
+    if ( rec->length < sizeof(*info) )
+    {
+        ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
+              rec->length, sizeof(*info));
+        return -1;
+    }
+
+    if ( !valid_x86_pv_info_combination(info) )
+    {
+        ERROR("Invalid X86_PV_INFO combination: width %u, pt_levels %u",
+              info->guest_width, info->pt_levels);
+        return -1;
+    }
+
+    /*
+     * PV domains default to native width. For an incoming compat domain, we
+     * will typically be the first entity to inform Xen.
+     */
+    if ( info->guest_width != ctx->x86.pv.width )
+    {
+        struct xen_domctl domctl = {
+            .domain = ctx->domid,
+            .cmd = XEN_DOMCTL_set_address_size,
+            .u.address_size.size = info->guest_width * 8,
+        };
+        int rc = do_domctl(xch, &domctl);
+
+        if ( rc != 0 )
+        {
+            ERROR("Failed to update d%d address size to %u",
+                  ctx->domid, info->guest_width * 8);
+            return -1;
+        }
+
+        /* The domain's information has changed; refresh it. */
+        rc = x86_pv_domain_info(ctx);
+        if ( rc != 0 )
+        {
+            ERROR("Unable to refresh guest information");
+            return -1;
+        }
+    }
+
+    /* Sanity check (possibly new) domain settings. */
+    if ( (info->guest_width != ctx->x86.pv.width) ||
+         (info->pt_levels != ctx->x86.pv.levels) )
+    {
+        ERROR("X86_PV_INFO width/pt_levels settings %u/%u mismatch with d%d %u/%u",
+              info->guest_width, info->pt_levels, ctx->domid,
+              ctx->x86.pv.width, ctx->x86.pv.levels);
+        return -1;
+    }
+
+    ctx->x86.pv.restore.seen_pv_info = true;
+    return 0;
+}
+
+/*
+ * Process an X86_PV_P2M_FRAMES record. Takes care of expanding the local p2m
+ * state if needed.
+ */
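Before the handler below, a worked example of the frames-per-page arithmetic it relies on may help. For a 64bit PV guest the p2m is an array of 8-byte entries, so `fpp = PAGE_SIZE / width = 512`; a record covering pfns 0 through 0xfffff (a 4GiB guest) therefore spans p2m frames 0 through 0x7ff. Standalone illustration:

#include <stdio.h>

int main(void)
{
    unsigned int fpp = 4096 / 8;                 /* 512 entries per frame */
    unsigned int start_pfn = 0, end_pfn = 0xfffff;
    unsigned int start = start_pfn / fpp;
    unsigned int end = end_pfn / fpp + 1;

    /* Prints: fpp=512 frames=[0,2048) */
    printf("fpp=%u frames=[%u,%u)\n", fpp, start, end);
    return 0;
}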
+static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
+                                    struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
+    unsigned int start, end, x, fpp = PAGE_SIZE / ctx->x86.pv.width;
+    int rc;
+
+    /* v2 compat. Infer the position of STATIC_DATA_END. */
+    if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
+    {
+        rc = handle_static_data_end(ctx);
+        if ( rc )
+        {
+            ERROR("Inferred STATIC_DATA_END record failed");
+            return rc;
+        }
+    }
+
+    if ( !ctx->restore.seen_static_data_end )
+    {
+        ERROR("No STATIC_DATA_END seen");
+        return -1;
+    }
+
+    if ( !ctx->x86.pv.restore.seen_pv_info )
+    {
+        ERROR("Not yet received X86_PV_INFO record");
+        return -1;
+    }
+
+    if ( rec->length < sizeof(*data) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
+              rec->length, sizeof(*data) + sizeof(uint64_t));
+        return -1;
+    }
+
+    if ( data->start_pfn > data->end_pfn )
+    {
+        ERROR("Start pfn in stream (%#x) exceeds end pfn (%#x)",
+              data->start_pfn, data->end_pfn);
+        return -1;
+    }
+
+    start = data->start_pfn / fpp;
+    end = data->end_pfn / fpp + 1;
+
+    if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
+              ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
+              data->start_pfn, data->end_pfn, rec->length,
+              sizeof(*data), end, start, sizeof(uint64_t));
+        return -1;
+    }
+
+    if ( data->end_pfn > ctx->x86.pv.max_pfn )
+    {
+        rc = expand_p2m(ctx, data->end_pfn);
+        if ( rc )
+            return rc;
+    }
+
+    for ( x = 0; x < (end - start); ++x )
+        ctx->x86.pv.p2m_pfns[start + x] = data->p2m_pfns[x];
+
+    return 0;
+}
+
+/*
+ * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
+ * The blobs are all stashed to one side as they need to be deferred until the
+ * very end of the stream, rather than being sent to Xen at the point they
+ * arrive in the stream. It performs all pre-hypercall size validation.
+ */
+static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
+                                   struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+    struct xc_sr_x86_pv_restore_vcpu *vcpu;
+    const char *rec_name;
+    size_t blobsz;
+    struct xc_sr_blob *blob = NULL;
+    int rc = -1;
+
+    switch ( rec->type )
+    {
+    case REC_TYPE_X86_PV_VCPU_BASIC:
+        rec_name = "X86_PV_VCPU_BASIC";
+        break;
+
+    case REC_TYPE_X86_PV_VCPU_EXTENDED:
+        rec_name = "X86_PV_VCPU_EXTENDED";
+        break;
+
+    case REC_TYPE_X86_PV_VCPU_XSAVE:
+        rec_name = "X86_PV_VCPU_XSAVE";
+        break;
+
+    case REC_TYPE_X86_PV_VCPU_MSRS:
+        rec_name = "X86_PV_VCPU_MSRS";
+        break;
+
+    default:
+        ERROR("Unrecognised vcpu blob record %s (%u)",
+              rec_type_to_str(rec->type), rec->type);
+        goto out;
+    }
+
+    /* Confirm that there is a complete header. */
+    if ( rec->length < sizeof(*vhdr) )
+    {
+        ERROR("%s record truncated: length %u, header size %zu",
+              rec_name, rec->length, sizeof(*vhdr));
+        goto out;
+    }
+
+    blobsz = rec->length - sizeof(*vhdr);
+
+    /*
+     * Tolerate empty records. Older sending sides used to accidentally
+     * generate them.
+     */
+    if ( blobsz == 0 )
+    {
+        DBGPRINTF("Skipping empty %s record for vcpu %u\n",
+                  rec_type_to_str(rec->type), vhdr->vcpu_id);
+        rc = 0;
+        goto out;
+    }
+
+    /* Check that the vcpu id is within range.
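update_blob(), called at the end of this handler, is provided by xg_sr_common.h rather than this file; its job is simply to take a private copy of the payload so the record buffer can be freed. From its known behaviour it is presumably little more than the following (a sketch, not a quotation, with the names prefixed accordingly):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct demo_blob { void *ptr; size_t size; };

static int demo_update_blob(struct demo_blob *blob,
                            const void *src, size_t size)
{
    void *ptr;

    if ( !src || !size )
    {
        errno = EINVAL;
        return -1;
    }

    if ( (ptr = malloc(size)) == NULL )
        return -1;

    memcpy(ptr, src, size);
    free(blob->ptr);           /* replace any previous copy */
    blob->ptr = ptr;
    blob->size = size;

    return 0;
}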
*/ + if ( vhdr->vcpu_id >= ctx->x86.pv.restore.nr_vcpus ) + { + ERROR("%s record vcpu_id (%u) exceeds domain max (%u)", + rec_name, vhdr->vcpu_id, ctx->x86.pv.restore.nr_vcpus - 1); + goto out; + } + + vcpu = &ctx->x86.pv.restore.vcpus[vhdr->vcpu_id]; + + /* Further per-record checks, where possible. */ + switch ( rec->type ) + { + case REC_TYPE_X86_PV_VCPU_BASIC: + { + size_t vcpusz = ctx->x86.pv.width == 8 ? + sizeof(vcpu_guest_context_x86_64_t) : + sizeof(vcpu_guest_context_x86_32_t); + + if ( blobsz != vcpusz ) + { + ERROR("%s record wrong size: expected %zu, got %u", + rec_name, sizeof(*vhdr) + vcpusz, rec->length); + goto out; + } + blob = &vcpu->basic; + break; + } + + case REC_TYPE_X86_PV_VCPU_EXTENDED: + if ( blobsz > 128 ) + { + ERROR("%s record too long: max %zu, got %u", + rec_name, sizeof(*vhdr) + 128, rec->length); + goto out; + } + blob = &vcpu->extd; + break; + + case REC_TYPE_X86_PV_VCPU_XSAVE: + if ( blobsz < 16 ) + { + ERROR("%s record too short: min %zu, got %u", + rec_name, sizeof(*vhdr) + 16, rec->length); + goto out; + } + blob = &vcpu->xsave; + break; + + case REC_TYPE_X86_PV_VCPU_MSRS: + if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 ) + { + ERROR("%s record payload size %zu expected to be a multiple of %zu", + rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t)); + goto out; + } + blob = &vcpu->msr; + break; + } + + rc = update_blob(blob, vhdr->context, blobsz); + if ( rc ) + ERROR("Unable to allocate %zu bytes for vcpu%u %s blob", + blobsz, vhdr->vcpu_id, rec_name); + + out: + return rc; +} + +/* + * Process a SHARED_INFO record from the stream. + */ +static int handle_shared_info(struct xc_sr_context *ctx, + struct xc_sr_record *rec) +{ + xc_interface *xch = ctx->xch; + unsigned int i; + int rc = -1; + shared_info_any_t *guest_shinfo = NULL; + const shared_info_any_t *old_shinfo = rec->data; + + if ( !ctx->x86.pv.restore.seen_pv_info ) + { + ERROR("Not yet received X86_PV_INFO record"); + return -1; + } + + if ( rec->length != PAGE_SIZE ) + { + ERROR("X86_PV_SHARED_INFO record wrong size: length %u" + ", expected 4096", rec->length); + goto err; + } + + guest_shinfo = xc_map_foreign_range( + xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, + ctx->dominfo.shared_info_frame); + if ( !guest_shinfo ) + { + PERROR("Failed to map Shared Info at mfn %#lx", + ctx->dominfo.shared_info_frame); + goto err; + } + + MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86.pv.width); + MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86.pv.width); + + SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list, + 0, ctx->x86.pv.width); + + MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86.pv.width); + for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) + SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel, + 0, ctx->x86.pv.width); + + MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86.pv.width); + + rc = 0; + + err: + if ( guest_shinfo ) + munmap(guest_shinfo, PAGE_SIZE); + + return rc; +} + +/* restore_ops function. */ +static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + return pfn <= ctx->x86.pv.max_pfn; +} + +/* restore_ops function. */ +static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn, + unsigned long type) +{ + assert(pfn <= ctx->x86.pv.max_pfn); + + ctx->x86.pv.restore.pfn_types[pfn] = type; +} + +/* restore_ops function. 
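The GET_FIELD/SET_FIELD macros used throughout these handlers come from xg_sr_common_x86_pv.h, which is not part of this hunk. They dispatch on the guest width between the x32 and x64 views of a compat union; something along these lines (the real macros may differ in detail):

/* Width-dispatch sketch; '_p' points to a union with x32/x64 views,
 * '_w' is the guest width in bytes. */
#define DEMO_GET_FIELD(_p, _f, _w) \
    (((_w) == 8) ? (_p)->x64._f : (_p)->x32._f)

#define DEMO_SET_FIELD(_p, _f, _v, _w) do {  \
    if ( (_w) == 8 )                         \
        (_p)->x64._f = (_v);                 \
    else                                     \
        (_p)->x32._f = (_v);                 \
} while ( 0 )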
*/ +static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn, + xen_pfn_t mfn) +{ + assert(pfn <= ctx->x86.pv.max_pfn); + + if ( ctx->x86.pv.width == sizeof(uint64_t) ) + /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */ + ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn; + else + /* 32 bit guest. Can truncate INVALID_MFN for 64 bit toolstacks. */ + ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn; +} + +/* + * restore_ops function. Convert pfns back to mfns in pagetables. Possibly + * needs to populate new frames if a PTE is found referring to a frame which + * hasn't yet been seen from PAGE_DATA records. + */ +static int x86_pv_localise_page(struct xc_sr_context *ctx, + uint32_t type, void *page) +{ + xc_interface *xch = ctx->xch; + uint64_t *table = page; + uint64_t pte; + unsigned int i, to_populate; + xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))]; + + type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + /* Only page tables need localisation. */ + if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB ) + return 0; + + /* Check to see whether we need to populate any new frames. */ + for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) + { + pte = table[i]; + + if ( pte & _PAGE_PRESENT ) + { + xen_pfn_t pfn = pte_to_frame(pte); + +#ifdef __i386__ + if ( pfn == INVALID_MFN ) + { + ERROR("PTE truncation detected. L%u[%u] = %016"PRIx64, + type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); + errno = E2BIG; + return -1; + } +#endif + + if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN ) + pfns[to_populate++] = pfn; + } + } + + if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) ) + return -1; + + for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) + { + pte = table[i]; + + if ( pte & _PAGE_PRESENT ) + { + xen_pfn_t mfn, pfn; + + pfn = pte_to_frame(pte); + mfn = pfn_to_mfn(ctx, pfn); + + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Bad mfn for L%u[%u] - pte %"PRIx64, + type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + return -1; + } + + table[i] = merge_pte(pte, mfn); + } + } + + return 0; +} + +/* + * restore_ops function. Confirm that the incoming stream matches the type of + * domain we are attempting to restore into. + */ +static int x86_pv_setup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + + if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV ) + { + ERROR("Unable to restore %s domain into an x86_pv domain", + dhdr_type_to_str(ctx->restore.guest_type)); + return -1; + } + + if ( ctx->restore.guest_page_size != PAGE_SIZE ) + { + ERROR("Invalid page size %d for x86_pv domains", + ctx->restore.guest_page_size); + return -1; + } + + rc = x86_pv_domain_info(ctx); + if ( rc ) + return rc; + + ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1; + ctx->x86.pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu), + ctx->x86.pv.restore.nr_vcpus); + if ( !ctx->x86.pv.restore.vcpus ) + { + errno = ENOMEM; + return -1; + } + + rc = x86_pv_map_m2p(ctx); + if ( rc ) + return rc; + + return rc; +} + +/* + * restore_ops function. 
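pte_to_frame() and merge_pte(), used in the page localisation below, also live in xg_sr_common_x86_pv.h. Conceptually they are mask-and-shift operations on the 64bit PTE layout, where the frame number occupies bits 12-51; an illustrative reimplementation (names prefixed to make clear these are not the real helpers):

#include <stdint.h>

#define DEMO_PTE_FRAME_MASK 0x000ffffffffff000ULL   /* bits 12-51 */

static inline uint64_t demo_pte_to_frame(uint64_t pte)
{
    return (pte & DEMO_PTE_FRAME_MASK) >> 12;
}

/* Keep the flag bits, swap in a new frame. */
static inline uint64_t demo_merge_pte(uint64_t pte, uint64_t frame)
{
    return (pte & ~DEMO_PTE_FRAME_MASK) | (frame << 12);
}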
+ */
+static int x86_pv_process_record(struct xc_sr_context *ctx,
+                                 struct xc_sr_record *rec)
+{
+    switch ( rec->type )
+    {
+    case REC_TYPE_X86_PV_INFO:
+        return handle_x86_pv_info(ctx, rec);
+
+    case REC_TYPE_X86_PV_P2M_FRAMES:
+        return handle_x86_pv_p2m_frames(ctx, rec);
+
+    case REC_TYPE_X86_PV_VCPU_BASIC:
+    case REC_TYPE_X86_PV_VCPU_EXTENDED:
+    case REC_TYPE_X86_PV_VCPU_XSAVE:
+    case REC_TYPE_X86_PV_VCPU_MSRS:
+        return handle_x86_pv_vcpu_blob(ctx, rec);
+
+    case REC_TYPE_SHARED_INFO:
+        return handle_shared_info(ctx, rec);
+
+    case REC_TYPE_X86_TSC_INFO:
+        return handle_x86_tsc_info(ctx, rec);
+
+    case REC_TYPE_X86_CPUID_POLICY:
+        return handle_x86_cpuid_policy(ctx, rec);
+
+    case REC_TYPE_X86_MSR_POLICY:
+        return handle_x86_msr_policy(ctx, rec);
+
+    default:
+        return RECORD_NOT_PROCESSED;
+    }
+}
+
+/*
+ * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
+ * rewrite the p2m and seed the grant table.
+ */
+static int x86_pv_stream_complete(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc;
+
+    rc = update_vcpu_context(ctx);
+    if ( rc )
+        return rc;
+
+    rc = pin_pagetables(ctx);
+    if ( rc )
+        return rc;
+
+    rc = update_guest_p2m(ctx);
+    if ( rc )
+        return rc;
+
+    rc = xc_dom_gnttab_seed(xch, ctx->domid, false,
+                            ctx->restore.console_gfn,
+                            ctx->restore.xenstore_gfn,
+                            ctx->restore.console_domid,
+                            ctx->restore.xenstore_domid);
+    if ( rc )
+    {
+        PERROR("Failed to seed grant table");
+        return rc;
+    }
+
+    return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_cleanup(struct xc_sr_context *ctx)
+{
+    free(ctx->x86.pv.p2m);
+    free(ctx->x86.pv.p2m_pfns);
+
+    if ( ctx->x86.pv.restore.vcpus )
+    {
+        unsigned int i;
+
+        for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
+        {
+            struct xc_sr_x86_pv_restore_vcpu *vcpu =
+                &ctx->x86.pv.restore.vcpus[i];
+
+            free(vcpu->basic.ptr);
+            free(vcpu->extd.ptr);
+            free(vcpu->xsave.ptr);
+            free(vcpu->msr.ptr);
+        }
+
+        free(ctx->x86.pv.restore.vcpus);
+    }
+
+    free(ctx->x86.pv.restore.pfn_types);
+
+    if ( ctx->x86.pv.m2p )
+        munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
+
+    free(ctx->x86.restore.cpuid.ptr);
+    free(ctx->x86.restore.msr.ptr);
+
+    return 0;
+}
+
+struct xc_sr_restore_ops restore_ops_x86_pv =
+{
+    .pfn_is_valid         = x86_pv_pfn_is_valid,
+    .pfn_to_gfn           = pfn_to_mfn,
+    .set_page_type        = x86_pv_set_page_type,
+    .set_gfn              = x86_pv_set_gfn,
+    .localise_page        = x86_pv_localise_page,
+    .setup                = x86_pv_setup,
+    .process_record       = x86_pv_process_record,
+    .static_data_complete = x86_static_data_complete,
+    .stream_complete      = x86_pv_stream_complete,
+    .cleanup              = x86_pv_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_sr_save.c b/tools/libs/guest/xg_sr_save.c
new file mode 100644
index 0000000000..d74c72cba6
--- /dev/null
+++ b/tools/libs/guest/xg_sr_save.c
@@ -0,0 +1,1059 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "xg_sr_common.h"
+
+/*
+ * Writes an Image header and Domain header into the stream.
+ */ +static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type) +{ + xc_interface *xch = ctx->xch; + int32_t xen_version = xc_version(xch, XENVER_version, NULL); + struct xc_sr_ihdr ihdr = { + .marker = IHDR_MARKER, + .id = htonl(IHDR_ID), + .version = htonl(3), + .options = htons(IHDR_OPT_LITTLE_ENDIAN), + }; + struct xc_sr_dhdr dhdr = { + .type = guest_type, + .page_shift = XC_PAGE_SHIFT, + .xen_major = (xen_version >> 16) & 0xffff, + .xen_minor = (xen_version) & 0xffff, + }; + + if ( xen_version < 0 ) + { + PERROR("Unable to obtain Xen Version"); + return -1; + } + + if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) ) + { + PERROR("Unable to write Image Header to stream"); + return -1; + } + + if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) ) + { + PERROR("Unable to write Domain Header to stream"); + return -1; + } + + return 0; +} + +/* + * Writes an END record into the stream. + */ +static int write_end_record(struct xc_sr_context *ctx) +{ + struct xc_sr_record end = { .type = REC_TYPE_END }; + + return write_record(ctx, &end); +} + +/* + * Writes a STATIC_DATA_END record into the stream. + */ +static int write_static_data_end_record(struct xc_sr_context *ctx) +{ + struct xc_sr_record end = { .type = REC_TYPE_STATIC_DATA_END }; + + return write_record(ctx, &end); +} + +/* + * Writes a CHECKPOINT record into the stream. + */ +static int write_checkpoint_record(struct xc_sr_context *ctx) +{ + struct xc_sr_record checkpoint = { .type = REC_TYPE_CHECKPOINT }; + + return write_record(ctx, &checkpoint); +} + +/* + * Writes a batch of memory as a PAGE_DATA record into the stream. The batch + * is constructed in ctx->save.batch_pfns. + * + * This function: + * - gets the types for each pfn in the batch. + * - for each pfn with real data: + * - maps and attempts to localise the pages. + * - construct and writes a PAGE_DATA record into the stream. + */ +static int write_batch(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t *mfns = NULL, *types = NULL; + void *guest_mapping = NULL; + void **guest_data = NULL; + void **local_pages = NULL; + int *errors = NULL, rc = -1; + unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0; + unsigned int nr_pfns = ctx->save.nr_batch_pfns; + void *page, *orig_page; + uint64_t *rec_pfns = NULL; + struct iovec *iov = NULL; int iovcnt = 0; + struct xc_sr_rec_page_data_header hdr = { 0 }; + struct xc_sr_record rec = { + .type = REC_TYPE_PAGE_DATA, + }; + + assert(nr_pfns != 0); + + /* Mfns of the batch pfns. */ + mfns = malloc(nr_pfns * sizeof(*mfns)); + /* Types of the batch pfns. */ + types = malloc(nr_pfns * sizeof(*types)); + /* Errors from attempting to map the gfns. */ + errors = malloc(nr_pfns * sizeof(*errors)); + /* Pointers to page data to send. Mapped gfns or local allocations. */ + guest_data = calloc(nr_pfns, sizeof(*guest_data)); + /* Pointers to locally allocated pages. Need freeing. */ + local_pages = calloc(nr_pfns, sizeof(*local_pages)); + /* iovec[] for writev(). */ + iov = malloc((nr_pfns + 4) * sizeof(*iov)); + + if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov ) + { + ERROR("Unable to allocate arrays for a batch of %u pages", + nr_pfns); + goto err; + } + + for ( i = 0; i < nr_pfns; ++i ) + { + types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx, + ctx->save.batch_pfns[i]); + + /* Likely a ballooned page. 
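For orientation, the PAGE_DATA record assembled by this function carries, after the generic record header, a small count header, one 64bit (type << 32 | pfn) word per pfn, and then the raw 4KiB contents of every page that actually has data. A quick length check, assuming the 8-byte (count + reserved) page-data header used here:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t nr_pfns = 1024, nr_pages = 1000;  /* 24 pfns without backing data */
    size_t length = 8                          /* count + reserved header      */
                  + nr_pfns * sizeof(uint64_t) /* (type << 32 | pfn) words     */
                  + nr_pages * 4096;           /* raw page contents            */

    printf("%zu\n", length);                   /* 4104200 */
    return 0;
}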
*/ + if ( mfns[i] == INVALID_MFN ) + { + set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); + ++ctx->save.nr_deferred_pages; + } + } + + rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types); + if ( rc ) + { + PERROR("Failed to get types for pfn batch"); + goto err; + } + rc = -1; + + for ( i = 0; i < nr_pfns; ++i ) + { + switch ( types[i] ) + { + case XEN_DOMCTL_PFINFO_BROKEN: + case XEN_DOMCTL_PFINFO_XALLOC: + case XEN_DOMCTL_PFINFO_XTAB: + continue; + } + + mfns[nr_pages++] = mfns[i]; + } + + if ( nr_pages > 0 ) + { + guest_mapping = xenforeignmemory_map( + xch->fmem, ctx->domid, PROT_READ, nr_pages, mfns, errors); + if ( !guest_mapping ) + { + PERROR("Failed to map guest pages"); + goto err; + } + nr_pages_mapped = nr_pages; + + for ( i = 0, p = 0; i < nr_pfns; ++i ) + { + switch ( types[i] ) + { + case XEN_DOMCTL_PFINFO_BROKEN: + case XEN_DOMCTL_PFINFO_XALLOC: + case XEN_DOMCTL_PFINFO_XTAB: + continue; + } + + if ( errors[p] ) + { + ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d", + ctx->save.batch_pfns[i], mfns[p], errors[p]); + goto err; + } + + orig_page = page = guest_mapping + (p * PAGE_SIZE); + rc = ctx->save.ops.normalise_page(ctx, types[i], &page); + + if ( orig_page != page ) + local_pages[i] = page; + + if ( rc ) + { + if ( rc == -1 && errno == EAGAIN ) + { + set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); + ++ctx->save.nr_deferred_pages; + types[i] = XEN_DOMCTL_PFINFO_XTAB; + --nr_pages; + } + else + goto err; + } + else + guest_data[i] = page; + + rc = -1; + ++p; + } + } + + rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns)); + if ( !rec_pfns ) + { + ERROR("Unable to allocate %zu bytes of memory for page data pfn list", + nr_pfns * sizeof(*rec_pfns)); + goto err; + } + + hdr.count = nr_pfns; + + rec.length = sizeof(hdr); + rec.length += nr_pfns * sizeof(*rec_pfns); + rec.length += nr_pages * PAGE_SIZE; + + for ( i = 0; i < nr_pfns; ++i ) + rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i]; + + iov[0].iov_base = &rec.type; + iov[0].iov_len = sizeof(rec.type); + + iov[1].iov_base = &rec.length; + iov[1].iov_len = sizeof(rec.length); + + iov[2].iov_base = &hdr; + iov[2].iov_len = sizeof(hdr); + + iov[3].iov_base = rec_pfns; + iov[3].iov_len = nr_pfns * sizeof(*rec_pfns); + + iovcnt = 4; + + if ( nr_pages ) + { + for ( i = 0; i < nr_pfns; ++i ) + { + if ( guest_data[i] ) + { + iov[iovcnt].iov_base = guest_data[i]; + iov[iovcnt].iov_len = PAGE_SIZE; + iovcnt++; + --nr_pages; + } + } + } + + if ( writev_exact(ctx->fd, iov, iovcnt) ) + { + PERROR("Failed to write page data to stream"); + goto err; + } + + /* Sanity check we have sent all the pages we expected to. */ + assert(nr_pages == 0); + rc = ctx->save.nr_batch_pfns = 0; + + err: + free(rec_pfns); + if ( guest_mapping ) + xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped); + for ( i = 0; local_pages && i < nr_pfns; ++i ) + free(local_pages[i]); + free(iov); + free(local_pages); + free(guest_data); + free(errors); + free(types); + free(mfns); + + return rc; +} + +/* + * Flush a batch of pfns into the stream. + */ +static int flush_batch(struct xc_sr_context *ctx) +{ + int rc = 0; + + if ( ctx->save.nr_batch_pfns == 0 ) + return rc; + + rc = write_batch(ctx); + + if ( !rc ) + { + VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns, + MAX_BATCH_SIZE * + sizeof(*ctx->save.batch_pfns)); + } + + return rc; +} + +/* + * Add a single pfn to the batch, flushing the batch if full. 
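The add_to_batch()/flush_batch() pair below implements a simple accumulate-and-flush scheme. Reduced to its core (DEMO_BATCH_SIZE stands in for MAX_BATCH_SIZE, whose actual value lives in xg_sr_common.h):

#include <stddef.h>

#define DEMO_BATCH_SIZE 1024    /* stand-in for MAX_BATCH_SIZE */

struct demo_batch
{
    unsigned long pfns[DEMO_BATCH_SIZE];
    size_t nr;
};

/* 'flush' stands in for write_batch() and must reset b->nr on success. */
static int demo_add(struct demo_batch *b, unsigned long pfn,
                    int (*flush)(struct demo_batch *b))
{
    if ( b->nr == DEMO_BATCH_SIZE && flush(b) )
        return -1;

    b->pfns[b->nr++] = pfn;
    return 0;
}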
+ */ +static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + int rc = 0; + + if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE ) + rc = flush_batch(ctx); + + if ( rc == 0 ) + ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn; + + return rc; +} + +/* + * Pause/suspend the domain, and refresh ctx->dominfo if required. + */ +static int suspend_domain(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + + /* TODO: Properly specify the return value from this callback. All + * implementations currently appear to return 1 for success, whereas + * the legacy code checks for != 0. */ + int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data); + + if ( cb_rc == 0 ) + { + ERROR("save callback suspend() failed: %d", cb_rc); + return -1; + } + + /* Refresh domain information. */ + if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) || + (ctx->dominfo.domid != ctx->domid) ) + { + PERROR("Unable to refresh domain information"); + return -1; + } + + /* Confirm the domain has actually been paused. */ + if ( !ctx->dominfo.shutdown || + (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) ) + { + ERROR("Domain has not been suspended: shutdown %d, reason %d", + ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason); + return -1; + } + + xc_report_progress_single(xch, "Domain now suspended"); + + return 0; +} + +/* + * Send a subset of pages in the guests p2m, according to the dirty bitmap. + * Used for each subsequent iteration of the live migration loop. + * + * Bitmap is bounded by p2m_size. + */ +static int send_dirty_pages(struct xc_sr_context *ctx, + unsigned long entries) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t p; + unsigned long written; + int rc; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p ) + { + if ( !test_bit(p, dirty_bitmap) ) + continue; + + rc = add_to_batch(ctx, p); + if ( rc ) + return rc; + + /* Update progress every 4MB worth of memory sent. */ + if ( (written & ((1U << (22 - 12)) - 1)) == 0 ) + xc_report_progress_step(xch, written, entries); + + ++written; + } + + rc = flush_batch(ctx); + if ( rc ) + return rc; + + if ( written > entries ) + DPRINTF("Bitmap contained more entries than expected..."); + + xc_report_progress_step(xch, entries, entries); + + return ctx->save.ops.check_vm_state(ctx); +} + +/* + * Send all pages in the guests p2m. Used as the first iteration of the live + * migration loop, and for a non-live save. + */ +static int send_all_pages(struct xc_sr_context *ctx) +{ + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + bitmap_set(dirty_bitmap, ctx->save.p2m_size); + + return send_dirty_pages(ctx, ctx->save.p2m_size); +} + +static int enable_logdirty(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int on1 = 0, off = 0, on2 = 0; + int rc; + + /* This juggling is required if logdirty is enabled for VRAM tracking. 
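A note on the progress arithmetic in send_dirty_pages() above: `1U << (22 - 12)` is 4MiB expressed in 4KiB pages, so progress is reported once every 1024 pages written. Quick check:

#include <assert.h>

int main(void)
{
    unsigned int mask = (1U << (22 - 12)) - 1;   /* 0x3ff: 4MiB in pages */

    assert(mask == 1023);
    assert((1024u & mask) == 0);   /* report at each 4MiB boundary */
    assert((1025u & mask) != 0);   /* stay quiet in between        */
    return 0;
}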
*/ + rc = xc_shadow_control(xch, ctx->domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL); + if ( rc < 0 ) + { + on1 = errno; + rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL); + if ( rc < 0 ) + off = errno; + else { + rc = xc_shadow_control(xch, ctx->domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL); + if ( rc < 0 ) + on2 = errno; + } + if ( rc < 0 ) + { + PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2); + return rc; + } + } + + return 0; +} + +static int update_progress_string(struct xc_sr_context *ctx, char **str) +{ + xc_interface *xch = ctx->xch; + char *new_str = NULL; + unsigned int iter = ctx->save.stats.iteration; + + if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 ) + { + PERROR("Unable to allocate new progress string"); + return -1; + } + + free(*str); + *str = new_str; + + xc_set_progress_prefix(xch, *str); + return 0; +} + +/* + * This is the live migration precopy policy - it's called periodically during + * the precopy phase of live migrations, and is responsible for deciding when + * the precopy phase should terminate and what should be done next. + * + * The policy implemented here behaves identically to the policy previously + * hard-coded into xc_domain_save() - it proceeds to the stop-and-copy phase of + * the live migration when there are either fewer than 50 dirty pages, or more + * than 5 precopy rounds have completed. + */ +#define SPP_MAX_ITERATIONS 5 +#define SPP_TARGET_DIRTY_COUNT 50 + +static int simple_precopy_policy(struct precopy_stats stats, void *user) +{ + return ((stats.dirty_count >= 0 && + stats.dirty_count < SPP_TARGET_DIRTY_COUNT) || + stats.iteration >= SPP_MAX_ITERATIONS) + ? XGS_POLICY_STOP_AND_COPY + : XGS_POLICY_CONTINUE_PRECOPY; +} + +/* + * Send memory while guest is running. 
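simple_precopy_policy() above is only the default; a toolstack can supply its own decision function through save_callbacks->precopy_policy. A hypothetical alternative for illustration (my_precopy_policy, the user-data layout, and the 1%/10-round thresholds are all invented here; the types and constants come from xenguest.h):

#include <xenguest.h>

static int my_precopy_policy(struct precopy_stats stats, void *user)
{
    unsigned long p2m_size = *(unsigned long *)user;  /* assumed user data */

    if ( stats.iteration >= 10 )                      /* hard round cap   */
        return XGS_POLICY_STOP_AND_COPY;

    if ( stats.dirty_count >= 0 &&
         (unsigned long)stats.dirty_count < p2m_size / 100 )
        return XGS_POLICY_STOP_AND_COPY;              /* < 1% still dirty */

    return XGS_POLICY_CONTINUE_PRECOPY;
}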
+ */ +static int send_memory_live(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; + char *progress_str = NULL; + unsigned int x = 0; + int rc; + int policy_decision; + + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + precopy_policy_t precopy_policy = ctx->save.callbacks->precopy_policy; + void *data = ctx->save.callbacks->data; + + struct precopy_stats *policy_stats; + + rc = update_progress_string(ctx, &progress_str); + if ( rc ) + goto out; + + ctx->save.stats = (struct precopy_stats){ + .dirty_count = ctx->save.p2m_size, + }; + policy_stats = &ctx->save.stats; + + if ( precopy_policy == NULL ) + precopy_policy = simple_precopy_policy; + + bitmap_set(dirty_bitmap, ctx->save.p2m_size); + + for ( ; ; ) + { + policy_decision = precopy_policy(*policy_stats, data); + x++; + + if ( stats.dirty_count > 0 && policy_decision != XGS_POLICY_ABORT ) + { + rc = update_progress_string(ctx, &progress_str); + if ( rc ) + goto out; + + rc = send_dirty_pages(ctx, stats.dirty_count); + if ( rc ) + goto out; + } + + if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY ) + break; + + policy_stats->iteration = x; + policy_stats->total_written += policy_stats->dirty_count; + policy_stats->dirty_count = -1; + + policy_decision = precopy_policy(*policy_stats, data); + + if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY ) + break; + + if ( xc_shadow_control( + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, + &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size, + NULL, 0, &stats) != ctx->save.p2m_size ) + { + PERROR("Failed to retrieve logdirty bitmap"); + rc = -1; + goto out; + } + + policy_stats->dirty_count = stats.dirty_count; + + } + + if ( policy_decision == XGS_POLICY_ABORT ) + { + PERROR("Abort precopy loop"); + rc = -1; + goto out; + } + + out: + xc_set_progress_prefix(xch, NULL); + free(progress_str); + return rc; +} + +static int colo_merge_secondary_dirty_bitmap(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_record rec; + uint64_t *pfns = NULL; + uint64_t pfn; + unsigned int count, i; + int rc; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + rc = read_record(ctx, ctx->save.recv_fd, &rec); + if ( rc ) + goto err; + + if ( rec.type != REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST ) + { + PERROR("Expect dirty bitmap record, but received %u", rec.type); + rc = -1; + goto err; + } + + if ( rec.length % sizeof(*pfns) ) + { + PERROR("Invalid dirty pfn list record length %u", rec.length); + rc = -1; + goto err; + } + + count = rec.length / sizeof(*pfns); + pfns = rec.data; + + for ( i = 0; i < count; i++ ) + { + pfn = pfns[i]; + if ( pfn > ctx->save.p2m_size ) + { + PERROR("Invalid pfn 0x%" PRIx64, pfn); + rc = -1; + goto err; + } + + set_bit(pfn, dirty_bitmap); + } + + rc = 0; + + err: + free(rec.data); + return rc; +} + +/* + * Suspend the domain and send dirty memory. + * This is the last iteration of the live migration and the + * heart of the checkpointed stream. 
+ */ +static int suspend_and_send_dirty(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; + char *progress_str = NULL; + int rc; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + rc = suspend_domain(ctx); + if ( rc ) + goto out; + + if ( xc_shadow_control( + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, + HYPERCALL_BUFFER(dirty_bitmap), ctx->save.p2m_size, + NULL, XEN_DOMCTL_SHADOW_LOGDIRTY_FINAL, &stats) != + ctx->save.p2m_size ) + { + PERROR("Failed to retrieve logdirty bitmap"); + rc = -1; + goto out; + } + + if ( ctx->save.live ) + { + rc = update_progress_string(ctx, &progress_str); + if ( rc ) + goto out; + } + else + xc_set_progress_prefix(xch, "Checkpointed save"); + + bitmap_or(dirty_bitmap, ctx->save.deferred_pages, ctx->save.p2m_size); + + if ( !ctx->save.live && ctx->stream_type == XC_STREAM_COLO ) + { + rc = colo_merge_secondary_dirty_bitmap(ctx); + if ( rc ) + { + PERROR("Failed to get secondary vm's dirty pages"); + goto out; + } + } + + rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages); + if ( rc ) + goto out; + + bitmap_clear(ctx->save.deferred_pages, ctx->save.p2m_size); + ctx->save.nr_deferred_pages = 0; + + out: + xc_set_progress_prefix(xch, NULL); + free(progress_str); + return rc; +} + +static int verify_frames(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; + int rc; + struct xc_sr_record rec = { .type = REC_TYPE_VERIFY }; + + DPRINTF("Enabling verify mode"); + + rc = write_record(ctx, &rec); + if ( rc ) + goto out; + + xc_set_progress_prefix(xch, "Frames verify"); + rc = send_all_pages(ctx); + if ( rc ) + goto out; + + if ( xc_shadow_control( + xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK, + &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size, + NULL, 0, &stats) != ctx->save.p2m_size ) + { + PERROR("Failed to retrieve logdirty bitmap"); + rc = -1; + goto out; + } + + DPRINTF(" Further stats: faults %u, dirty %u", + stats.fault_count, stats.dirty_count); + + out: + return rc; +} + +/* + * Send all domain memory. This is the heart of the live migration loop. + */ +static int send_domain_memory_live(struct xc_sr_context *ctx) +{ + int rc; + + rc = enable_logdirty(ctx); + if ( rc ) + goto out; + + rc = send_memory_live(ctx); + if ( rc ) + goto out; + + rc = suspend_and_send_dirty(ctx); + if ( rc ) + goto out; + + if ( ctx->save.debug && ctx->stream_type != XC_STREAM_PLAIN ) + { + rc = verify_frames(ctx); + if ( rc ) + goto out; + } + + out: + return rc; +} + +/* + * Checkpointed save. + */ +static int send_domain_memory_checkpointed(struct xc_sr_context *ctx) +{ + return suspend_and_send_dirty(ctx); +} + +/* + * Send all domain memory, pausing the domain first. Generally used for + * suspend-to-file. 
+ */ +static int send_domain_memory_nonlive(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + + rc = suspend_domain(ctx); + if ( rc ) + goto err; + + xc_set_progress_prefix(xch, "Frames"); + + rc = send_all_pages(ctx); + if ( rc ) + goto err; + + err: + return rc; +} + +static int setup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + rc = ctx->save.ops.setup(ctx); + if ( rc ) + goto err; + + dirty_bitmap = xc_hypercall_buffer_alloc_pages( + xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size))); + ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE * + sizeof(*ctx->save.batch_pfns)); + ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size)); + + if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages ) + { + ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and" + " deferred pages"); + rc = -1; + errno = ENOMEM; + goto err; + } + + rc = 0; + + err: + return rc; +} + +static void cleanup(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, + &ctx->save.dirty_bitmap_hbuf); + + + xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL); + + if ( ctx->save.ops.cleanup(ctx) ) + PERROR("Failed to clean up"); + + xc_hypercall_buffer_free_pages(xch, dirty_bitmap, + NRPAGES(bitmap_size(ctx->save.p2m_size))); + free(ctx->save.deferred_pages); + free(ctx->save.batch_pfns); +} + +/* + * Save a domain. + */ +static int save(struct xc_sr_context *ctx, uint16_t guest_type) +{ + xc_interface *xch = ctx->xch; + int rc, saved_rc = 0, saved_errno = 0; + + IPRINTF("Saving domain %d, type %s", + ctx->domid, dhdr_type_to_str(guest_type)); + + rc = setup(ctx); + if ( rc ) + goto err; + + xc_report_progress_single(xch, "Start of stream"); + + rc = write_headers(ctx, guest_type); + if ( rc ) + goto err; + + rc = ctx->save.ops.static_data(ctx); + if ( rc ) + goto err; + + rc = write_static_data_end_record(ctx); + if ( rc ) + goto err; + + rc = ctx->save.ops.start_of_stream(ctx); + if ( rc ) + goto err; + + do { + rc = ctx->save.ops.start_of_checkpoint(ctx); + if ( rc ) + goto err; + + rc = ctx->save.ops.check_vm_state(ctx); + if ( rc ) + goto err; + + if ( ctx->save.live ) + rc = send_domain_memory_live(ctx); + else if ( ctx->stream_type != XC_STREAM_PLAIN ) + rc = send_domain_memory_checkpointed(ctx); + else + rc = send_domain_memory_nonlive(ctx); + + if ( rc ) + goto err; + + if ( !ctx->dominfo.shutdown || + (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) ) + { + ERROR("Domain has not been suspended"); + rc = -1; + goto err; + } + + rc = ctx->save.ops.end_of_checkpoint(ctx); + if ( rc ) + goto err; + + if ( ctx->stream_type != XC_STREAM_PLAIN ) + { + /* + * We have now completed the initial live portion of the checkpoint + * process. Therefore switch into periodically sending synchronous + * batches of pages. 
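Stepping back from the checkpoint branch for a moment: across one pass of this loop, a plain (non-checkpointed) save emits a stream of roughly the following overall shape, with record names as used elsewhere in this series:

/*
 * Image header, Domain header
 * static data records (e.g. X86_CPUID_POLICY, X86_MSR_POLICY)
 * STATIC_DATA_END
 * start-of-stream records (e.g. X86_PV_INFO, X86_PV_P2M_FRAMES)
 * one PAGE_DATA record per batch of up to MAX_BATCH_SIZE pfns
 * end-of-checkpoint records (X86_TSC_INFO, vcpu state, SHARED_INFO, ...)
 * END
 */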
+             */
+            ctx->save.live = false;
+
+            rc = write_checkpoint_record(ctx);
+            if ( rc )
+                goto err;
+
+            if ( ctx->stream_type == XC_STREAM_COLO )
+            {
+                rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
+                if ( !rc )
+                {
+                    rc = -1;
+                    goto err;
+                }
+            }
+
+            rc = ctx->save.callbacks->postcopy(ctx->save.callbacks->data);
+            if ( rc <= 0 )
+                goto err;
+
+            if ( ctx->stream_type == XC_STREAM_COLO )
+            {
+                rc = ctx->save.callbacks->wait_checkpoint(
+                    ctx->save.callbacks->data);
+                if ( rc <= 0 )
+                    goto err;
+            }
+            else if ( ctx->stream_type == XC_STREAM_REMUS )
+            {
+                rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
+                if ( rc <= 0 )
+                    goto err;
+            }
+            else
+            {
+                ERROR("Unknown checkpointed stream");
+                rc = -1;
+                goto err;
+            }
+        }
+    } while ( ctx->stream_type != XC_STREAM_PLAIN );
+
+    xc_report_progress_single(xch, "End of stream");
+
+    rc = write_end_record(ctx);
+    if ( rc )
+        goto err;
+
+    xc_report_progress_single(xch, "Complete");
+    goto done;
+
+ err:
+    saved_errno = errno;
+    saved_rc = rc;
+    PERROR("Save failed");
+
+ done:
+    cleanup(ctx);
+
+    if ( saved_rc )
+    {
+        rc = saved_rc;
+        errno = saved_errno;
+    }
+
+    return rc;
+}
+
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+                   uint32_t flags, struct save_callbacks *callbacks,
+                   xc_stream_type_t stream_type, int recv_fd)
+{
+    struct xc_sr_context ctx = {
+        .xch = xch,
+        .fd = io_fd,
+        .stream_type = stream_type,
+    };
+
+    /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
+    ctx.save.callbacks = callbacks;
+    ctx.save.live = !!(flags & XCFLAGS_LIVE);
+    ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
+    ctx.save.recv_fd = recv_fd;
+
+    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+    {
+        PERROR("Failed to get domain info");
+        return -1;
+    }
+
+    if ( ctx.dominfo.domid != dom )
+    {
+        ERROR("Domain %u does not exist", dom);
+        return -1;
+    }
+
+    /* Sanity check stream_type-related parameters */
+    switch ( stream_type )
+    {
+    case XC_STREAM_COLO:
+        assert(callbacks->wait_checkpoint);
+        /* Fallthrough */
+    case XC_STREAM_REMUS:
+        assert(callbacks->checkpoint && callbacks->postcopy);
+        /* Fallthrough */
+    case XC_STREAM_PLAIN:
+        if ( ctx.dominfo.hvm )
+            assert(callbacks->switch_qemu_logdirty);
+        break;
+
+    default:
+        assert(!"Bad stream_type");
+        break;
+    }
+
+    DPRINTF("fd %d, dom %u, flags %u, hvm %d",
+            io_fd, dom, flags, ctx.dominfo.hvm);
+
+    ctx.domid = dom;
+
+    if ( ctx.dominfo.hvm )
+    {
+        ctx.save.ops = save_ops_x86_hvm;
+        return save(&ctx, DHDR_TYPE_X86_HVM);
+    }
+    else
+    {
+        ctx.save.ops = save_ops_x86_pv;
+        return save(&ctx, DHDR_TYPE_X86_PV);
+    }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_sr_save_x86_hvm.c b/tools/libs/guest/xg_sr_save_x86_hvm.c
new file mode 100644
index 0000000000..1634a7bc43
--- /dev/null
+++ b/tools/libs/guest/xg_sr_save_x86_hvm.c
@@ -0,0 +1,251 @@
+#include <assert.h>
+
+#include "xg_sr_common_x86.h"
+
+#include <xen/hvm/params.h>
+
+/*
+ * Query for the HVM context and write an HVM_CONTEXT record into the stream.
+ */
+static int write_hvm_context(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc, hvm_buf_size;
+    struct xc_sr_record hvm_rec = {
+        .type = REC_TYPE_HVM_CONTEXT,
+    };
+
+    hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, 0, 0);
+    if ( hvm_buf_size < 0 )
+    {
+        PERROR("Couldn't get HVM context size from Xen");
+        rc = -1;
+        goto out;
+    }
+
+    hvm_rec.data = malloc(hvm_buf_size);
+    if ( !hvm_rec.data )
+    {
+        PERROR("Couldn't allocate memory");
+        rc = -1;
+        goto out;
+    }
+
+    hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid,
+                                            hvm_rec.data, hvm_buf_size);
+    if ( hvm_buf_size < 0 )
+    {
+        PERROR("Couldn't get HVM context from Xen");
+        rc = -1;
+        goto out;
+    }
+
+    hvm_rec.length = hvm_buf_size;
+    rc = write_record(ctx, &hvm_rec);
+    if ( rc < 0 )
+    {
+        PERROR("Error writing HVM_CONTEXT record");
+        goto out;
+    }
+
+ out:
+    free(hvm_rec.data);
+    return rc;
+}
+
+/*
+ * Query for a range of HVM parameters and write an HVM_PARAMS record into the
+ * stream.
+ */
+static int write_hvm_params(struct xc_sr_context *ctx)
+{
+    static const unsigned int params[] = {
+        HVM_PARAM_STORE_PFN,
+        HVM_PARAM_IOREQ_PFN,
+        HVM_PARAM_BUFIOREQ_PFN,
+        HVM_PARAM_PAGING_RING_PFN,
+        HVM_PARAM_MONITOR_RING_PFN,
+        HVM_PARAM_SHARING_RING_PFN,
+        HVM_PARAM_VM86_TSS_SIZED,
+        HVM_PARAM_CONSOLE_PFN,
+        HVM_PARAM_ACPI_IOPORTS_LOCATION,
+        HVM_PARAM_VIRIDIAN,
+        HVM_PARAM_IDENT_PT,
+        HVM_PARAM_VM_GENERATION_ID_ADDR,
+        HVM_PARAM_IOREQ_SERVER_PFN,
+        HVM_PARAM_NR_IOREQ_SERVER_PAGES,
+        HVM_PARAM_X87_FIP_WIDTH,
+        HVM_PARAM_MCA_CAP,
+    };
+
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_hvm_params_entry entries[ARRAY_SIZE(params)];
+    struct xc_sr_rec_hvm_params hdr = {
+        .count = 0,
+    };
+    struct xc_sr_record rec = {
+        .type = REC_TYPE_HVM_PARAMS,
+        .length = sizeof(hdr),
+        .data = &hdr,
+    };
+    unsigned int i;
+    int rc;
+
+    for ( i = 0; i < ARRAY_SIZE(params); i++ )
+    {
+        uint32_t index = params[i];
+        uint64_t value;
+
+        rc = xc_hvm_param_get(xch, ctx->domid, index, &value);
+        if ( rc )
+        {
+            PERROR("Failed to get HVMPARAM at index %u", index);
+            return rc;
+        }
+
+        if ( value != 0 )
+        {
+            entries[hdr.count].index = index;
+            entries[hdr.count].value = value;
+            hdr.count++;
+        }
+    }
+
+    /* No params? Skip this record. */
+    if ( hdr.count == 0 )
+        return 0;
+
+    rc = write_split_record(ctx, &rec, entries, hdr.count * sizeof(*entries));
+    if ( rc )
+        PERROR("Failed to write HVM_PARAMS record");
+
+    return rc;
+}
+
+static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
+                                    xen_pfn_t pfn)
+{
+    /* identity map */
+    return pfn;
+}
+
+static int x86_hvm_normalise_page(struct xc_sr_context *ctx,
+                                  xen_pfn_t type, void **page)
+{
+    return 0;
+}
+
+static int x86_hvm_setup(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t nr_pfns;
+
+    if ( xc_domain_nr_gpfns(xch, ctx->domid, &nr_pfns) < 0 )
+    {
+        PERROR("Unable to obtain the guest p2m size");
+        return -1;
+    }
+#ifdef __i386__
+    /* Very large domains (> 1TB) will exhaust virtual address space. */
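The magic number in the check that follows is easier to read as an address-space bound: 0x0fffffff + 1 pfns of 4KiB each is exactly 1TiB, far more than a 32bit process can map. Quick check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t pfns = 0x0fffffffULL + 1;   /* the bound tested below */
    uint64_t bytes = pfns << 12;         /* 4KiB pages             */

    assert(bytes == (1ULL << 40));       /* exactly 1TiB */
    return 0;
}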
+    if ( nr_pfns > 0x0fffffff )
+    {
+        errno = E2BIG;
+        PERROR("Cannot save this big a guest");
+        return -1;
+    }
+#endif
+
+    ctx->save.p2m_size = nr_pfns;
+
+    if ( ctx->save.callbacks->switch_qemu_logdirty(
+             ctx->domid, 1, ctx->save.callbacks->data) )
+    {
+        PERROR("Couldn't enable qemu log-dirty mode");
+        return -1;
+    }
+
+    ctx->x86.hvm.save.qemu_enabled_logdirty = true;
+
+    return 0;
+}
+
+static int x86_hvm_static_data(struct xc_sr_context *ctx)
+{
+    return write_x86_cpu_policy_records(ctx);
+}
+
+static int x86_hvm_start_of_stream(struct xc_sr_context *ctx)
+{
+    return 0;
+}
+
+static int x86_hvm_start_of_checkpoint(struct xc_sr_context *ctx)
+{
+    return 0;
+}
+
+static int x86_hvm_check_vm_state(struct xc_sr_context *ctx)
+{
+    return 0;
+}
+
+static int x86_hvm_end_of_checkpoint(struct xc_sr_context *ctx)
+{
+    int rc;
+
+    /* Write the TSC record. */
+    rc = write_x86_tsc_info(ctx);
+    if ( rc )
+        return rc;
+
+    /* Write the HVM_CONTEXT record. */
+    rc = write_hvm_context(ctx);
+    if ( rc )
+        return rc;
+
+    /* Write an HVM_PARAMS record containing the applicable HVM params. */
+    rc = write_hvm_params(ctx);
+    if ( rc )
+        return rc;
+
+    return 0;
+}
+
+static int x86_hvm_cleanup(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+
+    /* If qemu successfully enabled logdirty mode, attempt to disable. */
+    if ( ctx->x86.hvm.save.qemu_enabled_logdirty &&
+         ctx->save.callbacks->switch_qemu_logdirty(
+             ctx->domid, 0, ctx->save.callbacks->data) )
+    {
+        PERROR("Couldn't disable qemu log-dirty mode");
+        return -1;
+    }
+
+    return 0;
+}
+
+struct xc_sr_save_ops save_ops_x86_hvm =
+{
+    .pfn_to_gfn          = x86_hvm_pfn_to_gfn,
+    .normalise_page      = x86_hvm_normalise_page,
+    .setup               = x86_hvm_setup,
+    .static_data         = x86_hvm_static_data,
+    .start_of_stream     = x86_hvm_start_of_stream,
+    .start_of_checkpoint = x86_hvm_start_of_checkpoint,
+    .end_of_checkpoint   = x86_hvm_end_of_checkpoint,
+    .check_vm_state      = x86_hvm_check_vm_state,
+    .cleanup             = x86_hvm_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libs/guest/xg_sr_save_x86_pv.c b/tools/libs/guest/xg_sr_save_x86_pv.c
new file mode 100644
index 0000000000..4964f1f7b8
--- /dev/null
+++ b/tools/libs/guest/xg_sr_save_x86_pv.c
@@ -0,0 +1,1156 @@
+#include <assert.h>
+#include <limits.h>
+
+#include "xg_sr_common_x86_pv.h"
+
+/* Check a 64 bit virtual address for being canonical. */
+static inline bool is_canonical_address(xen_vaddr_t vaddr)
+{
+    return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
+}
+
+/*
+ * Maps the guests shared info page.
+ */
+static int map_shinfo(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+
+    ctx->x86.pv.shinfo = xc_map_foreign_range(
+        xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
+    if ( !ctx->x86.pv.shinfo )
+    {
+        PERROR("Failed to map shared info frame at mfn %#lx",
+               ctx->dominfo.shared_info_frame);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Copy a list of mfns from a guest, accounting for differences between guest
+ * and toolstack width. Can fail if truncation would occur.
+ */
+static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
+                                xen_pfn_t *dst, const void *src, size_t count)
+{
+    size_t x;
+
+    if ( ctx->x86.pv.width == sizeof(unsigned long) )
+        memcpy(dst, src, count * sizeof(*dst));
+    else
+    {
+        for ( x = 0; x < count; ++x )
+        {
+#ifdef __x86_64__
+            /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
+            uint32_t s = ((uint32_t *)src)[x];
+
+            dst[x] = s == ~0U ? INVALID_MFN : s;
+#else
+            /*
+             * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
+             * if any other truncation would occur.
+             *
+             * This will only occur on hosts where a PV guest has ram above
+             * the 16TB boundary. A 32bit dom0 is unlikely to have
+             * successfully booted on a system this large.
+             */
+            uint64_t s = ((uint64_t *)src)[x];
+
+            if ( (s != ~0ULL) && ((s >> 32) != 0) )
+            {
+                errno = E2BIG;
+                return -1;
+            }
+
+            dst[x] = s;
+#endif
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Map the p2m leaf pages and build an array of their pfns.
+ */
+static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
+                          size_t n_mfns)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int x;
+
+    ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
+                                           mfns, n_mfns);
+    if ( !ctx->x86.pv.p2m )
+    {
+        PERROR("Failed to map p2m frames");
+        return -1;
+    }
+
+    ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
+    ctx->x86.pv.p2m_frames = n_mfns;
+    ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
+    if ( !ctx->x86.pv.p2m_pfns )
+    {
+        ERROR("Cannot allocate %zu bytes for p2m pfns list",
+              n_mfns * sizeof(*mfns));
+        return -1;
+    }
+
+    /* Convert leaf frames from mfns to pfns. */
+    for ( x = 0; x < n_mfns; ++x )
+    {
+        if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
+        {
+            ERROR("Bad mfn in p2m_frame_list[%u]", x);
+            dump_bad_pseudophysmap_entry(ctx, mfns[x]);
+            errno = ERANGE;
+            return -1;
+        }
+
+        ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
+    }
+
+    return 0;
+}
+
+/*
+ * Walk the guests frame list list and frame list to identify and map the
+ * frames making up the guests p2m table. Construct a list of pfns making up
+ * the table.
+ */
+static int map_p2m_tree(struct xc_sr_context *ctx)
+{
+    /* Terminology:
+     *
+     * fll   - frame list list, top level p2m, list of fl mfns
+     * fl    - frame list, mid level p2m, list of leaf mfns
+     * local - own allocated buffers, adjusted for bitness
+     * guest - mappings into the domain
+     */
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+    unsigned int x, saved_x, fpp, fll_entries, fl_entries;
+    xen_pfn_t fll_mfn, saved_mfn, max_pfn;
+
+    xen_pfn_t *local_fll = NULL;
+    void *guest_fll = NULL;
+    size_t local_fll_size;
+
+    xen_pfn_t *local_fl = NULL;
+    void *guest_fl = NULL;
+    size_t local_fl_size;
+
+    fpp = PAGE_SIZE / ctx->x86.pv.width;
+    fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
+    if ( fll_entries > fpp )
+    {
+        ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
+        goto err;
+    }
+
+    fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
+                        ctx->x86.pv.width);
+    if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
+    {
+        ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
+        goto err;
+    }
+
+    /* Map the guest top p2m. */
+    guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
+                                     PROT_READ, fll_mfn);
+    if ( !guest_fll )
+    {
+        PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
+        goto err;
+    }
+
+    local_fll_size = fll_entries * sizeof(*local_fll);
+    local_fll = malloc(local_fll_size);
+    if ( !local_fll )
+    {
+        ERROR("Cannot allocate %zu bytes for local p2m frame list list",
+              local_fll_size);
+        goto err;
+    }
+
+    if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
+    {
+        ERROR("Truncation detected copying p2m frame list list");
+        goto err;
+    }
+
+    /* Check for bad mfns in frame list list.
*/ + saved_mfn = 0; + saved_x = 0; + for ( x = 0; x < fll_entries; ++x ) + { + if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn ) + { + ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list", + local_fll[x], x, fll_entries); + goto err; + } + if ( local_fll[x] != saved_mfn ) + { + saved_mfn = local_fll[x]; + saved_x = x; + } + } + + /* + * Check for actual lower max_pfn: + * If the trailing entries of the frame list list were all the same we can + * assume they all reference mid pages all referencing p2m pages with all + * invalid entries. Otherwise there would be multiple pfns referencing all + * the same mfn which can't work across migration, as this sharing would be + * broken by the migration process. + * Adjust max_pfn if possible to avoid allocating much larger areas as + * needed for p2m and logdirty map. + */ + max_pfn = (saved_x + 1) * fpp * fpp - 1; + if ( max_pfn < ctx->x86.pv.max_pfn ) + { + ctx->x86.pv.max_pfn = max_pfn; + fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1; + } + ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp; + DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn, + ctx->x86.pv.p2m_frames); + fl_entries = (ctx->x86.pv.max_pfn / fpp) + 1; + + /* Map the guest mid p2m frames. */ + guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, + local_fll, fll_entries); + if ( !guest_fl ) + { + PERROR("Failed to map p2m frame list"); + goto err; + } + + local_fl_size = fl_entries * sizeof(*local_fl); + local_fl = malloc(local_fl_size); + if ( !local_fl ) + { + ERROR("Cannot allocate %zu bytes for local p2m frame list", + local_fl_size); + goto err; + } + + if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) ) + { + ERROR("Truncation detected copying p2m frame list"); + goto err; + } + + for ( x = 0; x < fl_entries; ++x ) + { + if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn ) + { + ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list", + local_fl[x], x, fl_entries); + goto err; + } + } + + /* Map the p2m leaves themselves. */ + rc = map_p2m_leaves(ctx, local_fl, fl_entries); + + err: + free(local_fl); + if ( guest_fl ) + munmap(guest_fl, fll_entries * PAGE_SIZE); + + free(local_fll); + if ( guest_fll ) + munmap(guest_fll, PAGE_SIZE); + + return rc; +} + +/* + * Get p2m_generation count. + * Returns an error if the generation count has changed since the last call. + */ +static int get_p2m_generation(struct xc_sr_context *ctx) +{ + uint64_t p2m_generation; + int rc; + + p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation, + ctx->x86.pv.width); + + rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1; + ctx->x86.pv.p2m_generation = p2m_generation; + + return rc; +} + +static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; + + if ( !ctx->save.live ) + return 0; + + rc = get_p2m_generation(ctx); + if ( rc ) + ERROR("p2m generation count changed. Migration aborted."); + + return rc; +} + +/* + * Map the guest p2m frames specified via a cr3 value, a virtual address, and + * the maximum pfn. PTE entries are 64 bits for both, 32 and 64 bit guests as + * in 32 bit case we support PAE guests only. 
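map_p2m_list() below walks ordinary x86 pagetables by hand, so the `shift = level * 9 + 3` in its loop is the usual 9-translation-bits-per-level rule combined with 8-byte entries. A self-contained illustration of the indexing (an approximation; the real code also applies a per-level base offset and mask):

#include <stdio.h>

/* Entry index selected by 'vaddr' at pagetable 'level' (4 = top). */
static unsigned int pt_index(unsigned long long vaddr, unsigned int level)
{
    unsigned int shift = level * 9 + 3;   /* 9 bits/level, 8-byte entries */

    return (vaddr >> shift) & 0x1ff;      /* 512 entries per table */
}

int main(void)
{
    /* The canonical kernel-half start has L4 index 256 on x86_64. */
    printf("%u\n", pt_index(0xffff800000000000ULL, 4));
    return 0;
}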
+ */ +static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3) +{ + xc_interface *xch = ctx->xch; + xen_vaddr_t p2m_vaddr, p2m_end, mask, off; + xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn; + uint64_t *ptes = NULL; + xen_pfn_t *mfns = NULL; + unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx; + int rc = -1; + + p2m_mfn = cr3_to_mfn(ctx, p2m_cr3); + assert(p2m_mfn != 0); + if ( p2m_mfn > ctx->x86.pv.max_mfn ) + { + ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3); + errno = ERANGE; + goto err; + } + + get_p2m_generation(ctx); + + p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr, + ctx->x86.pv.width); + fpp = PAGE_SIZE / ctx->x86.pv.width; + ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1; + p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1; + + if ( ctx->x86.pv.width == 8 ) + { + mask = 0x0000ffffffffffffULL; + if ( !is_canonical_address(p2m_vaddr) || + !is_canonical_address(p2m_end) || + p2m_end < p2m_vaddr || + (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 && + p2m_end > HYPERVISOR_VIRT_START_X86_64) ) + { + ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64, + p2m_vaddr, p2m_end); + errno = ERANGE; + goto err; + } + } + else + { + mask = 0x00000000ffffffffULL; + if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr || + (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 && + p2m_end > HYPERVISOR_VIRT_START_X86_32) ) + { + ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64, + p2m_vaddr, p2m_end); + errno = ERANGE; + goto err; + } + } + + DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx", + p2m_vaddr, p2m_end, p2m_mfn); + DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn, + ctx->x86.pv.p2m_frames); + + mfns = malloc(sizeof(*mfns)); + if ( !mfns ) + { + ERROR("Cannot allocate memory for array of %u mfns", 1); + goto err; + } + mfns[0] = p2m_mfn; + off = 0; + saved_mfn = 0; + idx_start = idx_end = saved_idx = 0; + + for ( level = ctx->x86.pv.levels; level > 0; level-- ) + { + n_pages = idx_end - idx_start + 1; + ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages); + if ( !ptes ) + { + PERROR("Failed to map %u page table pages for p2m list", n_pages); + goto err; + } + free(mfns); + + shift = level * 9 + 3; + idx_start = ((p2m_vaddr - off) & mask) >> shift; + idx_end = ((p2m_end - off) & mask) >> shift; + idx = idx_end - idx_start + 1; + mfns = malloc(sizeof(*mfns) * idx); + if ( !mfns ) + { + ERROR("Cannot allocate memory for array of %u mfns", idx); + goto err; + } + + for ( idx = idx_start; idx <= idx_end; idx++ ) + { + mfn = pte_to_frame(ptes[idx]); + if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn ) + { + ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list", + mfn, off + ((xen_vaddr_t)idx << shift), level); + errno = ERANGE; + goto err; + } + mfns[idx - idx_start] = mfn; + + /* Maximum pfn check at level 2. Same reasoning as for p2m tree. 
*/ + if ( level == 2 ) + { + if ( mfn != saved_mfn ) + { + saved_mfn = mfn; + saved_idx = idx - idx_start; + } + } + } + + if ( level == 2 ) + { + if ( saved_idx == idx_end ) + saved_idx++; + max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1; + if ( max_pfn < ctx->x86.pv.max_pfn ) + { + ctx->x86.pv.max_pfn = max_pfn; + ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp; + p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1; + idx_end = idx_start + saved_idx; + } + } + + munmap(ptes, n_pages * PAGE_SIZE); + ptes = NULL; + off = p2m_vaddr & ((mask >> shift) << shift); + } + + /* Map the p2m leaves themselves. */ + rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1); + + err: + free(mfns); + if ( ptes ) + munmap(ptes, n_pages * PAGE_SIZE); + + return rc; +} + +/* + * Map the guest p2m frames. + * Depending on guest support this might either be a virtual mapped linear + * list (preferred format) or a 3 level tree linked via mfns. + */ +static int map_p2m(struct xc_sr_context *ctx) +{ + uint64_t p2m_cr3; + + ctx->x86.pv.p2m_generation = ~0ULL; + ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn, + ctx->x86.pv.width) - 1; + p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width); + + return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx); +} + +/* + * Obtain a specific vcpus basic state and write an X86_PV_VCPU_BASIC record + * into the stream. Performs mfn->pfn conversion on architectural state. + */ +static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id) +{ + xc_interface *xch = ctx->xch; + xen_pfn_t mfn, pfn; + unsigned int i, gdt_count; + int rc = -1; + vcpu_guest_context_any_t vcpu; + struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { + .vcpu_id = id, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_VCPU_BASIC, + .length = sizeof(vhdr), + .data = &vhdr, + }; + + if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) ) + { + PERROR("Failed to get vcpu%u context", id); + goto err; + } + + /* Vcpu0 is special: Convert the suspend record to a pfn. */ + if ( id == 0 ) + { + mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Bad mfn for suspend record"); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + goto err; + } + SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn), + ctx->x86.pv.width); + } + + gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width); + if ( gdt_count > FIRST_RESERVED_GDT_ENTRY ) + { + ERROR("GDT entry count (%u) out of range (max %u)", + gdt_count, FIRST_RESERVED_GDT_ENTRY); + errno = ERANGE; + goto err; + } + gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */ + + /* Convert GDT frames to pfns. */ + for ( i = 0; i < gdt_count; ++i ) + { + mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + goto err; + } + SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn), + ctx->x86.pv.width); + } + + /* Convert CR3 to a pfn. */ + mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width)); + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Bad mfn for vcpu%u's cr3", id); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + goto err; + } + pfn = mfn_to_pfn(ctx, mfn); + SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width); + + /* 64bit guests: Convert CR1 (guest pagetables) to pfn. 
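+ * (Editor's note, illustrative only: ctrlreg[1] carries a frame number
+ * with bit 0 serving as a "valid" flag, so the rewrite below is simply
+ *
+ *     vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
+ *
+ * mirroring the ">> PAGE_SHIFT" used to extract the mfn.)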
*/ + if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] ) + { + mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT; + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + ERROR("Bad mfn for vcpu%u's cr1", id); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + goto err; + } + pfn = mfn_to_pfn(ctx, mfn); + vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT); + } + + if ( ctx->x86.pv.width == 8 ) + rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64)); + else + rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32)); + + err: + return rc; +} + +/* + * Obtain a specific vcpus extended state and write an X86_PV_VCPU_EXTENDED + * record into the stream. + */ +static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id) +{ + xc_interface *xch = ctx->xch; + struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { + .vcpu_id = id, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_VCPU_EXTENDED, + .length = sizeof(vhdr), + .data = &vhdr, + }; + struct xen_domctl domctl = { + .cmd = XEN_DOMCTL_get_ext_vcpucontext, + .domain = ctx->domid, + .u.ext_vcpucontext.vcpu = id, + }; + + if ( xc_domctl(xch, &domctl) < 0 ) + { + PERROR("Unable to get vcpu%u extended context", id); + return -1; + } + + /* No content? Skip the record. */ + if ( domctl.u.ext_vcpucontext.size == 0 ) + return 0; + + return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext, + domctl.u.ext_vcpucontext.size); +} + +/* + * Query to see whether a specific vcpu has xsave state and if so, write an + * X86_PV_VCPU_XSAVE record into the stream. + */ +static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id) +{ + xc_interface *xch = ctx->xch; + int rc = -1; + DECLARE_HYPERCALL_BUFFER(void, buffer); + struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { + .vcpu_id = id, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_VCPU_XSAVE, + .length = sizeof(vhdr), + .data = &vhdr, + }; + struct xen_domctl domctl = { + .cmd = XEN_DOMCTL_getvcpuextstate, + .domain = ctx->domid, + .u.vcpuextstate.vcpu = id, + }; + + if ( xc_domctl(xch, &domctl) < 0 ) + { + PERROR("Unable to get vcpu%u's xsave context", id); + goto err; + } + + /* No xsave state? skip this record. */ + if ( !domctl.u.vcpuextstate.xfeature_mask ) + goto out; + + buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size); + if ( !buffer ) + { + ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context", + domctl.u.vcpuextstate.size, id); + goto err; + } + + set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); + if ( xc_domctl(xch, &domctl) < 0 ) + { + PERROR("Unable to get vcpu%u's xsave context", id); + goto err; + } + + /* No xsave state? Skip this record. */ + if ( domctl.u.vcpuextstate.size == 0 ) + goto out; + + rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size); + if ( rc ) + goto err; + + out: + rc = 0; + + err: + xc_hypercall_buffer_free(xch, buffer); + + return rc; +} + +/* + * Query to see whether a specific vcpu has msr state and if so, write an + * X86_PV_VCPU_MSRS record into the stream. 
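+ *
+ * (Editor's illustration, not part of this patch: like the xsave record
+ * above, this uses the common two-pass domctl pattern --
+ *
+ *     xc_domctl(xch, &domctl);                     // sizing pass
+ *     buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+ *     set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+ *     xc_domctl(xch, &domctl);                     // data pass
+ *
+ * -- and a zero msr_count after either pass just skips the record.)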
+ */ +static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id) +{ + xc_interface *xch = ctx->xch; + int rc = -1; + size_t buffersz; + DECLARE_HYPERCALL_BUFFER(void, buffer); + struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { + .vcpu_id = id, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_VCPU_MSRS, + .length = sizeof(vhdr), + .data = &vhdr, + }; + struct xen_domctl domctl = { + .cmd = XEN_DOMCTL_get_vcpu_msrs, + .domain = ctx->domid, + .u.vcpu_msrs.vcpu = id, + }; + + if ( xc_domctl(xch, &domctl) < 0 ) + { + PERROR("Unable to get vcpu%u's msrs", id); + goto err; + } + + /* No MSRs? skip this record. */ + if ( !domctl.u.vcpu_msrs.msr_count ) + goto out; + + buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t); + buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz); + if ( !buffer ) + { + ERROR("Unable to allocate %zu bytes for vcpu%u's msrs", + buffersz, id); + goto err; + } + + set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer); + if ( xc_domctl(xch, &domctl) < 0 ) + { + PERROR("Unable to get vcpu%u's msrs", id); + goto err; + } + + /* No MSRs? Skip this record. */ + if ( domctl.u.vcpu_msrs.msr_count == 0 ) + goto out; + + rc = write_split_record(ctx, &rec, buffer, + domctl.u.vcpu_msrs.msr_count * + sizeof(xen_domctl_vcpu_msr_t)); + if ( rc ) + goto err; + + out: + rc = 0; + + err: + xc_hypercall_buffer_free(xch, buffer); + + return rc; +} + +/* + * For each vcpu, if it is online, write its state into the stream. + */ +static int write_all_vcpu_information(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + xc_vcpuinfo_t vinfo; + unsigned int i; + int rc; + + for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i ) + { + rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo); + if ( rc ) + { + PERROR("Failed to get vcpu%u information", i); + return rc; + } + + /* Vcpu offline? skip all these records. */ + if ( !vinfo.online ) + continue; + + rc = write_one_vcpu_basic(ctx, i); + if ( rc ) + return rc; + + rc = write_one_vcpu_extended(ctx, i); + if ( rc ) + return rc; + + rc = write_one_vcpu_xsave(ctx, i); + if ( rc ) + return rc; + + rc = write_one_vcpu_msrs(ctx, i); + if ( rc ) + return rc; + } + + return 0; +} + +/* + * Writes an X86_PV_INFO record into the stream. + */ +static int write_x86_pv_info(struct xc_sr_context *ctx) +{ + struct xc_sr_rec_x86_pv_info info = { + .guest_width = ctx->x86.pv.width, + .pt_levels = ctx->x86.pv.levels, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_INFO, + .length = sizeof(info), + .data = &info, + }; + + return write_record(ctx, &rec); +} + +/* + * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list + * of pfns making up the p2m table. + */ +static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx) +{ + xc_interface *xch = ctx->xch; + int rc; unsigned int i; + size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t); + uint64_t *data = NULL; + struct xc_sr_rec_x86_pv_p2m_frames hdr = { + .end_pfn = ctx->x86.pv.max_pfn, + }; + struct xc_sr_record rec = { + .type = REC_TYPE_X86_PV_P2M_FRAMES, + .length = sizeof(hdr), + .data = &hdr, + }; + + /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). 
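+ * Where xen_pfn_t is narrower than 8 bytes (e.g. a 32 bit x86
+ * toolstack build) each entry must be widened individually, as a raw
+ * memcpy() would produce the wrong layout. (Editor's sketch with
+ * hypothetical names, illustrative only:
+ *
+ *     for ( i = 0; i < frames; ++i )
+ *         data64[i] = pfns32[i];
+ *
+ * which is what the loop below does with ctx->x86.pv.p2m_pfns.)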
*/ + if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) ) + { + if ( !(data = malloc(datasz)) ) + { + ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data", + datasz); + return -1; + } + + for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i ) + data[i] = ctx->x86.pv.p2m_pfns[i]; + } + else + data = (uint64_t *)ctx->x86.pv.p2m_pfns; + + rc = write_split_record(ctx, &rec, data, datasz); + + if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns ) + free(data); + + return rc; +} + +/* + * Writes an SHARED_INFO record into the stream. + */ +static int write_shared_info(struct xc_sr_context *ctx) +{ + struct xc_sr_record rec = { + .type = REC_TYPE_SHARED_INFO, + .length = PAGE_SIZE, + .data = ctx->x86.pv.shinfo, + }; + + return write_record(ctx, &rec); +} + +/* + * Normalise a pagetable for the migration stream. Performs mfn->pfn + * conversions on the ptes. + */ +static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src, + uint64_t *dst, unsigned long type) +{ + xc_interface *xch = ctx->xch; + uint64_t pte; + unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */ + + type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( ctx->x86.pv.levels == 4 ) + { + /* 64bit guests only have Xen mappings in their L4 tables. */ + if ( type == XEN_DOMCTL_PFINFO_L4TAB ) + { + xen_first = (HYPERVISOR_VIRT_START_X86_64 >> + L4_PAGETABLE_SHIFT_X86_64) & 511; + xen_last = (HYPERVISOR_VIRT_END_X86_64 >> + L4_PAGETABLE_SHIFT_X86_64) & 511; + } + } + else + { + switch ( type ) + { + case XEN_DOMCTL_PFINFO_L4TAB: + ERROR("??? Found L4 table for 32bit guest"); + errno = EINVAL; + return -1; + + case XEN_DOMCTL_PFINFO_L3TAB: + /* 32bit guests can only use the first 4 entries of their L3 tables. + * All other are potentially used by Xen. */ + xen_first = 4; + xen_last = 511; + break; + + case XEN_DOMCTL_PFINFO_L2TAB: + /* It is hard to spot Xen mappings in a 32bit guest's L2. Most + * are normal but only a few will have Xen mappings. + */ + i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511; + if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 ) + { + xen_first = i; + xen_last = (HYPERVISOR_VIRT_END_X86_32 >> + L2_PAGETABLE_SHIFT_PAE) & 511; + } + break; + } + } + + for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) + { + xen_pfn_t mfn; + + pte = src[i]; + + /* Remove Xen mappings: Xen will reconstruct on the other side. */ + if ( i >= xen_first && i <= xen_last ) + pte = 0; + + /* + * Errors during the live part of migration are expected as a result + * of split pagetable updates, page type changes, active grant + * mappings etc. The pagetable will need to be resent after pausing. + * In such cases we fail with EAGAIN. + * + * For domains which are already paused, errors are fatal. + */ + if ( pte & _PAGE_PRESENT ) + { + mfn = pte_to_frame(pte); + +#ifdef __i386__ + if ( mfn == INVALID_MFN ) + { + if ( !ctx->dominfo.paused ) + errno = EAGAIN; + else + { + ERROR("PTE truncation detected. 
L%lu[%u] = %016"PRIx64, + type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); + errno = E2BIG; + } + return -1; + } +#endif + + if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) ) + { + ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")", + type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); + errno = E2BIG; + return -1; + } + + if ( !mfn_in_pseudophysmap(ctx, mfn) ) + { + if ( !ctx->dominfo.paused ) + errno = EAGAIN; + else + { + ERROR("Bad mfn for L%lu[%u]", + type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i); + dump_bad_pseudophysmap_entry(ctx, mfn); + errno = ERANGE; + } + return -1; + } + + pte = merge_pte(pte, mfn_to_pfn(ctx, mfn)); + } + + dst[i] = pte; + } + + return 0; +} + +static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx, + xen_pfn_t pfn) +{ + assert(pfn <= ctx->x86.pv.max_pfn); + + return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width); +} + + +/* + * save_ops function. Performs pagetable normalisation on appropriate pages. + */ +static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type, + void **page) +{ + xc_interface *xch = ctx->xch; + void *local_page; + int rc; + + type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB ) + return 0; + + local_page = malloc(PAGE_SIZE); + if ( !local_page ) + { + ERROR("Unable to allocate scratch page"); + rc = -1; + goto out; + } + + rc = normalise_pagetable(ctx, *page, local_page, type); + *page = local_page; + + out: + return rc; +} + +/* + * save_ops function. Queries domain information and maps the Xen m2p and the + * guests shinfo and p2m table. + */ +static int x86_pv_setup(struct xc_sr_context *ctx) +{ + int rc; + + rc = x86_pv_domain_info(ctx); + if ( rc ) + return rc; + + rc = x86_pv_map_m2p(ctx); + if ( rc ) + return rc; + + rc = map_shinfo(ctx); + if ( rc ) + return rc; + + rc = map_p2m(ctx); + if ( rc ) + return rc; + + return 0; +} + +static int x86_pv_static_data(struct xc_sr_context *ctx) +{ + int rc; + + rc = write_x86_pv_info(ctx); + if ( rc ) + return rc; + + rc = write_x86_cpu_policy_records(ctx); + if ( rc ) + return rc; + + return 0; +} + +static int x86_pv_start_of_stream(struct xc_sr_context *ctx) +{ + int rc; + + /* + * Ideally should be able to change during migration. Currently + * corruption will occur if the contents or location of the P2M changes + * during the live migration loop. If one is very lucky, the breakage + * will not be subtle. 
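+ *
+ * (Editor's note: for the linear p2m list this is partially guarded --
+ * x86_pv_check_vm_state() below defers to
+ * x86_pv_check_vm_state_p2m_list(), which re-reads arch.p2m_generation
+ * while saving live and aborts the migration if the list has moved.
+ * The 3 level p2m tree keeps p2m_generation at ~0ULL, so it has no
+ * equivalent check.)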
+ */ + rc = write_x86_pv_p2m_frames(ctx); + if ( rc ) + return rc; + + return 0; +} + +static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx) +{ + return 0; +} + +static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx) +{ + int rc; + + rc = write_x86_tsc_info(ctx); + if ( rc ) + return rc; + + rc = write_shared_info(ctx); + if ( rc ) + return rc; + + rc = write_all_vcpu_information(ctx); + if ( rc ) + return rc; + + return 0; +} + +static int x86_pv_check_vm_state(struct xc_sr_context *ctx) +{ + if ( ctx->x86.pv.p2m_generation == ~0ULL ) + return 0; + + return x86_pv_check_vm_state_p2m_list(ctx); +} + +static int x86_pv_cleanup(struct xc_sr_context *ctx) +{ + free(ctx->x86.pv.p2m_pfns); + + if ( ctx->x86.pv.p2m ) + munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE); + + if ( ctx->x86.pv.shinfo ) + munmap(ctx->x86.pv.shinfo, PAGE_SIZE); + + if ( ctx->x86.pv.m2p ) + munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE); + + return 0; +} + +struct xc_sr_save_ops save_ops_x86_pv = +{ + .pfn_to_gfn = x86_pv_pfn_to_gfn, + .normalise_page = x86_pv_normalise_page, + .setup = x86_pv_setup, + .static_data = x86_pv_static_data, + .start_of_stream = x86_pv_start_of_stream, + .start_of_checkpoint = x86_pv_start_of_checkpoint, + .end_of_checkpoint = x86_pv_end_of_checkpoint, + .check_vm_state = x86_pv_check_vm_state, + .cleanup = x86_pv_cleanup, +}; + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_sr_stream_format.h b/tools/libs/guest/xg_sr_stream_format.h new file mode 100644 index 0000000000..8a0da26f75 --- /dev/null +++ b/tools/libs/guest/xg_sr_stream_format.h @@ -0,0 +1,150 @@ +#ifndef __STREAM_FORMAT__H +#define __STREAM_FORMAT__H + +/* + * C structures for the Migration v2 stream format. 
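+ *
+ * (Editor's illustration, not normative -- the document referenced
+ * below is authoritative. Broadly, a v2 stream consists of:
+ *
+ *     struct xc_sr_ihdr                  image header: marker, id, options
+ *     struct xc_sr_dhdr                  domain header: guest type, page shift
+ *     { struct xc_sr_rhdr + body }...    records, padded to 8 octets
+ *     REC_TYPE_END                       terminating record
+ * )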
+ * See docs/specs/libxc-migration-stream.pandoc + */ + +#include + +/* + * Image Header + */ +struct xc_sr_ihdr +{ + uint64_t marker; + uint32_t id; + uint32_t version; + uint16_t options; + uint16_t _res1; + uint32_t _res2; +}; + +#define IHDR_MARKER 0xffffffffffffffffULL +#define IHDR_ID 0x58454E46U + +#define _IHDR_OPT_ENDIAN 0 +#define IHDR_OPT_LITTLE_ENDIAN (0 << _IHDR_OPT_ENDIAN) +#define IHDR_OPT_BIG_ENDIAN (1 << _IHDR_OPT_ENDIAN) + +/* + * Domain Header + */ +struct xc_sr_dhdr +{ + uint32_t type; + uint16_t page_shift; + uint16_t _res1; + uint32_t xen_major; + uint32_t xen_minor; +}; + +#define DHDR_TYPE_X86_PV 0x00000001U +#define DHDR_TYPE_X86_HVM 0x00000002U + +/* + * Record Header + */ +struct xc_sr_rhdr +{ + uint32_t type; + uint32_t length; +}; + +/* All records must be aligned up to an 8 octet boundary */ +#define REC_ALIGN_ORDER (3U) +/* Somewhat arbitrary - 128MB */ +#define REC_LENGTH_MAX (128U << 20) + +#define REC_TYPE_END 0x00000000U +#define REC_TYPE_PAGE_DATA 0x00000001U +#define REC_TYPE_X86_PV_INFO 0x00000002U +#define REC_TYPE_X86_PV_P2M_FRAMES 0x00000003U +#define REC_TYPE_X86_PV_VCPU_BASIC 0x00000004U +#define REC_TYPE_X86_PV_VCPU_EXTENDED 0x00000005U +#define REC_TYPE_X86_PV_VCPU_XSAVE 0x00000006U +#define REC_TYPE_SHARED_INFO 0x00000007U +#define REC_TYPE_X86_TSC_INFO 0x00000008U +#define REC_TYPE_HVM_CONTEXT 0x00000009U +#define REC_TYPE_HVM_PARAMS 0x0000000aU +#define REC_TYPE_TOOLSTACK 0x0000000bU +#define REC_TYPE_X86_PV_VCPU_MSRS 0x0000000cU +#define REC_TYPE_VERIFY 0x0000000dU +#define REC_TYPE_CHECKPOINT 0x0000000eU +#define REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST 0x0000000fU +#define REC_TYPE_STATIC_DATA_END 0x00000010U +#define REC_TYPE_X86_CPUID_POLICY 0x00000011U +#define REC_TYPE_X86_MSR_POLICY 0x00000012U + +#define REC_TYPE_OPTIONAL 0x80000000U + +/* PAGE_DATA */ +struct xc_sr_rec_page_data_header +{ + uint32_t count; + uint32_t _res1; + uint64_t pfn[0]; +}; + +#define PAGE_DATA_PFN_MASK 0x000fffffffffffffULL +#define PAGE_DATA_TYPE_MASK 0xf000000000000000ULL + +/* X86_PV_INFO */ +struct xc_sr_rec_x86_pv_info +{ + uint8_t guest_width; + uint8_t pt_levels; + uint8_t _res[6]; +}; + +/* X86_PV_P2M_FRAMES */ +struct xc_sr_rec_x86_pv_p2m_frames +{ + uint32_t start_pfn; + uint32_t end_pfn; + uint64_t p2m_pfns[0]; +}; + +/* X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} */ +struct xc_sr_rec_x86_pv_vcpu_hdr +{ + uint32_t vcpu_id; + uint32_t _res1; + uint8_t context[0]; +}; + +/* X86_TSC_INFO */ +struct xc_sr_rec_x86_tsc_info +{ + uint32_t mode; + uint32_t khz; + uint64_t nsec; + uint32_t incarnation; + uint32_t _res1; +}; + +/* HVM_PARAMS */ +struct xc_sr_rec_hvm_params_entry +{ + uint64_t index; + uint64_t value; +}; + +struct xc_sr_rec_hvm_params +{ + uint32_t count; + uint32_t _res1; + struct xc_sr_rec_hvm_params_entry param[0]; +}; + +#endif +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libs/guest/xg_suspend.c b/tools/libs/guest/xg_suspend.c new file mode 100644 index 0000000000..0ce6364963 --- /dev/null +++ b/tools/libs/guest/xg_suspend.c @@ -0,0 +1,202 @@ +/* + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see . + */ + +#include +#include + +#include + +#include "xc_private.h" +#include "xenguest.h" + +#define SUSPEND_LOCK_FILE XEN_RUN_DIR "/suspend-evtchn-%d.lock" + +/* + * locking + */ + +#define ERR(x) do{ \ + ERROR("Can't " #x " lock file for suspend event channel %s: %s\n", \ + suspend_file, strerror(errno)); \ + goto err; \ +}while(0) + +#define SUSPEND_FILE_BUFLEN (sizeof(SUSPEND_LOCK_FILE) + 10) + +static void get_suspend_file(char buf[], uint32_t domid) +{ + snprintf(buf, SUSPEND_FILE_BUFLEN, SUSPEND_LOCK_FILE, domid); +} + +static int lock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd) +{ + int fd = -1, r; + char suspend_file[SUSPEND_FILE_BUFLEN]; + struct stat ours, theirs; + struct flock fl; + + get_suspend_file(suspend_file, domid); + + *lockfd = -1; + + for (;;) { + if (fd >= 0) + close (fd); + + fd = open(suspend_file, O_CREAT | O_RDWR, 0600); + if (fd < 0) + ERR("create"); + + r = fcntl(fd, F_SETFD, FD_CLOEXEC); + if (r) + ERR("fcntl F_SETFD FD_CLOEXEC"); + + memset(&fl, 0, sizeof(fl)); + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_len = 1; + r = fcntl(fd, F_SETLK, &fl); + if (r) + ERR("fcntl F_SETLK"); + + r = fstat(fd, &ours); + if (r) + ERR("fstat"); + + r = stat(suspend_file, &theirs); + if (r) { + if (errno == ENOENT) + /* try again */ + continue; + ERR("stat"); + } + + if (ours.st_ino != theirs.st_ino) + /* someone else must have removed it while we were locking it */ + continue; + + break; + } + + *lockfd = fd; + return 0; + + err: + if (fd >= 0) + close(fd); + + return -1; +} + +static int unlock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd) +{ + int r; + char suspend_file[SUSPEND_FILE_BUFLEN]; + + if (*lockfd < 0) + return 0; + + get_suspend_file(suspend_file, domid); + + r = unlink(suspend_file); + if (r) + ERR("unlink"); + + r = close(*lockfd); + *lockfd = -1; + if (r) + ERR("close"); + + err: + if (*lockfd >= 0) + close(*lockfd); + + return -1; +} + +int xc_await_suspend(xc_interface *xch, xenevtchn_handle *xce, int suspend_evtchn) +{ + int rc; + + do { + rc = xenevtchn_pending(xce); + if (rc < 0) { + ERROR("error polling suspend notification channel: %d", rc); + return -1; + } + } while (rc != suspend_evtchn); + + /* harmless for one-off suspend */ + if (xenevtchn_unmask(xce, suspend_evtchn) < 0) + ERROR("failed to unmask suspend notification channel: %d", rc); + + return 0; +} + +/* Internal callers are allowed to call this with suspend_evtchn<0 + * but *lockfd>0. 
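+ *
+ * (Editor's sketch of the expected calling sequence, illustrative only;
+ * remote_port is assumed to have been fetched by the caller, e.g. from
+ * xenstore:
+ *
+ *     int lockfd;
+ *     int port = xc_suspend_evtchn_init_sane(xch, xce, domid,
+ *                                            remote_port, &lockfd);
+ *     if ( port >= 0 )
+ *     {
+ *         xc_await_suspend(xch, xce, port);
+ *         xc_suspend_evtchn_release(xch, xce, domid, port, &lockfd);
+ *     }
+ * )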
*/ +int xc_suspend_evtchn_release(xc_interface *xch, xenevtchn_handle *xce, + uint32_t domid, int suspend_evtchn, int *lockfd) +{ + if (suspend_evtchn >= 0) + xenevtchn_unbind(xce, suspend_evtchn); + + return unlock_suspend_event(xch, domid, lockfd); +} + +int xc_suspend_evtchn_init_sane(xc_interface *xch, xenevtchn_handle *xce, + uint32_t domid, int port, int *lockfd) +{ + int rc, suspend_evtchn = -1; + + if (lock_suspend_event(xch, domid, lockfd)) { + errno = EINVAL; + goto cleanup; + } + + suspend_evtchn = xenevtchn_bind_interdomain(xce, domid, port); + if (suspend_evtchn < 0) { + ERROR("failed to bind suspend event channel: %d", suspend_evtchn); + goto cleanup; + } + + rc = xc_domain_subscribe_for_suspend(xch, domid, port); + if (rc < 0) { + ERROR("failed to subscribe to domain: %d", rc); + goto cleanup; + } + + return suspend_evtchn; + +cleanup: + xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, lockfd); + + return -1; +} + +int xc_suspend_evtchn_init_exclusive(xc_interface *xch, xenevtchn_handle *xce, + uint32_t domid, int port, int *lockfd) +{ + int suspend_evtchn; + + suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, lockfd); + if (suspend_evtchn < 0) + return suspend_evtchn; + + /* event channel is pending immediately after binding */ + xc_await_suspend(xch, xce, suspend_evtchn); + + return suspend_evtchn; +} diff --git a/tools/libs/libs.mk b/tools/libs/libs.mk index 4679268fc2..9d0ed08846 100644 --- a/tools/libs/libs.mk +++ b/tools/libs/libs.mk @@ -34,7 +34,7 @@ PKG_CONFIG_DESC ?= The $(PKG_CONFIG_NAME) library for Xen hypervisor PKG_CONFIG_VERSION := $(MAJOR).$(MINOR) PKG_CONFIG_USELIBS := $(SHLIB_libxen$(LIBNAME)) PKG_CONFIG_LIB := xen$(LIBNAME) -PKG_CONFIG_REQPRIV := $(subst $(space),$(comma),$(strip $(foreach lib,$(USELIBS_$(LIBNAME)),xen$(lib)))) +PKG_CONFIG_REQPRIV := $(subst $(space),$(comma),$(strip $(foreach lib,$(patsubst ctrl,control,$(USELIBS_$(LIBNAME))),xen$(lib)))) ifneq ($(CONFIG_LIBXC_MINIOS),y) PKG_CONFIG_INST := $(PKG_CONFIG) diff --git a/tools/libs/uselibs.mk b/tools/libs/uselibs.mk index 8e45e8d917..9619c576ba 100644 --- a/tools/libs/uselibs.mk +++ b/tools/libs/uselibs.mk @@ -18,3 +18,5 @@ LIBS_LIBS += hypfs USELIBS_hypfs := toollog toolcore call LIBS_LIBS += ctrl USELIBS_ctrl := toollog call evtchn gnttab foreignmemory devicemodel +LIBS_LIBS += guest +USELIBS_guest := evtchn ctrl diff --git a/tools/libxc/COPYING b/tools/libxc/COPYING deleted file mode 100644 index 7ca8702509..0000000000 --- a/tools/libxc/COPYING +++ /dev/null @@ -1,467 +0,0 @@ -Note that the only valid version of the LGPL as far as the files in -this directory (and its subdirectories) are concerned is _this_ -particular version of the license (i.e., *only* v2.1, not v2.2 or v3.x -or whatever), unless explicitly otherwise stated. - -Where clause 3 is invoked in order to relicense under the GPL then -this shall be considered to be GPL v2 only for files which have -specified LGPL v2.1 only. - - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] 
- - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. 
The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. 
- - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. 
You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. 
As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. 
- -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile deleted file mode 100644 index 44fa0488c9..0000000000 --- a/tools/libxc/Makefile +++ /dev/null @@ -1,219 +0,0 @@ -XEN_ROOT = $(CURDIR)/../.. -include $(XEN_ROOT)/tools/Rules.mk - -MAJOR = 4.15 -MINOR = 0 - -ifeq ($(CONFIG_LIBXC_MINIOS),y) -# Save/restore of a domain is currently incompatible with a stubdom environment -override CONFIG_MIGRATE := n -endif - -LINK_FILES := xc_private.h xc_core.h xc_core_x86.h xc_core_arm.h xc_bitops.h - -$(LINK_FILES): - ln -sf $(XEN_ROOT)/tools/libs/ctrl/$(notdir $@) $@ - -GUEST_SRCS-y := -GUEST_SRCS-y += xg_private.c -GUEST_SRCS-y += xg_domain.c -GUEST_SRCS-y += xg_suspend.c -ifeq ($(CONFIG_MIGRATE),y) -GUEST_SRCS-y += xg_sr_common.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_common_x86.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_common_x86_pv.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_restore_x86_pv.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_restore_x86_hvm.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_save_x86_pv.c -GUEST_SRCS-$(CONFIG_X86) += xg_sr_save_x86_hvm.c -GUEST_SRCS-y += xg_sr_restore.c -GUEST_SRCS-y += xg_sr_save.c -GUEST_SRCS-y += xg_offline_page.c -else -GUEST_SRCS-y += xg_nomigrate.c -endif - -vpath %.c ../../xen/common/libelf -CFLAGS += -I../../xen/common/libelf - -ELF_SRCS-y += libelf-tools.c libelf-loader.c -ELF_SRCS-y += libelf-dominfo.c - -GUEST_SRCS-y += $(ELF_SRCS-y) - -$(patsubst %.c,%.o,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign -$(patsubst %.c,%.opic,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign - -ifeq ($(CONFIG_X86),y) # Add libx86 to the build -vpath %.c ../../xen/lib/x86 - -GUEST_SRCS-y += cpuid.c msr.c -endif - -# new domain builder -GUEST_SRCS-y += xg_dom_core.c -GUEST_SRCS-y += xg_dom_boot.c -GUEST_SRCS-y += xg_dom_elfloader.c -GUEST_SRCS-$(CONFIG_X86) += xg_dom_bzimageloader.c -GUEST_SRCS-$(CONFIG_X86) += xg_dom_decompress_lz4.c -GUEST_SRCS-$(CONFIG_X86) += xg_dom_hvmloader.c -GUEST_SRCS-$(CONFIG_ARM) += xg_dom_armzimageloader.c -GUEST_SRCS-y += xg_dom_binloader.c -GUEST_SRCS-y += xg_dom_compat_linux.c - -GUEST_SRCS-$(CONFIG_X86) += xg_dom_x86.c -GUEST_SRCS-$(CONFIG_X86) += xg_cpuid_x86.c -GUEST_SRCS-$(CONFIG_ARM) += xg_dom_arm.c - -ifeq ($(CONFIG_LIBXC_MINIOS),y) -GUEST_SRCS-y += xg_dom_decompress_unsafe.c -GUEST_SRCS-y += xg_dom_decompress_unsafe_bzip2.c -GUEST_SRCS-y += xg_dom_decompress_unsafe_lzma.c -GUEST_SRCS-y += xg_dom_decompress_unsafe_lzo1x.c -GUEST_SRCS-y += xg_dom_decompress_unsafe_xz.c -endif - --include $(XEN_TARGET_ARCH)/Makefile - -CFLAGS += -Werror -Wmissing-prototypes -CFLAGS += -I. 
-I./include $(CFLAGS_xeninclude) -CFLAGS += -D__XEN_TOOLS__ - -# Needed for posix_fadvise64() in xc_linux.c -CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE - -CFLAGS += $(PTHREAD_CFLAGS) -CFLAGS += $(CFLAGS_libxentoollog) -CFLAGS += $(CFLAGS_libxenevtchn) -CFLAGS += $(CFLAGS_libxendevicemodel) - -GUEST_LIB_OBJS := $(patsubst %.c,%.o,$(GUEST_SRCS-y)) -GUEST_PIC_OBJS := $(patsubst %.c,%.opic,$(GUEST_SRCS-y)) - -$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): CFLAGS += -include $(XEN_ROOT)/tools/config.h - -# libxenguest includes xc_private.h, so needs this despite not using -# this functionality directly. -$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): CFLAGS += $(CFLAGS_libxencall) $(CFLAGS_libxenforeignmemory) - -LIB += libxenguest.a -ifneq ($(nosharedlibs),y) -LIB += libxenguest.so libxenguest.so.$(MAJOR) libxenguest.so.$(MAJOR).$(MINOR) -endif - -genpath-target = $(call buildmakevars2header,_paths.h) -$(eval $(genpath-target)) - -xc_private.h: _paths.h - -$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): $(LINK_FILES) - -PKG_CONFIG := xenguest.pc -PKG_CONFIG_VERSION := $(MAJOR).$(MINOR) - -xenguest.pc: PKG_CONFIG_NAME = Xenguest -xenguest.pc: PKG_CONFIG_DESC = The Xenguest library for Xen hypervisor -xenguest.pc: PKG_CONFIG_USELIBS = $(SHLIB_libxenguest) -xenguest.pc: PKG_CONFIG_LIB = xenguest -xenguest.pc: PKG_CONFIG_REQPRIV = xentoollog,xencall,xenforeignmemory,xenevtchn - -$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_NAME = Xenguest -$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_DESC = The Xenguest library for Xen hypervisor -$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_USELIBS = $(SHLIB_libxenguest) -$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_LIB = xenguest -$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_REQPRIV = xentoollog,xencall,xenforeignmemory,xenevtchn,xencontrol - -ifneq ($(CONFIG_LIBXC_MINIOS),y) -PKG_CONFIG_INST := $(PKG_CONFIG) -$(PKG_CONFIG_INST): PKG_CONFIG_PREFIX = $(prefix) -$(PKG_CONFIG_INST): PKG_CONFIG_INCDIR = $(includedir) -$(PKG_CONFIG_INST): PKG_CONFIG_LIBDIR = $(libdir) -endif - -PKG_CONFIG_LOCAL := $(foreach pc,$(PKG_CONFIG),$(PKG_CONFIG_DIR)/$(pc)) - -$(PKG_CONFIG_LOCAL): PKG_CONFIG_PREFIX = $(XEN_ROOT) -$(PKG_CONFIG_LOCAL): PKG_CONFIG_INCDIR = $(XEN_libxenctrl)/include -$(PKG_CONFIG_LOCAL): PKG_CONFIG_LIBDIR = $(CURDIR) -$(PKG_CONFIG_LOCAL): PKG_CONFIG_CFLAGS_LOCAL = $(CFLAGS_xeninclude) - -.PHONY: all -all: build - -.PHONY: build -build: - $(MAKE) libs - -.PHONY: libs -libs: $(LIB) $(PKG_CONFIG_INST) $(PKG_CONFIG_LOCAL) - -.PHONY: install -install: build - $(INSTALL_DIR) $(DESTDIR)$(libdir) - $(INSTALL_DIR) $(DESTDIR)$(includedir) - $(INSTALL_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir) - $(INSTALL_DATA) libxenguest.a $(DESTDIR)$(libdir) - $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR) - $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenguest.so - $(INSTALL_DATA) include/xenguest.h $(DESTDIR)$(includedir) - $(INSTALL_DATA) xenguest.pc $(DESTDIR)$(PKG_INSTALLDIR) - -.PHONY: uninstall -uninstall: - rm -f $(DESTDIR)$(PKG_INSTALLDIR)/xenguest.pc - rm -f $(DESTDIR)$(includedir)/xenguest.h - rm -f $(DESTDIR)$(libdir)/libxenguest.so - rm -f $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR) - rm -f $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR).$(MINOR) - rm -f $(DESTDIR)$(libdir)/libxenguest.a - -.PHONY: TAGS -TAGS: - etags -t *.c *.h - -.PHONY: clean -clean: - rm -rf *.rpm $(LIB) *~ $(DEPS_RM) \ - _paths.h \ - $(LINK_FILES) \ - xenguest.pc \ - $(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS) - -.PHONY: distclean -distclean: clean - -.PHONY: rpm -rpm: build - rm 
-rf staging - mkdir staging - mkdir staging/i386 - rpmbuild --define "staging$$PWD/staging" --define '_builddir.' \ - --define "_rpmdir$$PWD/staging" -bb rpm.spec - mv staging/i386/*.rpm . - rm -rf staging - -# libxenguest - -libxenguest.a: $(GUEST_LIB_OBJS) - $(AR) rc $@ $^ - -libxenguest.so: libxenguest.so.$(MAJOR) - $(SYMLINK_SHLIB) $< $@ -libxenguest.so.$(MAJOR): libxenguest.so.$(MAJOR).$(MINOR) - $(SYMLINK_SHLIB) $< $@ - -ifeq ($(CONFIG_MiniOS),y) -zlib-options = -else -zlib-options = $(ZLIB) -endif - -xc_dom_bzimageloader.o: CFLAGS += $(filter -D%,$(zlib-options)) -xc_dom_bzimageloader.opic: CFLAGS += $(filter -D%,$(zlib-options)) - -libxenguest.so.$(MAJOR).$(MINOR): COMPRESSION_LIBS = $(filter -l%,$(zlib-options)) -libxenguest.so.$(MAJOR).$(MINOR): $(GUEST_PIC_OBJS) - $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxenguest.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $(GUEST_PIC_OBJS) $(COMPRESSION_LIBS) -lz $(LDLIBS_libxenevtchn) $(LDLIBS_libxenctrl) $(PTHREAD_LIBS) $(APPEND_LDFLAGS) - --include $(DEPS_INCLUDE) - diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h deleted file mode 100644 index 4643384790..0000000000 --- a/tools/libxc/include/xenguest.h +++ /dev/null @@ -1,327 +0,0 @@ -/****************************************************************************** - * xenguest.h - * - * A library for guest domain management in Xen. - * - * Copyright (c) 2003-2004, K A Fraser. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - */ - -#ifndef XENGUEST_H -#define XENGUEST_H - -#include - -#define XC_NUMA_NO_NODE (~0U) - -#define XCFLAGS_LIVE (1 << 0) -#define XCFLAGS_DEBUG (1 << 1) - -#define X86_64_B_SIZE 64 -#define X86_32_B_SIZE 32 - -/* - * User not using xc_suspend_* / xc_await_suspent may not want to - * include the full libxenevtchn API here. - */ -struct xenevtchn_handle; - -/* For save's precopy_policy(). */ -struct precopy_stats -{ - unsigned int iteration; - unsigned int total_written; - long dirty_count; /* -1 if unknown */ -}; - -/* - * A precopy_policy callback may not be running in the same address - * space as libxc an so precopy_stats is passed by value. - */ -typedef int (*precopy_policy_t)(struct precopy_stats, void *); - -/* callbacks provided by xc_domain_save */ -struct save_callbacks { - /* - * Called after expiration of checkpoint interval, - * to suspend the guest. - */ - int (*suspend)(void *data); - - /* - * Called before and after every batch of page data sent during - * the precopy phase of a live migration to ask the caller what - * to do next based on the current state of the precopy migration. - * - * Should return one of the values listed below: - */ -#define XGS_POLICY_ABORT (-1) /* Abandon the migration entirely - * and tidy up. */ -#define XGS_POLICY_CONTINUE_PRECOPY 0 /* Remain in the precopy phase. */ -#define XGS_POLICY_STOP_AND_COPY 1 /* Immediately suspend and transmit the - * remaining dirty pages. 
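As a concrete illustration of this contract, a minimal precopy_policy callback might look like the sketch below; the convergence thresholds are invented for the example, and precopy_stats arrives by value, as noted above.

/* Sketch only: stop-and-copy once the dirty set is small, or give up on
 * convergence after five precopy iterations.  Thresholds are arbitrary. */
static int example_precopy_policy(struct precopy_stats stats, void *user)
{
    if ( stats.dirty_count >= 0 && stats.dirty_count < 64 )
        return XGS_POLICY_STOP_AND_COPY;

    if ( stats.iteration >= 5 )
        return XGS_POLICY_STOP_AND_COPY;

    return XGS_POLICY_CONTINUE_PRECOPY;
}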
*/ - precopy_policy_t precopy_policy; - - /* - * Called after the guest's dirty pages have been - * copied into an output buffer. - * Callback function resumes the guest & the device model, - * returns to xc_domain_save. - * xc_domain_save then flushes the output buffer, while the - * guest continues to run. - */ - int (*postcopy)(void *data); - - /* - * Called after the memory checkpoint has been flushed - * out into the network. Typical actions performed in this - * callback include: - * (a) send the saved device model state (for HVM guests), - * (b) wait for checkpoint ack - * (c) release the network output buffer pertaining to the acked checkpoint. - * (c) sleep for the checkpoint interval. - * - * returns: - * 0: terminate checkpointing gracefully - * 1: take another checkpoint - */ - int (*checkpoint)(void *data); - - /* - * Called after the checkpoint callback. - * - * returns: - * 0: terminate checkpointing gracefully - * 1: take another checkpoint - */ - int (*wait_checkpoint)(void *data); - - /* Enable qemu-dm logging dirty pages to xen */ - int (*switch_qemu_logdirty)(uint32_t domid, unsigned enable, void *data); /* HVM only */ - - /* to be provided as the last argument to each callback function */ - void *data; -}; - -/* Type of stream. Plain, or using a continuous replication protocol? */ -typedef enum { - XC_STREAM_PLAIN, - XC_STREAM_REMUS, - XC_STREAM_COLO, -} xc_stream_type_t; - -/** - * This function will save a running domain. - * - * @param xch a handle to an open hypervisor interface - * @param io_fd the file descriptor to save a domain to - * @param dom the id of the domain - * @param flags XCFLAGS_xxx - * @param stream_type XC_STREAM_PLAIN if the far end of the stream - * doesn't use checkpointing - * @param recv_fd Only used for XC_STREAM_COLO. Contains backchannel from - * the destination side. - * @return 0 on success, -1 on failure - */ -int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, - uint32_t flags, struct save_callbacks *callbacks, - xc_stream_type_t stream_type, int recv_fd); - -/* callbacks provided by xc_domain_restore */ -struct restore_callbacks { - /* - * Called once the STATIC_DATA_END record has been received/inferred. - * - * For compatibility with older streams, provides a list of static data - * expected to be found in the stream, which was missing. A higher level - * toolstack is responsible for providing any necessary compatibiltiy. - */ -#define XGR_SDD_MISSING_CPUID (1 << 0) -#define XGR_SDD_MISSING_MSR (1 << 1) - int (*static_data_done)(unsigned int missing, void *data); - - /* Called after a new checkpoint to suspend the guest. */ - int (*suspend)(void *data); - - /* - * Called after the secondary vm is ready to resume. - * Callback function resumes the guest & the device model, - * returns to xc_domain_restore. - */ - int (*postcopy)(void *data); - - /* - * A checkpoint record has been found in the stream. - * returns: - */ -#define XGR_CHECKPOINT_ERROR 0 /* Terminate processing */ -#define XGR_CHECKPOINT_SUCCESS 1 /* Continue reading more data from the stream */ -#define XGR_CHECKPOINT_FAILOVER 2 /* Failover and resume VM */ - int (*checkpoint)(void *data); - - /* - * Called after the checkpoint callback. - * - * returns: - * 0: terminate checkpointing gracefully - * 1: take another checkpoint - */ - int (*wait_checkpoint)(void *data); - - /* - * callback to send store gfn and console gfn to xl - * if we want to resume vm before xc_domain_save() - * exits. 
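Tying the save-side pieces together, a hedged sketch of a plain (non-checkpointed) live save might look as follows. example_plain_save and the caller-supplied suspend/logdirty helpers are hypothetical, and only the callbacks such a stream plausibly needs are filled in.

/* Sketch: live-save domain 'domid' to 'fd' with XC_STREAM_PLAIN.
 * suspend_cb and logdirty_cb are hypothetical caller-provided helpers. */
static int example_plain_save(xc_interface *xch, int fd, uint32_t domid,
                              int (*suspend_cb)(void *),
                              int (*logdirty_cb)(uint32_t, unsigned, void *),
                              void *opaque)
{
    struct save_callbacks cb = {
        .suspend = suspend_cb,
        .precopy_policy = example_precopy_policy, /* sketch further up */
        .switch_qemu_logdirty = logdirty_cb,      /* HVM only */
        .data = opaque,
    };

    /* recv_fd is only used for XC_STREAM_COLO, so pass -1 here. */
    return xc_domain_save(xch, fd, domid, XCFLAGS_LIVE, &cb,
                          XC_STREAM_PLAIN, -1);
}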
- */ - void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn, - void *data); - - /* to be provided as the last argument to each callback function */ - void *data; -}; - -/** - * This function will restore a saved domain. - * - * Domain is restored in a suspended state ready to be unpaused. - * - * @param xch a handle to an open hypervisor interface - * @param io_fd the file descriptor to restore a domain from - * @param dom the id of the domain - * @param store_evtchn the xenstore event channel for this domain to use - * @param store_mfn filled with the gfn of the store page - * @param store_domid the backend domain for xenstore - * @param console_evtchn the console event channel for this domain to use - * @param console_mfn filled with the gfn of the console page - * @param console_domid the backend domain for xenconsole - * @param stream_type XC_STREAM_PLAIN if the far end of the stream is using - * checkpointing - * @param callbacks non-NULL to receive a callback to restore toolstack - * specific data - * @param send_back_fd Only used for XC_STREAM_COLO. Contains backchannel to - * the source side. - * @return 0 on success, -1 on failure - */ -int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom, - unsigned int store_evtchn, unsigned long *store_mfn, - uint32_t store_domid, unsigned int console_evtchn, - unsigned long *console_mfn, uint32_t console_domid, - xc_stream_type_t stream_type, - struct restore_callbacks *callbacks, int send_back_fd); - -/** - * This function will create a domain for a paravirtualized Linux - * using file names pointing to kernel and ramdisk - * - * @parm xch a handle to an open hypervisor interface - * @parm domid the id of the domain - * @parm mem_mb memory size in megabytes - * @parm image_name name of the kernel image file - * @parm ramdisk_name name of the ramdisk image file - * @parm cmdline command line string - * @parm flags domain creation flags - * @parm store_evtchn the store event channel for this domain to use - * @parm store_mfn returned with the mfn of the store page - * @parm console_evtchn the console event channel for this domain to use - * @parm conole_mfn returned with the mfn of the console page - * @return 0 on success, -1 on failure - */ -int xc_linux_build(xc_interface *xch, - uint32_t domid, - unsigned int mem_mb, - const char *image_name, - const char *ramdisk_name, - const char *cmdline, - const char *features, - unsigned long flags, - unsigned int store_evtchn, - unsigned long *store_mfn, - unsigned int console_evtchn, - unsigned long *console_mfn); - -/* - * Sets *lockfd to -1. - * Has deallocated everything even on error. - */ -int xc_suspend_evtchn_release(xc_interface *xch, - struct xenevtchn_handle *xce, - uint32_t domid, int suspend_evtchn, int *lockfd); - -/** - * This function eats the initial notification. - * xce must not be used for anything else - * See xc_suspend_evtchn_init_sane re lockfd. - */ -int xc_suspend_evtchn_init_exclusive(xc_interface *xch, - struct xenevtchn_handle *xce, - uint32_t domid, int port, int *lockfd); - -/* xce must not be used for anything else */ -int xc_await_suspend(xc_interface *xch, struct xenevtchn_handle *xce, - int suspend_evtchn); - -/** - * The port will be signaled immediately after this call - * The caller should check the domain status and look for the next event - * On success, *lockfd will be set to >=0 and *lockfd must be preserved - * and fed to xc_suspend_evtchn_release. (On error *lockfd is - * undefined and xc_suspend_evtchn_release is not allowed.) 
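The three suspend-event-channel helpers above are meant to be used as a unit; the following sketch (helper name invented, error handling compressed) shows the intended lifecycle.

/* Sketch: set up the suspend event channel, wait for the guest to
 * suspend itself, then release.  xce must not be used for anything else. */
static int example_await_guest_suspend(xc_interface *xch,
                                       struct xenevtchn_handle *xce,
                                       uint32_t domid, int port)
{
    int lockfd = -1, suspend_evtchn, rc;

    suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port,
                                                 &lockfd);
    if ( suspend_evtchn < 0 )
        return -1;            /* *lockfd undefined; release not allowed */

    rc = xc_await_suspend(xch, xce, suspend_evtchn);

    /* Deallocates everything and resets lockfd to -1, even on error. */
    xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, &lockfd);

    return rc;
}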
- */ -int xc_suspend_evtchn_init_sane(xc_interface *xch, - struct xenevtchn_handle *xce, - uint32_t domid, int port, int *lockfd); - -int xc_mark_page_online(xc_interface *xch, unsigned long start, - unsigned long end, uint32_t *status); - -int xc_mark_page_offline(xc_interface *xch, unsigned long start, - unsigned long end, uint32_t *status); - -int xc_query_page_offline_status(xc_interface *xch, unsigned long start, - unsigned long end, uint32_t *status); - -int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn); - - -/** - * Memory related information, such as PFN types, the P2M table, - * the guest word width and the guest page table levels. - */ -struct xc_domain_meminfo { - unsigned int pt_levels; - unsigned int guest_width; - xen_pfn_t *pfn_type; - xen_pfn_t *p2m_table; - unsigned long p2m_size; -}; - -int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid, - struct xc_domain_meminfo *minfo); - -int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *mem); - -/** - * This function map m2p table - * @parm xch a handle to an open hypervisor interface - * @parm max_mfn the max pfn - * @parm prot the flags to map, such as read/write etc - * @parm mfn0 return the first mfn, can be NULL - * @return mapped m2p table on success, NULL on failure - */ -xen_pfn_t *xc_map_m2p(xc_interface *xch, - unsigned long max_mfn, - int prot, - unsigned long *mfn0); -#endif /* XENGUEST_H */ diff --git a/tools/libxc/xg_cpuid_x86.c b/tools/libxc/xg_cpuid_x86.c deleted file mode 100644 index 0f24d6dd08..0000000000 --- a/tools/libxc/xg_cpuid_x86.c +++ /dev/null @@ -1,665 +0,0 @@ -/****************************************************************************** - * xc_cpuid_x86.c - * - * Compute cpuid of a domain. - * - * Copyright (c) 2008, Citrix Systems, Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . 
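As a usage sketch for the xc_map_domain_meminfo()/xc_unmap_domain_meminfo() declarations just above: the helper name is invented and <stdio.h> is assumed for the diagnostic.

/* Sketch: map a domain's memory metadata, report it, and unmap again. */
static void example_dump_meminfo(xc_interface *xch, uint32_t domid)
{
    struct xc_domain_meminfo minfo = {};

    if ( xc_map_domain_meminfo(xch, domid, &minfo) )
        return;

    /* minfo.p2m_size entries of pfn_type/p2m_table are now mapped. */
    fprintf(stderr, "d%u: %u PT levels, guest width %u, p2m size %lu\n",
            domid, minfo.pt_levels, minfo.guest_width, minfo.p2m_size);

    xc_unmap_domain_meminfo(xch, &minfo);
}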
- */ - -#include -#include -#include -#include "xc_private.h" -#include "xc_bitops.h" -#include -#include - -enum { -#define XEN_CPUFEATURE(name, value) X86_FEATURE_##name = value, -#include -}; - -#include - -#include - -#define bitmaskof(idx) (1u << ((idx) & 31)) -#define featureword_of(idx) ((idx) >> 5) - -int xc_get_cpu_levelling_caps(xc_interface *xch, uint32_t *caps) -{ - DECLARE_SYSCTL; - int ret; - - sysctl.cmd = XEN_SYSCTL_get_cpu_levelling_caps; - ret = do_sysctl(xch, &sysctl); - - if ( !ret ) - *caps = sysctl.u.cpu_levelling_caps.caps; - - return ret; -} - -int xc_get_cpu_featureset(xc_interface *xch, uint32_t index, - uint32_t *nr_features, uint32_t *featureset) -{ - DECLARE_SYSCTL; - DECLARE_HYPERCALL_BOUNCE(featureset, - *nr_features * sizeof(*featureset), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); - int ret; - - if ( xc_hypercall_bounce_pre(xch, featureset) ) - return -1; - - sysctl.cmd = XEN_SYSCTL_get_cpu_featureset; - sysctl.u.cpu_featureset.index = index; - sysctl.u.cpu_featureset.nr_features = *nr_features; - set_xen_guest_handle(sysctl.u.cpu_featureset.features, featureset); - - ret = do_sysctl(xch, &sysctl); - - xc_hypercall_bounce_post(xch, featureset); - - if ( !ret ) - *nr_features = sysctl.u.cpu_featureset.nr_features; - - return ret; -} - -uint32_t xc_get_cpu_featureset_size(void) -{ - return FEATURESET_NR_ENTRIES; -} - -const uint32_t *xc_get_static_cpu_featuremask( - enum xc_static_cpu_featuremask mask) -{ - static const uint32_t masks[][FEATURESET_NR_ENTRIES] = { -#define MASK(x) [XC_FEATUREMASK_ ## x] = INIT_ ## x ## _FEATURES - - MASK(KNOWN), - MASK(SPECIAL), - MASK(PV_MAX), - MASK(PV_DEF), - MASK(HVM_SHADOW_MAX), - MASK(HVM_SHADOW_DEF), - MASK(HVM_HAP_MAX), - MASK(HVM_HAP_DEF), - -#undef MASK - }; - - if ( (unsigned int)mask >= ARRAY_SIZE(masks) ) - return NULL; - - return masks[mask]; -} - -int xc_get_cpu_policy_size(xc_interface *xch, uint32_t *nr_leaves, - uint32_t *nr_msrs) -{ - struct xen_sysctl sysctl = {}; - int ret; - - sysctl.cmd = XEN_SYSCTL_get_cpu_policy; - - ret = do_sysctl(xch, &sysctl); - - if ( !ret ) - { - *nr_leaves = sysctl.u.cpu_policy.nr_leaves; - *nr_msrs = sysctl.u.cpu_policy.nr_msrs; - } - - return ret; -} - -int xc_get_system_cpu_policy(xc_interface *xch, uint32_t index, - uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves, - uint32_t *nr_msrs, xen_msr_entry_t *msrs) -{ - struct xen_sysctl sysctl = {}; - DECLARE_HYPERCALL_BOUNCE(leaves, - *nr_leaves * sizeof(*leaves), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); - DECLARE_HYPERCALL_BOUNCE(msrs, - *nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); - int ret; - - if ( xc_hypercall_bounce_pre(xch, leaves) || - xc_hypercall_bounce_pre(xch, msrs) ) - return -1; - - sysctl.cmd = XEN_SYSCTL_get_cpu_policy; - sysctl.u.cpu_policy.index = index; - sysctl.u.cpu_policy.nr_leaves = *nr_leaves; - set_xen_guest_handle(sysctl.u.cpu_policy.cpuid_policy, leaves); - sysctl.u.cpu_policy.nr_msrs = *nr_msrs; - set_xen_guest_handle(sysctl.u.cpu_policy.msr_policy, msrs); - - ret = do_sysctl(xch, &sysctl); - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); - - if ( !ret ) - { - *nr_leaves = sysctl.u.cpu_policy.nr_leaves; - *nr_msrs = sysctl.u.cpu_policy.nr_msrs; - } - - return ret; -} - -int xc_get_domain_cpu_policy(xc_interface *xch, uint32_t domid, - uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves, - uint32_t *nr_msrs, xen_msr_entry_t *msrs) -{ - DECLARE_DOMCTL; - DECLARE_HYPERCALL_BOUNCE(leaves, - *nr_leaves * sizeof(*leaves), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); - 
DECLARE_HYPERCALL_BOUNCE(msrs, - *nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_OUT); - int ret; - - if ( xc_hypercall_bounce_pre(xch, leaves) || - xc_hypercall_bounce_pre(xch, msrs) ) - return -1; - - domctl.cmd = XEN_DOMCTL_get_cpu_policy; - domctl.domain = domid; - domctl.u.cpu_policy.nr_leaves = *nr_leaves; - set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves); - domctl.u.cpu_policy.nr_msrs = *nr_msrs; - set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs); - - ret = do_domctl(xch, &domctl); - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); - - if ( !ret ) - { - *nr_leaves = domctl.u.cpu_policy.nr_leaves; - *nr_msrs = domctl.u.cpu_policy.nr_msrs; - } - - return ret; -} - -int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid, - uint32_t nr_leaves, xen_cpuid_leaf_t *leaves, - uint32_t nr_msrs, xen_msr_entry_t *msrs, - uint32_t *err_leaf_p, uint32_t *err_subleaf_p, - uint32_t *err_msr_p) -{ - DECLARE_DOMCTL; - DECLARE_HYPERCALL_BOUNCE(leaves, - nr_leaves * sizeof(*leaves), - XC_HYPERCALL_BUFFER_BOUNCE_IN); - DECLARE_HYPERCALL_BOUNCE(msrs, - nr_msrs * sizeof(*msrs), - XC_HYPERCALL_BUFFER_BOUNCE_IN); - int ret; - - if ( err_leaf_p ) - *err_leaf_p = -1; - if ( err_subleaf_p ) - *err_subleaf_p = -1; - if ( err_msr_p ) - *err_msr_p = -1; - - if ( xc_hypercall_bounce_pre(xch, leaves) ) - return -1; - - if ( xc_hypercall_bounce_pre(xch, msrs) ) - return -1; - - domctl.cmd = XEN_DOMCTL_set_cpu_policy; - domctl.domain = domid; - domctl.u.cpu_policy.nr_leaves = nr_leaves; - set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves); - domctl.u.cpu_policy.nr_msrs = nr_msrs; - set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs); - domctl.u.cpu_policy.err_leaf = -1; - domctl.u.cpu_policy.err_subleaf = -1; - domctl.u.cpu_policy.err_msr = -1; - - ret = do_domctl(xch, &domctl); - - xc_hypercall_bounce_post(xch, leaves); - xc_hypercall_bounce_post(xch, msrs); - - if ( err_leaf_p ) - *err_leaf_p = domctl.u.cpu_policy.err_leaf; - if ( err_subleaf_p ) - *err_subleaf_p = domctl.u.cpu_policy.err_subleaf; - if ( err_msr_p ) - *err_msr_p = domctl.u.cpu_policy.err_msr; - - return ret; -} - -static int compare_leaves(const void *l, const void *r) -{ - const xen_cpuid_leaf_t *lhs = l; - const xen_cpuid_leaf_t *rhs = r; - - if ( lhs->leaf != rhs->leaf ) - return lhs->leaf < rhs->leaf ? -1 : 1; - - if ( lhs->subleaf != rhs->subleaf ) - return lhs->subleaf < rhs->subleaf ? -1 : 1; - - return 0; -} - -static xen_cpuid_leaf_t *find_leaf( - xen_cpuid_leaf_t *leaves, unsigned int nr_leaves, - const struct xc_xend_cpuid *xend) -{ - const xen_cpuid_leaf_t key = { xend->leaf, xend->subleaf }; - - return bsearch(&key, leaves, nr_leaves, sizeof(*leaves), compare_leaves); -} - -static int xc_cpuid_xend_policy( - xc_interface *xch, uint32_t domid, const struct xc_xend_cpuid *xend) -{ - int rc; - xc_dominfo_t di; - unsigned int nr_leaves, nr_msrs; - uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1; - /* - * Three full policies. The host, domain max, and domain current for the - * domain type. 
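For orientation, the xend-style overrides consumed by the loop below are 32-character strings, one character per register bit, with string index 0 naming bit 31. A hypothetical table (struct layout as consumed by this file; the leaf and bit chosen are arbitrary) could look like:

/* Hypothetical override: force bit 5 of leaf 0x7, subleaf 0, EBX to 0
 * and leave every other bit at its domain-max ('x') setting. */
static const struct xc_xend_cpuid example_override[] = {
    {
        .leaf = 0x7, .subleaf = 0,
        .policy = {
            NULL,                                 /* eax: untouched */
            "xxxxxxxxxxxxxxxxxxxxxxxxxx0xxxxx",   /* ebx: bit 5 = 0 */
            NULL,                                 /* ecx: untouched */
            NULL,                                 /* edx: untouched */
        },
    },
    { .leaf = XEN_CPUID_INPUT_UNUSED },           /* list terminator */
};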
- */ - xen_cpuid_leaf_t *host = NULL, *max = NULL, *cur = NULL; - unsigned int nr_host, nr_max, nr_cur; - - if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 || - di.domid != domid ) - { - ERROR("Failed to obtain d%d info", domid); - rc = -ESRCH; - goto fail; - } - - rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs); - if ( rc ) - { - PERROR("Failed to obtain policy info size"); - rc = -errno; - goto fail; - } - - rc = -ENOMEM; - if ( (host = calloc(nr_leaves, sizeof(*host))) == NULL || - (max = calloc(nr_leaves, sizeof(*max))) == NULL || - (cur = calloc(nr_leaves, sizeof(*cur))) == NULL ) - { - ERROR("Unable to allocate memory for %u CPUID leaves", nr_leaves); - goto fail; - } - - /* Get the domain's current policy. */ - nr_msrs = 0; - nr_cur = nr_leaves; - rc = xc_get_domain_cpu_policy(xch, domid, &nr_cur, cur, &nr_msrs, NULL); - if ( rc ) - { - PERROR("Failed to obtain d%d current policy", domid); - rc = -errno; - goto fail; - } - - /* Get the domain's max policy. */ - nr_msrs = 0; - nr_max = nr_leaves; - rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_max - : XEN_SYSCTL_cpu_policy_pv_max, - &nr_max, max, &nr_msrs, NULL); - if ( rc ) - { - PERROR("Failed to obtain %s max policy", di.hvm ? "hvm" : "pv"); - rc = -errno; - goto fail; - } - - /* Get the host policy. */ - nr_msrs = 0; - nr_host = nr_leaves; - rc = xc_get_system_cpu_policy(xch, XEN_SYSCTL_cpu_policy_host, - &nr_host, host, &nr_msrs, NULL); - if ( rc ) - { - PERROR("Failed to obtain host policy"); - rc = -errno; - goto fail; - } - - rc = -EINVAL; - for ( ; xend->leaf != XEN_CPUID_INPUT_UNUSED; ++xend ) - { - xen_cpuid_leaf_t *cur_leaf = find_leaf(cur, nr_cur, xend); - const xen_cpuid_leaf_t *max_leaf = find_leaf(max, nr_max, xend); - const xen_cpuid_leaf_t *host_leaf = find_leaf(host, nr_host, xend); - - if ( cur_leaf == NULL || max_leaf == NULL || host_leaf == NULL ) - { - ERROR("Missing leaf %#x, subleaf %#x", xend->leaf, xend->subleaf); - goto fail; - } - - for ( unsigned int i = 0; i < ARRAY_SIZE(xend->policy); i++ ) - { - uint32_t *cur_reg = &cur_leaf->a + i; - const uint32_t *max_reg = &max_leaf->a + i; - const uint32_t *host_reg = &host_leaf->a + i; - - if ( xend->policy[i] == NULL ) - continue; - - for ( unsigned int j = 0; j < 32; j++ ) - { - bool val; - - if ( xend->policy[i][j] == '1' ) - val = true; - else if ( xend->policy[i][j] == '0' ) - val = false; - else if ( xend->policy[i][j] == 'x' ) - val = test_bit(31 - j, max_reg); - else if ( xend->policy[i][j] == 'k' || - xend->policy[i][j] == 's' ) - val = test_bit(31 - j, host_reg); - else - { - ERROR("Bad character '%c' in policy[%d] string '%s'", - xend->policy[i][j], i, xend->policy[i]); - goto fail; - } - - clear_bit(31 - j, cur_reg); - if ( val ) - set_bit(31 - j, cur_reg); - } - } - } - - /* Feed the transformed currrent policy back up to Xen. */ - rc = xc_set_domain_cpu_policy(xch, domid, nr_cur, cur, 0, NULL, - &err_leaf, &err_subleaf, &err_msr); - if ( rc ) - { - PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)", - domid, err_leaf, err_subleaf, err_msr); - rc = -errno; - goto fail; - } - - /* Success! 
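The query-size/allocate/fetch idiom above recurs for every policy read; distilled into a standalone sketch (helper name invented, <stdlib.h> assumed):

/* Sketch: fetch a domain's current CPUID leaves; MSRs are not wanted. */
static xen_cpuid_leaf_t *example_fetch_cur_leaves(xc_interface *xch,
                                                  uint32_t domid,
                                                  uint32_t *nr_out)
{
    uint32_t nr_leaves, nr_msrs;
    xen_cpuid_leaf_t *leaves;

    if ( xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs) )
        return NULL;

    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL )
        return NULL;

    nr_msrs = 0; /* a zero-sized, NULL MSR buffer is accepted */
    if ( xc_get_domain_cpu_policy(xch, domid, &nr_leaves, leaves,
                                  &nr_msrs, NULL) )
    {
        free(leaves);
        return NULL;
    }

    *nr_out = nr_leaves; /* adjusted downwards by the hypercall */
    return leaves;
}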
 */
-
- fail:
-    free(cur);
-    free(max);
-    free(host);
-
-    return rc;
-}
-
-int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid, bool restore,
-                          const uint32_t *featureset, unsigned int nr_features,
-                          bool pae,
-                          const struct xc_xend_cpuid *xend)
-{
-    int rc;
-    xc_dominfo_t di;
-    unsigned int i, nr_leaves, nr_msrs;
-    xen_cpuid_leaf_t *leaves = NULL;
-    struct cpuid_policy *p = NULL;
-    uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
-    uint32_t host_featureset[FEATURESET_NR_ENTRIES] = {};
-    uint32_t len = ARRAY_SIZE(host_featureset);
-
-    if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
-         di.domid != domid )
-    {
-        ERROR("Failed to obtain d%d info", domid);
-        rc = -ESRCH;
-        goto out;
-    }
-
-    rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
-    if ( rc )
-    {
-        PERROR("Failed to obtain policy info size");
-        rc = -errno;
-        goto out;
-    }
-
-    rc = -ENOMEM;
-    if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
-         (p = calloc(1, sizeof(*p))) == NULL )
-        goto out;
-
-    /* Get the host policy. */
-    rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
-                               &len, host_featureset);
-    if ( rc )
-    {
-        /* Tolerate "buffer too small", as we've got the bits we need. */
-        if ( errno == ENOBUFS )
-            rc = 0;
-        else
-        {
-            PERROR("Failed to obtain host featureset");
-            rc = -errno;
-            goto out;
-        }
-    }
-
-    /* Get the domain's default policy. */
-    nr_msrs = 0;
-    rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_default
-                                              : XEN_SYSCTL_cpu_policy_pv_default,
-                                  &nr_leaves, leaves, &nr_msrs, NULL);
-    if ( rc )
-    {
-        PERROR("Failed to obtain %s default policy", di.hvm ? "hvm" : "pv");
-        rc = -errno;
-        goto out;
-    }
-
-    rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
-                                    &err_leaf, &err_subleaf);
-    if ( rc )
-    {
-        ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
-              err_leaf, err_subleaf, -rc, strerror(-rc));
-        goto out;
-    }
-
-    /*
-     * Account for features which have been disabled by default since Xen 4.13,
-     * so migrated-in VMs don't risk seeing features disappearing.
-     */
-    if ( restore )
-    {
-        p->basic.rdrand = test_bit(X86_FEATURE_RDRAND, host_featureset);
-
-        if ( di.hvm )
-        {
-            p->feat.mpx = test_bit(X86_FEATURE_MPX, host_featureset);
-        }
-    }
-
-    if ( featureset )
-    {
-        uint32_t disabled_features[FEATURESET_NR_ENTRIES],
-            feat[FEATURESET_NR_ENTRIES] = {};
-        static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
-        unsigned int i, b;
-
-        /*
-         * The user supplied featureset may be shorter or longer than
-         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
-         * Longer is fine, so long as it is only padded with zeros.
-         */
-        unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
-
-        /* Check for truncated set bits. */
-        rc = -EOPNOTSUPP;
-        for ( i = user_len; i < nr_features; ++i )
-            if ( featureset[i] != 0 )
-                goto out;
-
-        memcpy(feat, featureset, sizeof(*featureset) * user_len);
-
-        /* Disable deep dependencies of disabled features. */
-        for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
-            disabled_features[i] = ~feat[i] & deep_features[i];
-
-        for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
-        {
-            const uint32_t *dfs;
-
-            if ( !test_bit(b, disabled_features) ||
-                 !(dfs = x86_cpuid_lookup_deep_deps(b)) )
-                continue;
-
-            for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
-            {
-                feat[i] &= ~dfs[i];
-                disabled_features[i] &= ~dfs[i];
-            }
-        }
-
-        cpuid_featureset_to_policy(feat, p);
-    }
-    else
-    {
-        if ( di.hvm )
-            p->basic.pae = pae;
-    }
-
-    if ( !di.hvm )
-    {
-        /*
-         * On hardware without CPUID Faulting, PV guests see real topology.
-         * As a consequence, they also need to see the host htt/cmp fields.
-         */
-        p->basic.htt = test_bit(X86_FEATURE_HTT, host_featureset);
-        p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
-    }
-    else
-    {
-        /*
-         * Topology for HVM guests is entirely controlled by Xen.  For now, we
-         * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
-         */
-        p->basic.htt = true;
-        p->extd.cmp_legacy = false;
-
-        /*
-         * Leaf 1 EBX[23:16] is Maximum Logical Processors Per Package.
-         * Update to reflect vLAPIC_ID = vCPU_ID * 2, but make sure to avoid
-         * overflow.
-         */
-        if ( !(p->basic.lppp & 0x80) )
-            p->basic.lppp *= 2;
-
-        switch ( p->x86_vendor )
-        {
-        case X86_VENDOR_INTEL:
-            for ( i = 0; (p->cache.subleaf[i].type &&
-                          i < ARRAY_SIZE(p->cache.raw)); ++i )
-            {
-                p->cache.subleaf[i].cores_per_package =
-                    (p->cache.subleaf[i].cores_per_package << 1) | 1;
-                p->cache.subleaf[i].threads_per_cache = 0;
-            }
-            break;
-
-        case X86_VENDOR_AMD:
-        case X86_VENDOR_HYGON:
-            /*
-             * Leaf 0x80000008 ECX[15:12] is ApicIdCoreSize.
-             * Leaf 0x80000008 ECX[7:0] is NumberOfCores (minus one).
-             * Update to reflect vLAPIC_ID = vCPU_ID * 2.  But avoid
-             *  - overflow,
-             *  - going out of sync with leaf 1 EBX[23:16],
-             *  - incrementing ApicIdCoreSize when it's zero (which changes the
-             *    meaning of bits 7:0).
-             *
-             * UPDATE: In addition to avoiding overflow, some
-             * proprietary operating systems have trouble with
-             * apic_id_size values greater than 7.  Limit the value to
-             * 7 for now.
-             */
-            if ( p->extd.nc < 0x7f )
-            {
-                if ( p->extd.apic_id_size != 0 && p->extd.apic_id_size < 0x7 )
-                    p->extd.apic_id_size++;
-
-                p->extd.nc = (p->extd.nc << 1) | 1;
-            }
-            break;
-        }
-
-        /*
-         * These settings are necessary to cause earlier HVM_PARAM_NESTEDHVM /
-         * XEN_DOMCTL_disable_migrate settings to be reflected correctly in
-         * CPUID.  Xen will discard these bits if configuration hasn't been
-         * set for the domain.
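The saturating doubling above is easy to misread; a minimal mirror of it, with sample values in the comment, purely as illustration:

/* e.g. lppp 4 -> 8 (APIC IDs 0,2,4,6); 0x90 stays 0x90 as bit 7 is set. */
static inline unsigned int example_double_lppp(unsigned int lppp)
{
    return (lppp & 0x80) ? lppp : lppp * 2;
}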
- */ - p->extd.itsc = true; - p->basic.vmx = true; - p->extd.svm = true; - } - - rc = x86_cpuid_copy_to_buffer(p, leaves, &nr_leaves); - if ( rc ) - { - ERROR("Failed to serialise CPUID (%d = %s)", -rc, strerror(-rc)); - goto out; - } - - rc = xc_set_domain_cpu_policy(xch, domid, nr_leaves, leaves, 0, NULL, - &err_leaf, &err_subleaf, &err_msr); - if ( rc ) - { - PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)", - domid, err_leaf, err_subleaf, err_msr); - rc = -errno; - goto out; - } - - if ( xend && (rc = xc_cpuid_xend_policy(xch, domid, xend)) ) - goto out; - - rc = 0; - -out: - free(p); - free(leaves); - - return rc; -} diff --git a/tools/libxc/xg_dom_arm.c b/tools/libxc/xg_dom_arm.c deleted file mode 100644 index 3f66f1d890..0000000000 --- a/tools/libxc/xg_dom_arm.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * Xen domain builder -- ARM - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * Copyright (c) 2011, Citrix Systems - */ -#include -#include - -#include -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" - -#define NR_MAGIC_PAGES 4 -#define CONSOLE_PFN_OFFSET 0 -#define XENSTORE_PFN_OFFSET 1 -#define MEMACCESS_PFN_OFFSET 2 -#define VUART_PFN_OFFSET 3 - -#define LPAE_SHIFT 9 - -#define PFN_4K_SHIFT (0) -#define PFN_2M_SHIFT (PFN_4K_SHIFT+LPAE_SHIFT) -#define PFN_1G_SHIFT (PFN_2M_SHIFT+LPAE_SHIFT) -#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT) - -/* get guest IO ABI protocol */ -const char *xc_domain_get_native_protocol(xc_interface *xch, - uint32_t domid) -{ - return XEN_IO_PROTO_ABI_ARM; -} - -/* ------------------------------------------------------------------------ */ - -static int alloc_magic_pages(struct xc_dom_image *dom) -{ - int rc, i; - const xen_pfn_t base = GUEST_MAGIC_BASE >> XC_PAGE_SHIFT; - xen_pfn_t p2m[NR_MAGIC_PAGES]; - - BUILD_BUG_ON(NR_MAGIC_PAGES > GUEST_MAGIC_SIZE >> XC_PAGE_SHIFT); - - DOMPRINTF_CALLED(dom->xch); - - for (i = 0; i < NR_MAGIC_PAGES; i++) - p2m[i] = base + i; - - rc = xc_domain_populate_physmap_exact( - dom->xch, dom->guest_domid, NR_MAGIC_PAGES, - 0, 0, p2m); - if ( rc < 0 ) - return rc; - - dom->console_pfn = base + CONSOLE_PFN_OFFSET; - dom->xenstore_pfn = base + XENSTORE_PFN_OFFSET; - dom->vuart_gfn = base + VUART_PFN_OFFSET; - - xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn); - xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn); - xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET); - xc_clear_domain_page(dom->xch, dom->guest_domid, dom->vuart_gfn); - - xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN, - dom->console_pfn); - xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN, - dom->xenstore_pfn); - xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MONITOR_RING_PFN, - base + MEMACCESS_PFN_OFFSET); - /* allocated by toolstack */ - xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_EVTCHN, - dom->console_evtchn); - 
xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN, - dom->xenstore_evtchn); - - return 0; -} - -/* ------------------------------------------------------------------------ */ - -static int start_info_arm(struct xc_dom_image *dom) -{ - DOMPRINTF_CALLED(dom->xch); - return 0; -} - -static int shared_info_arm(struct xc_dom_image *dom, void *ptr) -{ - DOMPRINTF_CALLED(dom->xch); - return 0; -} - -/* ------------------------------------------------------------------------ */ - -static int vcpu_arm32(struct xc_dom_image *dom) -{ - vcpu_guest_context_any_t any_ctx; - vcpu_guest_context_t *ctxt = &any_ctx.c; - int rc; - - DOMPRINTF_CALLED(dom->xch); - - /* clear everything */ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->user_regs.pc32 = dom->parms.virt_entry; - - /* Linux boot protocol. See linux.Documentation/arm/Booting. */ - ctxt->user_regs.r0_usr = 0; /* SBZ */ - /* Machine ID: We use DTB therefore no machine id */ - ctxt->user_regs.r1_usr = 0xffffffff; - /* ATAGS/DTB: We currently require that the guest kernel to be - * using CONFIG_ARM_APPENDED_DTB. Ensure that r2 does not look - * like a valid pointer to a set of ATAGS or a DTB. - */ - ctxt->user_regs.r2_usr = dom->devicetree_blob ? - dom->devicetree_seg.vstart : 0xffffffff; - - ctxt->sctlr = SCTLR_GUEST_INIT; - - ctxt->ttbr0 = 0; - ctxt->ttbr1 = 0; - ctxt->ttbcr = 0; /* Defined Reset Value */ - - ctxt->user_regs.cpsr = PSR_GUEST32_INIT; - - ctxt->flags = VGCF_online; - - DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx32, - ctxt->user_regs.cpsr, ctxt->user_regs.pc32); - - rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); - - return rc; -} - -static int vcpu_arm64(struct xc_dom_image *dom) -{ - vcpu_guest_context_any_t any_ctx; - vcpu_guest_context_t *ctxt = &any_ctx.c; - int rc; - - DOMPRINTF_CALLED(dom->xch); - /* clear everything */ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->user_regs.pc64 = dom->parms.virt_entry; - - /* Linux boot protocol. See linux.Documentation/arm64/booting.txt. */ - ctxt->user_regs.x0 = dom->devicetree_blob ? 
- dom->devicetree_seg.vstart : 0xffffffff; - ctxt->user_regs.x1 = 0; - ctxt->user_regs.x2 = 0; - ctxt->user_regs.x3 = 0; - - DOMPRINTF("DTB %"PRIx64, ctxt->user_regs.x0); - - ctxt->sctlr = SCTLR_GUEST_INIT; - - ctxt->ttbr0 = 0; - ctxt->ttbr1 = 0; - ctxt->ttbcr = 0; /* Defined Reset Value */ - - ctxt->user_regs.cpsr = PSR_GUEST64_INIT; - - ctxt->flags = VGCF_online; - - DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx64, - ctxt->user_regs.cpsr, ctxt->user_regs.pc64); - - rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); - - return rc; -} - -/* ------------------------------------------------------------------------ */ - -static int set_mode(xc_interface *xch, uint32_t domid, char *guest_type) -{ - static const struct { - char *guest; - uint32_t size; - } types[] = { - { "xen-3.0-aarch64", 64 }, - { "xen-3.0-armv7l", 32 }, - }; - DECLARE_DOMCTL; - int i,rc; - - domctl.domain = domid; - domctl.cmd = XEN_DOMCTL_set_address_size; - domctl.u.address_size.size = 0; - - for ( i = 0; i < ARRAY_SIZE(types); i++ ) - if ( !strcmp(types[i].guest, guest_type) ) - domctl.u.address_size.size = types[i].size; - if ( domctl.u.address_size.size == 0 ) - { - xc_dom_printf(xch, "%s: warning: unknown guest type %s", - __FUNCTION__, guest_type); - return -EINVAL; - } - - xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__, - guest_type, domctl.u.address_size.size); - rc = do_domctl(xch, &domctl); - if ( rc != 0 ) - xc_dom_printf(xch, "%s: warning: failed (rc=%d)", - __FUNCTION__, rc); - return rc; -} - -/* >0: success, *nr_pfns set to number actually populated - * 0: didn't try with this pfn shift (e.g. misaligned base etc) - * <0: ERROR - */ -static int populate_one_size(struct xc_dom_image *dom, int pfn_shift, - xen_pfn_t base_pfn, xen_pfn_t *nr_pfns, - xen_pfn_t *extents) -{ - /* The mask for this level */ - const uint64_t mask = ((uint64_t)1<<(pfn_shift))-1; - /* The shift, mask and next boundary for the level above this one */ - const int next_shift = pfn_shift + LPAE_SHIFT; - const uint64_t next_mask = ((uint64_t)1< next_boundary ) - end_pfn = next_boundary; - - count = ( end_pfn - base_pfn ) >> pfn_shift; - - /* Nothing to allocate */ - if ( !count ) - return 0; - - for ( i = 0 ; i < count ; i ++ ) - extents[i] = base_pfn + (i<xch, dom->guest_domid, count, - pfn_shift, 0, extents); - if ( nr <= 0 ) return nr; - DOMPRINTF("%s: populated %#x/%#x entries with shift %d", - __FUNCTION__, nr, count, pfn_shift); - - *nr_pfns = nr << pfn_shift; - - return 1; -} - -static int populate_guest_memory(struct xc_dom_image *dom, - xen_pfn_t base_pfn, xen_pfn_t nr_pfns) -{ - int rc = 0; - xen_pfn_t allocsz, pfn, *extents; - - extents = calloc(1024*1024,sizeof(xen_pfn_t)); - if ( extents == NULL ) - { - DOMPRINTF("%s: Unable to allocate extent array", __FUNCTION__); - return -1; - } - - DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)", - __FUNCTION__, - (uint64_t)base_pfn << XC_PAGE_SHIFT, - (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT, - (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT)); - - for ( pfn = 0; pfn < nr_pfns; pfn += allocsz ) - { - allocsz = min_t(int, 1024*1024, nr_pfns - pfn); -#if 0 /* Enable this to exercise/debug the code which tries to realign - * to a superpage boundary, by misaligning at the start. 
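A simplified mirror of the level arithmetic above may help: the sketch below only handles the aligned-base case and, unlike populate_one_size(), does not clip the count at the next level's boundary.

/* Sketch: whole naturally-aligned extents of 2^pfn_shift pages that fit
 * in [base_pfn, base_pfn + nr_pfns) when base_pfn is already aligned. */
static xen_pfn_t example_count_extents(xen_pfn_t base_pfn,
                                       xen_pfn_t nr_pfns, int pfn_shift)
{
    const uint64_t mask = ((uint64_t)1 << pfn_shift) - 1;

    if ( base_pfn & mask )   /* misaligned base: skip this level */
        return 0;

    return nr_pfns >> pfn_shift;
}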
*/ - if ( pfn == 0 ) - { - allocsz = 1; - rc = populate_one_size(dom, PFN_4K_SHIFT, - base_pfn + pfn, &allocsz, extents); - if (rc < 0) break; - if (rc > 0) continue; - /* Failed to allocate a single page? */ - break; - } -#endif - - rc = populate_one_size(dom, PFN_512G_SHIFT, - base_pfn + pfn, &allocsz, extents); - if ( rc < 0 ) break; - if ( rc > 0 ) continue; - - rc = populate_one_size(dom, PFN_1G_SHIFT, - base_pfn + pfn, &allocsz, extents); - if ( rc < 0 ) break; - if ( rc > 0 ) continue; - - rc = populate_one_size(dom, PFN_2M_SHIFT, - base_pfn + pfn, &allocsz, extents); - if ( rc < 0 ) break; - if ( rc > 0 ) continue; - - rc = populate_one_size(dom, PFN_4K_SHIFT, - base_pfn + pfn, &allocsz, extents); - if ( rc < 0 ) break; - if ( rc == 0 ) - { - DOMPRINTF("%s: Not enough RAM", __FUNCTION__); - errno = ENOMEM; - rc = -1; - goto out; - } - } - -out: - free(extents); - return rc < 0 ? rc : 0; -} - -static int meminit(struct xc_dom_image *dom) -{ - int i, rc; - uint64_t modbase; - - uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT; - - const uint64_t bankbase[] = GUEST_RAM_BANK_BASES; - const uint64_t bankmax[] = GUEST_RAM_BANK_SIZES; - - /* Convenient */ - const uint64_t kernbase = dom->kernel_seg.vstart; - const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/); - const uint64_t kernsize = kernend - kernbase; - const uint64_t dtb_size = dom->devicetree_blob ? - ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0; - const uint64_t ramdisk_size = dom->modules[0].blob ? - ROUNDUP(dom->modules[0].size, XC_PAGE_SHIFT) : 0; - const uint64_t modsize = dtb_size + ramdisk_size; - const uint64_t ram128mb = bankbase[0] + (128<<20); - - xen_pfn_t p2m_size; - uint64_t bank0end; - - assert(dom->rambase_pfn << XC_PAGE_SHIFT == bankbase[0]); - - if ( modsize + kernsize > bankmax[0] ) - { - DOMPRINTF("%s: Not enough memory for the kernel+dtb+initrd", - __FUNCTION__); - return -1; - } - - if ( ramsize == 0 ) - { - DOMPRINTF("%s: ram size is 0", __FUNCTION__); - return -1; - } - - if ( ramsize > GUEST_RAM_MAX ) - { - DOMPRINTF("%s: ram size is too large for guest address space: " - "%"PRIx64" > %llx", - __FUNCTION__, ramsize, GUEST_RAM_MAX); - return -1; - } - - rc = set_mode(dom->xch, dom->guest_domid, dom->guest_type); - if ( rc ) - return rc; - - for ( i = 0; ramsize && i < GUEST_RAM_BANKS; i++ ) - { - uint64_t banksize = ramsize > bankmax[i] ? bankmax[i] : ramsize; - - ramsize -= banksize; - - p2m_size = ( bankbase[i] + banksize - bankbase[0] ) >> XC_PAGE_SHIFT; - - dom->rambank_size[i] = banksize >> XC_PAGE_SHIFT; - } - - assert(dom->rambank_size[0] != 0); - assert(ramsize == 0); /* Too much RAM is rejected above */ - - dom->p2m_size = p2m_size; - - /* setup initial p2m and allocate guest memory */ - for ( i = 0; i < GUEST_RAM_BANKS && dom->rambank_size[i]; i++ ) - { - if ((rc = populate_guest_memory(dom, - bankbase[i] >> XC_PAGE_SHIFT, - dom->rambank_size[i]))) - return rc; - } - - /* - * We try to place dtb+initrd at 128MB or if we have less RAM - * as high as possible. If there is no space then fallback to - * just before the kernel. - * - * If changing this then consider - * xen/arch/arm/kernel.c:place_modules as well. 
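The placement policy described above reduces to a three-way fallback; a standalone sketch (name invented, 0 returned on failure) mirroring the code that follows:

/* Sketch: pick a base address for dtb+initrd.  All values in bytes. */
static uint64_t example_pick_modbase(uint64_t bank0, uint64_t bank0end,
                                     uint64_t kernbase, uint64_t kernend,
                                     uint64_t modsize)
{
    const uint64_t ram128mb = bank0 + (128 << 20);

    if ( bank0end >= ram128mb + modsize && kernend < ram128mb )
        return ram128mb;             /* preferred: the 128MB mark */
    if ( bank0end - modsize > kernend )
        return bank0end - modsize;   /* else: as high as possible */
    if ( kernbase - bank0 > modsize )
        return kernbase - modsize;   /* else: just below the kernel */
    return 0;                        /* no room at all */
}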
- */ - bank0end = bankbase[0] + ((uint64_t)dom->rambank_size[0] << XC_PAGE_SHIFT); - - if ( bank0end >= ram128mb + modsize && kernend < ram128mb ) - modbase = ram128mb; - else if ( bank0end - modsize > kernend ) - modbase = bank0end - modsize; - else if (kernbase - bankbase[0] > modsize ) - modbase = kernbase - modsize; - else - return -1; - - DOMPRINTF("%s: placing boot modules at 0x%" PRIx64, __FUNCTION__, modbase); - - /* - * Must map DTB *after* initrd, to satisfy order of calls to - * xc_dom_alloc_segment in xc_dom_build_image, which must map - * things at monotonolically increasing addresses. - */ - if ( ramdisk_size ) - { - dom->modules[0].seg.vstart = modbase; - dom->modules[0].seg.vend = modbase + ramdisk_size; - - DOMPRINTF("%s: ramdisk: 0x%" PRIx64 " -> 0x%" PRIx64 "", - __FUNCTION__, - dom->modules[0].seg.vstart, dom->modules[0].seg.vend); - - modbase += ramdisk_size; - } - - if ( dtb_size ) - { - dom->devicetree_seg.vstart = modbase; - dom->devicetree_seg.vend = modbase + dtb_size; - - DOMPRINTF("%s: devicetree: 0x%" PRIx64 " -> 0x%" PRIx64 "", - __FUNCTION__, - dom->devicetree_seg.vstart, dom->devicetree_seg.vend); - - modbase += dtb_size; - } - - return 0; -} - -bool xc_dom_translated(const struct xc_dom_image *dom) -{ - return true; -} - -/* ------------------------------------------------------------------------ */ - -static int bootearly(struct xc_dom_image *dom) -{ - DOMPRINTF("%s: doing nothing", __FUNCTION__); - return 0; -} - -static int bootlate(struct xc_dom_image *dom) -{ - /* XXX - * map shared info - * map grant tables - * setup shared info - */ - return 0; -} - -/* ------------------------------------------------------------------------ */ - -static struct xc_dom_arch xc_dom_32 = { - .guest_type = "xen-3.0-armv7l", - .native_protocol = XEN_IO_PROTO_ABI_ARM, - .page_shift = PAGE_SHIFT_ARM, - .sizeof_pfn = 8, - .alloc_magic_pages = alloc_magic_pages, - .start_info = start_info_arm, - .shared_info = shared_info_arm, - .vcpu = vcpu_arm32, - .meminit = meminit, - .bootearly = bootearly, - .bootlate = bootlate, -}; - -static struct xc_dom_arch xc_dom_64 = { - .guest_type = "xen-3.0-aarch64", - .native_protocol = XEN_IO_PROTO_ABI_ARM, - .page_shift = PAGE_SHIFT_ARM, - .sizeof_pfn = 8, - .alloc_magic_pages = alloc_magic_pages, - .start_info = start_info_arm, - .shared_info = shared_info_arm, - .vcpu = vcpu_arm64, - .meminit = meminit, - .bootearly = bootearly, - .bootlate = bootlate, -}; - -static void __init register_arch_hooks(void) -{ - xc_dom_register_arch_hooks(&xc_dom_32); - xc_dom_register_arch_hooks(&xc_dom_64); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_armzimageloader.c b/tools/libxc/xg_dom_armzimageloader.c deleted file mode 100644 index 4246c8e5fa..0000000000 --- a/tools/libxc/xg_dom_armzimageloader.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Xen domain builder -- ARM zImage bits - * - * Parse and load ARM zImage kernel images. - * - * Copyright (C) 2012, Citrix Systems. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - */ - -#include -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" - -#include /* XXX ntohl is not the right function... */ - -struct minimal_dtb_header { - uint32_t magic; - uint32_t total_size; - /* There are other fields but we don't use them yet. */ -}; - -#define DTB_MAGIC 0xd00dfeed - -/* ------------------------------------------------------------ */ -/* 32-bit zImage Support */ -/* ------------------------------------------------------------ */ - -#define ZIMAGE32_MAGIC_OFFSET 0x24 -#define ZIMAGE32_START_OFFSET 0x28 -#define ZIMAGE32_END_OFFSET 0x2c - -#define ZIMAGE32_MAGIC 0x016f2818 - -static int xc_dom_probe_zimage32_kernel(struct xc_dom_image *dom) -{ - uint32_t *zimage; - - if ( dom->kernel_blob == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: no kernel image loaded", __FUNCTION__); - return -EINVAL; - } - - if ( dom->kernel_size < 0x30 /*sizeof(struct setup_header)*/ ) - { - xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); - return -EINVAL; - } - - zimage = (uint32_t *)dom->kernel_blob; - if ( zimage[ZIMAGE32_MAGIC_OFFSET/4] != ZIMAGE32_MAGIC ) - { - xc_dom_printf(dom->xch, "%s: kernel is not an arm32 zImage", __FUNCTION__); - return -EINVAL; - } - - return 0; -} - -static int xc_dom_parse_zimage32_kernel(struct xc_dom_image *dom) -{ - uint32_t *zimage; - uint32_t start, entry_addr; - uint64_t v_start, v_end; - uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT; - - DOMPRINTF_CALLED(dom->xch); - - zimage = (uint32_t *)dom->kernel_blob; - - /* Do not load kernel at the very first RAM address */ - v_start = rambase + 0x8000; - - if ( dom->kernel_size > UINT64_MAX - v_start ) - { - DOMPRINTF("%s: kernel is too large\n", __FUNCTION__); - return -EINVAL; - } - - v_end = v_start + dom->kernel_size; - - /* - * If start is invalid then the guest will start at some invalid - * address and crash, but this happens in guest context so doesn't - * concern us here. 
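The arm32 probe above amounts to checking one little-endian word at byte offset 0x24; a self-contained equivalent (name invented, <string.h> assumed) is:

/* Sketch: does 'blob' look like an arm32 zImage? */
static int example_is_zimage32(const void *blob, size_t size)
{
    uint32_t magic;

    if ( size < 0x30 )   /* same minimum size the probe enforces */
        return 0;

    /* memcpy avoids alignment assumptions about 'blob'. */
    memcpy(&magic, (const char *)blob + ZIMAGE32_MAGIC_OFFSET,
           sizeof(magic));

    return magic == ZIMAGE32_MAGIC;   /* 0x016f2818 */
}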
- */ - start = zimage[ZIMAGE32_START_OFFSET/4]; - - if (start == 0) - entry_addr = v_start; - else - entry_addr = start; - - /* find kernel segment */ - dom->kernel_seg.vstart = v_start; - dom->kernel_seg.vend = v_end; - - dom->parms.virt_entry = entry_addr; - dom->parms.virt_base = rambase; - - dom->guest_type = "xen-3.0-armv7l"; - DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", - __FUNCTION__, dom->guest_type, - dom->kernel_seg.vstart, dom->kernel_seg.vend); - return 0; -} - -/* ------------------------------------------------------------ */ -/* 64-bit zImage Support */ -/* ------------------------------------------------------------ */ - -#define ZIMAGE64_MAGIC_V0 0x14000008 -#define ZIMAGE64_MAGIC_V1 0x644d5241 /* "ARM\x64" */ - -/* linux/Documentation/arm64/booting.txt */ -struct zimage64_hdr { - uint32_t magic0; - uint32_t res0; - uint64_t text_offset; /* Image load offset */ - uint64_t res1; - uint64_t res2; - /* zImage V1 only from here */ - uint64_t res3; - uint64_t res4; - uint64_t res5; - uint32_t magic1; - uint32_t res6; -}; -static int xc_dom_probe_zimage64_kernel(struct xc_dom_image *dom) -{ - struct zimage64_hdr *zimage; - - if ( dom->kernel_blob == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: no kernel image loaded", __FUNCTION__); - return -EINVAL; - } - - if ( dom->kernel_size < sizeof(*zimage) ) - { - xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); - return -EINVAL; - } - - zimage = dom->kernel_blob; - if ( zimage->magic0 != ZIMAGE64_MAGIC_V0 && - zimage->magic1 != ZIMAGE64_MAGIC_V1 ) - { - xc_dom_printf(dom->xch, "%s: kernel is not an arm64 Image", __FUNCTION__); - return -EINVAL; - } - - return 0; -} - -static int xc_dom_parse_zimage64_kernel(struct xc_dom_image *dom) -{ - struct zimage64_hdr *zimage; - uint64_t v_start, v_end; - uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT; - - DOMPRINTF_CALLED(dom->xch); - - zimage = dom->kernel_blob; - - if ( zimage->text_offset > UINT64_MAX - rambase ) - { - DOMPRINTF("%s: kernel text offset is too large\n", __FUNCTION__); - return -EINVAL; - } - - v_start = rambase + zimage->text_offset; - - if ( dom->kernel_size > UINT64_MAX - v_start ) - { - DOMPRINTF("%s: kernel is too large\n", __FUNCTION__); - return -EINVAL; - } - - v_end = v_start + dom->kernel_size; - - dom->kernel_seg.vstart = v_start; - dom->kernel_seg.vend = v_end; - - /* Call the kernel at offset 0 */ - dom->parms.virt_entry = v_start; - dom->parms.virt_base = rambase; - - dom->guest_type = "xen-3.0-aarch64"; - DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", - __FUNCTION__, dom->guest_type, - dom->kernel_seg.vstart, dom->kernel_seg.vend); - - return 0; -} - -/* ------------------------------------------------------------ */ -/* Common zImage Support */ -/* ------------------------------------------------------------ */ - -static int xc_dom_load_zimage_kernel(struct xc_dom_image *dom) -{ - void *dst; - - DOMPRINTF_CALLED(dom->xch); - - dst = xc_dom_seg_to_ptr(dom, &dom->kernel_seg); - if ( dst == NULL ) - { - DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->kernel_seg) => NULL", - __func__); - return -1; - } - - DOMPRINTF("%s: kernel seg %#"PRIx64"-%#"PRIx64, - __func__, dom->kernel_seg.vstart, dom->kernel_seg.vend); - DOMPRINTF("%s: copy %zd bytes from blob %p to dst %p", - __func__, dom->kernel_size, dom->kernel_blob, dst); - - memcpy(dst, dom->kernel_blob, dom->kernel_size); - - return 0; -} - -static struct xc_dom_loader zimage32_loader = { - .name = "Linux zImage (ARM32)", - .probe = xc_dom_probe_zimage32_kernel, - 
.parser = xc_dom_parse_zimage32_kernel, - .loader = xc_dom_load_zimage_kernel, -}; - -static struct xc_dom_loader zimage64_loader = { - .name = "Linux zImage (ARM64)", - .probe = xc_dom_probe_zimage64_kernel, - .parser = xc_dom_parse_zimage64_kernel, - .loader = xc_dom_load_zimage_kernel, -}; - -static void __init register_loader(void) -{ - xc_dom_register_loader(&zimage32_loader); - xc_dom_register_loader(&zimage64_loader); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_binloader.c b/tools/libxc/xg_dom_binloader.c deleted file mode 100644 index 870a921427..0000000000 --- a/tools/libxc/xg_dom_binloader.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * Some of the field descriptions were copied from "The Multiboot - * Specification", Copyright 1995, 96 Bryan Ford , - * Erich Stefan Boleyn Copyright 1999, 2000, 2001, 2002 - * Free Software Foundation, Inc. - */ - -/****************************************************************************** - * - * Loads simple binary images. It's like a .COM file in MS-DOS. No headers are - * present. The only requirement is that it must have a xen_bin_image table - * somewhere in the first 8192 bytes, starting on a 32-bit aligned address. - * Those familiar with the multiboot specification should recognize this, it's - * (almost) the same as the multiboot header. - * The layout of the xen_bin_image table is: - * - * Offset Type Name Note - * 0 uint32_t magic required - * 4 uint32_t flags required - * 8 uint32_t checksum required - * 12 uint32_t header_addr required - * 16 uint32_t load_addr required - * 20 uint32_t load_end_addr required - * 24 uint32_t bss_end_addr required - * 28 uint32_t entry_addr required - * - * - magic - * Magic number identifying the table. For images to be loaded by Xen 3, the - * magic value is 0x336ec578 ("xEn3" with the 0x80 bit of the "E" set). - * - flags - * bit 0: indicates whether the image needs to be loaded on a page boundary - * bit 1: reserved, must be 0 (the multiboot spec uses this bit to indicate - * that memory info should be passed to the image) - * bit 2: reserved, must be 0 (the multiboot spec uses this bit to indicate - * that the bootloader should pass video mode info to the image) - * bit 16: reserved, must be 1 (the multiboot spec uses this bit to indicate - * that the values in the fields header_addr - entry_addr are - * valid) - * All other bits should be set to 0. - * - checksum - * When added to "magic" and "flags", the resulting value should be 0. - * - header_addr - * Contains the virtual address corresponding to the beginning of the - * table - the memory location at which the magic value is supposed to be - * loaded. This field serves to synchronize the mapping between OS image - * offsets and virtual memory addresses. 
- * - load_addr - * Contains the virtual address of the beginning of the text segment. The - * offset in the OS image file at which to start loading is defined by the - * offset at which the table was found, minus (header addr - load addr). - * load addr must be less than or equal to header addr. - * - load_end_addr - * Contains the virtual address of the end of the data segment. - * (load_end_addr - load_addr) specifies how much data to load. This implies - * that the text and data segments must be consecutive in the OS image. If - * this field is zero, the domain builder assumes that the text and data - * segments occupy the whole OS image file. - * - bss_end_addr - * Contains the virtual address of the end of the bss segment. The domain - * builder initializes this area to zero, and reserves the memory it occupies - * to avoid placing boot modules and other data relevant to the loaded image - * in that area. If this field is zero, the domain builder assumes that no bss - * segment is present. - * - entry_addr - * The virtual address at which to start execution of the loaded image. - * - */ - -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" - -#define round_pgup(_p) (((_p)+(PAGE_SIZE_X86-1))&PAGE_MASK_X86) -#define round_pgdown(_p) ((_p)&PAGE_MASK_X86) - -struct xen_bin_image_table -{ - uint32_t magic; - uint32_t flags; - uint32_t checksum; - uint32_t header_addr; - uint32_t load_addr; - uint32_t load_end_addr; - uint32_t bss_end_addr; - uint32_t entry_addr; -}; - -#define XEN_MULTIBOOT_MAGIC3 0x336ec578 - -#define XEN_MULTIBOOT_FLAG_ALIGN4K 0x00000001 -#define XEN_MULTIBOOT_FLAG_NEEDMEMINFO 0x00000002 -#define XEN_MULTIBOOT_FLAG_NEEDVIDINFO 0x00000004 -#define XEN_MULTIBOOT_FLAG_ADDRSVALID 0x00010000 -#define XEN_MULTIBOOT_FLAG_PAE_SHIFT 14 -#define XEN_MULTIBOOT_FLAG_PAE_MASK (3 << XEN_MULTIBOOT_FLAG_PAE_SHIFT) - -/* Flags we test for */ -#define FLAGS_MASK ((~ 0) & (~ XEN_MULTIBOOT_FLAG_ALIGN4K) & \ - (~ XEN_MULTIBOOT_FLAG_PAE_MASK)) -#define FLAGS_REQUIRED XEN_MULTIBOOT_FLAG_ADDRSVALID - -/* --------------------------------------------------------------------- */ - -static struct xen_bin_image_table *find_table(struct xc_dom_image *dom) -{ - struct xen_bin_image_table *table; - uint32_t *probe_ptr; - uint32_t *probe_end; - - if ( dom->kernel_size < sizeof(*table) ) - return NULL; - probe_ptr = dom->kernel_blob; - if ( dom->kernel_size > (8192 + sizeof(*table)) ) - probe_end = dom->kernel_blob + 8192; - else - probe_end = dom->kernel_blob + dom->kernel_size - sizeof(*table); - - for ( table = NULL; probe_ptr < probe_end; probe_ptr++ ) - { - if ( *probe_ptr == XEN_MULTIBOOT_MAGIC3 ) - { - table = (struct xen_bin_image_table *) probe_ptr; - /* Checksum correct? */ - if ( (table->magic + table->flags + table->checksum) == 0 ) - return table; - } - } - return NULL; -} - -static int xc_dom_probe_bin_kernel(struct xc_dom_image *dom) -{ - return find_table(dom) ? 
0 : -EINVAL; -} - -static int xc_dom_parse_bin_kernel(struct xc_dom_image *dom) -{ - struct xen_bin_image_table *image_info; - char *image = dom->kernel_blob; - size_t image_size = dom->kernel_size; - uint32_t start_addr; - uint32_t load_end_addr; - uint32_t bss_end_addr; - uint32_t pae_flags; - - image_info = find_table(dom); - if ( !image_info ) - return -EINVAL; - - DOMPRINTF("%s: multiboot header fields", __FUNCTION__); - DOMPRINTF(" flags: 0x%" PRIx32 "", image_info->flags); - DOMPRINTF(" header_addr: 0x%" PRIx32 "", image_info->header_addr); - DOMPRINTF(" load_addr: 0x%" PRIx32 "", image_info->load_addr); - DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", image_info->load_end_addr); - DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", image_info->bss_end_addr); - DOMPRINTF(" entry_addr: 0x%" PRIx32 "", image_info->entry_addr); - - /* Check the flags */ - if ( (image_info->flags & FLAGS_MASK) != FLAGS_REQUIRED ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: xen_bin_image_table flags required " - "0x%08" PRIx32 " found 0x%08" PRIx32 "", - __FUNCTION__, FLAGS_REQUIRED, image_info->flags & FLAGS_MASK); - return -EINVAL; - } - - /* Sanity check on the addresses */ - if ( (image_info->header_addr < image_info->load_addr) || - ((char *) image_info - image) < - (image_info->header_addr - image_info->load_addr) ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid header_addr.", - __FUNCTION__); - return -EINVAL; - } - - start_addr = image_info->header_addr - ((char *)image_info - image); - load_end_addr = image_info->load_end_addr ?: start_addr + image_size; - bss_end_addr = image_info->bss_end_addr ?: load_end_addr; - - DOMPRINTF("%s: calculated addresses", __FUNCTION__); - DOMPRINTF(" start_addr: 0x%" PRIx32 "", start_addr); - DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", load_end_addr); - DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", bss_end_addr); - - if ( (start_addr + image_size) < load_end_addr ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid load_end_addr.", - __FUNCTION__); - return -EINVAL; - } - - if ( bss_end_addr < load_end_addr) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid bss_end_addr.", - __FUNCTION__); - return -EINVAL; - } - - dom->kernel_seg.vstart = image_info->load_addr; - dom->kernel_seg.vend = bss_end_addr; - dom->parms.virt_base = start_addr; - dom->parms.virt_entry = image_info->entry_addr; - - pae_flags = image_info->flags & XEN_MULTIBOOT_FLAG_PAE_MASK; - switch (pae_flags >> XEN_MULTIBOOT_FLAG_PAE_SHIFT) { - case 0: - dom->guest_type = "xen-3.0-x86_32"; - break; - case 1: - dom->guest_type = "xen-3.0-x86_32p"; - break; - case 2: - dom->guest_type = "xen-3.0-x86_64"; - break; - case 3: - /* Kernel detects PAE at runtime. So try to figure whenever - * xen supports PAE and advertise a PAE-capable kernel in case - * it does. 
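A worked example of the address arithmetic in the parser above, with invented values, since the header_addr/load_addr interplay is subtle:

/*
 * Suppose the table is found 0x40 bytes into the image, with
 * header_addr = 0x100040, load_addr = 0x100000, load_end_addr = 0,
 * bss_end_addr = 0x120000 and image_size = 0x8000.  Then:
 *
 *   start_addr    = 0x100040 - 0x40              = 0x100000
 *   load_end_addr = start_addr + image_size      = 0x108000  (field was 0)
 *   bss_end_addr  = 0x120000                                 (as given)
 *   skip          = load_addr - start_addr       = 0
 *   text_size     = load_end_addr - load_addr    = 0x8000
 *   bss_size      = bss_end_addr - load_end_addr = 0x18000
 */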
*/ - dom->guest_type = "xen-3.0-x86_32"; - if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") ) - { - DOMPRINTF("%s: PAE fixup", __FUNCTION__); - dom->guest_type = "xen-3.0-x86_32p"; - dom->parms.pae = XEN_PAE_EXTCR3; - } - break; - } - return 0; -} - -static int xc_dom_load_bin_kernel(struct xc_dom_image *dom) -{ - struct xen_bin_image_table *image_info; - char *image = dom->kernel_blob; - char *dest; - size_t image_size = dom->kernel_size; - size_t dest_size; - uint32_t start_addr; - uint32_t load_end_addr; - uint32_t bss_end_addr; - uint32_t skip, text_size, bss_size; - - image_info = find_table(dom); - if ( !image_info ) - return -EINVAL; - - start_addr = image_info->header_addr - ((char *)image_info - image); - load_end_addr = image_info->load_end_addr ?: start_addr + image_size; - bss_end_addr = image_info->bss_end_addr ?: load_end_addr; - - /* It's possible that we need to skip the first part of the image */ - skip = image_info->load_addr - start_addr; - text_size = load_end_addr - image_info->load_addr; - bss_size = bss_end_addr - load_end_addr; - - DOMPRINTF("%s: calculated sizes", __FUNCTION__); - DOMPRINTF(" skip: 0x%" PRIx32 "", skip); - DOMPRINTF(" text_size: 0x%" PRIx32 "", text_size); - DOMPRINTF(" bss_size: 0x%" PRIx32 "", bss_size); - - dest = xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart, &dest_size); - if ( dest == NULL ) - { - DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart)" - " => NULL", __FUNCTION__); - return -EINVAL; - } - - if ( dest_size < text_size || - dest_size - text_size < bss_size ) - { - DOMPRINTF("%s: mapped region is too small for image", __FUNCTION__); - return -EINVAL; - } - - if ( image_size < skip || - image_size - skip < text_size ) - { - DOMPRINTF("%s: image is too small for declared text size", - __FUNCTION__); - return -EINVAL; - } - - memcpy(dest, image + skip, text_size); - memset(dest + text_size, 0, bss_size); - - return 0; -} - -/* ------------------------------------------------------------------------ */ - -static struct xc_dom_loader bin_loader = { - .name = "multiboot-binary", - .probe = xc_dom_probe_bin_kernel, - .parser = xc_dom_parse_bin_kernel, - .loader = xc_dom_load_bin_kernel, -}; - -static void __init register_loader(void) -{ - xc_dom_register_loader(&bin_loader); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_boot.c b/tools/libxc/xg_dom_boot.c deleted file mode 100644 index 1e31e92244..0000000000 --- a/tools/libxc/xg_dom_boot.c +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Xen domain builder -- xen booter. - * - * This is the code which actually boots a fresh - * prepared domain image as xen guest domain. - * - * ==> this is the only domain builder code piece - * where xen hypercalls are allowed <== - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . 
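
The size checks in xc_dom_load_bin_kernel above are deliberately overflow-safe: instead of testing dest_size >= text_size + bss_size, where the sum can wrap, they compare first and then subtract. The same pattern in isolation, as a sketch with hypothetical names:

    #include <stdbool.h>
    #include <stddef.h>

    /* Can a dst-byte buffer hold text followed by bss?  Never forms
     * text + bss, so no wraparound is possible. */
    static bool fits(size_t dst, size_t text, size_t bss)
    {
        return dst >= text && dst - text >= bss;
    }

The same shape guards the input side: image_size < skip || image_size - skip < text_size.
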
- * - */ - -#include -#include -#include -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" -#include "xc_core.h" -#include -#include - -/* ------------------------------------------------------------------------ */ - -static int setup_hypercall_page(struct xc_dom_image *dom) -{ - DECLARE_DOMCTL; - xen_pfn_t pfn; - int rc; - - if ( dom->parms.virt_hypercall == -1 ) - return 0; - pfn = (dom->parms.virt_hypercall - dom->parms.virt_base) - >> XC_DOM_PAGE_SHIFT(dom); - - DOMPRINTF("%s: vaddr=0x%" PRIx64 " pfn=0x%" PRIpfn "", __FUNCTION__, - dom->parms.virt_hypercall, pfn); - domctl.cmd = XEN_DOMCTL_hypercall_init; - domctl.domain = dom->guest_domid; - domctl.u.hypercall_init.gmfn = xc_dom_p2m(dom, pfn); - rc = do_domctl(dom->xch, &domctl); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: HYPERCALL_INIT failed: %d - %s)", - __FUNCTION__, errno, strerror(errno)); - return rc; -} - - -/* ------------------------------------------------------------------------ */ - -int xc_dom_compat_check(struct xc_dom_image *dom) -{ - xen_capabilities_info_t xen_caps; - char *item, *ptr; - int match, found = 0; - - strncpy(xen_caps, dom->xen_caps, XEN_CAPABILITIES_INFO_LEN - 1); - xen_caps[XEN_CAPABILITIES_INFO_LEN - 1] = '\0'; - - for ( item = strtok_r(xen_caps, " ", &ptr); - item != NULL ; item = strtok_r(NULL, " ", &ptr) ) - { - match = !strcmp(dom->guest_type, item); - DOMPRINTF("%s: supported guest type: %s%s", __FUNCTION__, - item, match ? " <= matches" : ""); - if ( match ) - found++; - } - if ( !found ) - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: guest type %s not supported by xen kernel, sorry", - __FUNCTION__, dom->guest_type); - - return found; -} - -int xc_dom_boot_xen_init(struct xc_dom_image *dom, xc_interface *xch, uint32_t domid) -{ - dom->xch = xch; - dom->guest_domid = domid; - - dom->xen_version = xc_version(xch, XENVER_version, NULL); - if ( xc_version(xch, XENVER_capabilities, &dom->xen_caps) < 0 ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, "can't get xen capabilities"); - return -1; - } - DOMPRINTF("%s: ver %d.%d, caps %s", __FUNCTION__, - dom->xen_version >> 16, dom->xen_version & 0xff, - dom->xen_caps); - return 0; -} - -int xc_dom_boot_mem_init(struct xc_dom_image *dom) -{ - long rc; - - DOMPRINTF_CALLED(dom->xch); - - rc = dom->arch_hooks->meminit(dom); - if ( rc != 0 ) - { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "%s: can't allocate low memory for domain", - __FUNCTION__); - return rc; - } - - return 0; -} - -void *xc_dom_boot_domU_map(struct xc_dom_image *dom, xen_pfn_t pfn, - xen_pfn_t count) -{ - int page_shift = XC_DOM_PAGE_SHIFT(dom); - privcmd_mmap_entry_t *entries; - void *ptr; - int i; - int err; - - entries = xc_dom_malloc(dom, count * sizeof(privcmd_mmap_entry_t)); - if ( entries == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn - " [malloc]", __FUNCTION__, pfn, count); - return NULL; - } - - for ( i = 0; i < count; i++ ) - entries[i].mfn = xc_dom_p2m(dom, pfn + i); - - ptr = xc_map_foreign_ranges(dom->xch, dom->guest_domid, - count << page_shift, PROT_READ | PROT_WRITE, 1 << page_shift, - entries, count); - if ( ptr == NULL ) - { - err = errno; - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn - " [mmap, errno=%i (%s)]", __FUNCTION__, pfn, count, - err, strerror(err)); - return NULL; - } - - return ptr; -} - -int xc_dom_boot_image(struct xc_dom_image *dom) -{ - xc_dominfo_t info; - int rc; - - 
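
For reference, xc_dom_compat_check above is a plain token match over the hypervisor's space-separated capability string. A self-contained sketch of the lookup; caps_match and its fixed local buffer are illustrative, the real code copies into a xen_capabilities_info_t:

    #include <stdio.h>
    #include <string.h>

    /* Return 1 if 'type' appears as a whole token in 'caps'. */
    static int caps_match(const char *caps, const char *type)
    {
        char buf[1024], *item, *ptr;

        strncpy(buf, caps, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        for ( item = strtok_r(buf, " ", &ptr); item != NULL;
              item = strtok_r(NULL, " ", &ptr) )
            if ( !strcmp(type, item) )
                return 1;
        return 0;
    }

    int main(void)
    {
        /* prints 1: the guest type is in the caps list */
        printf("%d\n", caps_match("xen-3.0-x86_64 xen-3.0-x86_32p",
                                  "xen-3.0-x86_32p"));
        return 0;
    }
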
DOMPRINTF_CALLED(dom->xch); - - /* misc stuff*/ - if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 ) - return rc; - - /* collect some info */ - rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &info); - if ( rc < 0 ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc); - return rc; - } - if ( rc == 0 || info.domid != dom->guest_domid ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: Huh? No domains found (nr_domains=%d) " - "or domid mismatch (%d != %d)", __FUNCTION__, - rc, info.domid, dom->guest_domid); - return -1; - } - dom->shared_info_mfn = info.shared_info_frame; - - /* sanity checks */ - if ( !xc_dom_compat_check(dom) ) - return -1; - - /* initial mm setup */ - if ( dom->arch_hooks->setup_pgtables && - (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 ) - return rc; - - /* start info page */ - if ( dom->arch_hooks->start_info ) - dom->arch_hooks->start_info(dom); - - /* hypercall page */ - if ( (rc = setup_hypercall_page(dom)) != 0 ) - return rc; - xc_dom_log_memory_footprint(dom); - - /* misc x86 stuff */ - if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 ) - return rc; - - /* let the vm run */ - if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 ) - return rc; - xc_dom_unmap_all(dom); - - return rc; -} - -static xen_pfn_t xc_dom_gnttab_setup(xc_interface *xch, uint32_t domid) -{ - gnttab_setup_table_t setup; - DECLARE_HYPERCALL_BUFFER(xen_pfn_t, gmfnp); - int rc; - xen_pfn_t gmfn; - - gmfnp = xc_hypercall_buffer_alloc(xch, gmfnp, sizeof(*gmfnp)); - if (gmfnp == NULL) - return -1; - - setup.dom = domid; - setup.nr_frames = 1; - set_xen_guest_handle(setup.frame_list, gmfnp); - setup.status = 0; - - rc = xc_gnttab_op(xch, GNTTABOP_setup_table, &setup, sizeof(setup), 1); - gmfn = *gmfnp; - xc_hypercall_buffer_free(xch, gmfnp); - - if ( rc != 0 || setup.status != GNTST_okay ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to setup domU grant table " - "[errno=%d, status=%" PRId16 "]\n", - __FUNCTION__, rc != 0 ? 
errno : 0, setup.status); - return -1; - } - - return gmfn; -} - -static void xc_dom_set_gnttab_entry(xc_interface *xch, - grant_entry_v1_t *gnttab, - unsigned int idx, - uint32_t guest_domid, - uint32_t backend_domid, - xen_pfn_t guest_gfn) -{ - if ( guest_domid == backend_domid || guest_gfn == -1 ) - return; - - xc_dom_printf(xch, "%s: d%d gnt[%u] -> d%d 0x%"PRI_xen_pfn, - __func__, guest_domid, idx, backend_domid, guest_gfn); - - gnttab[idx].flags = GTF_permit_access; - gnttab[idx].domid = backend_domid; - gnttab[idx].frame = guest_gfn; -} - -static int compat_gnttab_seed(xc_interface *xch, uint32_t domid, - xen_pfn_t console_gfn, - xen_pfn_t xenstore_gfn, - uint32_t console_domid, - uint32_t xenstore_domid) -{ - - xen_pfn_t gnttab_gfn; - grant_entry_v1_t *gnttab; - - gnttab_gfn = xc_dom_gnttab_setup(xch, domid); - if ( gnttab_gfn == -1 ) - return -1; - - gnttab = xc_map_foreign_range(xch, - domid, - PAGE_SIZE, - PROT_READ|PROT_WRITE, - gnttab_gfn); - if ( gnttab == NULL ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to map d%d grant table " - "[errno=%d]\n", - __func__, domid, errno); - return -1; - } - - xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_CONSOLE, - domid, console_domid, console_gfn); - xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_XENSTORE, - domid, xenstore_domid, xenstore_gfn); - - if ( munmap(gnttab, PAGE_SIZE) == -1 ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to unmap d%d grant table " - "[errno=%d]\n", - __func__, domid, errno); - return -1; - } - - /* Guest shouldn't really touch its grant table until it has - * enabled its caches. But lets be nice. */ - xc_domain_cacheflush(xch, domid, gnttab_gfn, 1); - - return 0; -} - -static int compat_gnttab_hvm_seed(xc_interface *xch, uint32_t domid, - xen_pfn_t console_gfn, - xen_pfn_t xenstore_gfn, - uint32_t console_domid, - uint32_t xenstore_domid) -{ - int rc; - xen_pfn_t scratch_gfn; - struct xen_add_to_physmap xatp = { - .domid = domid, - .space = XENMAPSPACE_grant_table, - .idx = 0, - }; - struct xen_remove_from_physmap xrfp = { - .domid = domid, - }; - - rc = xc_core_arch_get_scratch_gpfn(xch, domid, &scratch_gfn); - if ( rc < 0 ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to get a scratch gfn from d%d" - "[errno=%d]\n", - __func__, domid, errno); - return -1; - } - xatp.gpfn = scratch_gfn; - xrfp.gpfn = scratch_gfn; - - xc_dom_printf(xch, "%s: d%d: pfn=0x%"PRI_xen_pfn, __func__, - domid, scratch_gfn); - - rc = do_memory_op(xch, XENMEM_add_to_physmap, &xatp, sizeof(xatp)); - if ( rc != 0 ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to add gnttab to d%d physmap " - "[errno=%d]\n", - __func__, domid, errno); - return -1; - } - - rc = compat_gnttab_seed(xch, domid, - console_gfn, xenstore_gfn, - console_domid, xenstore_domid); - if (rc != 0) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to seed gnttab entries for d%d\n", - __func__, domid); - (void) do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, - sizeof(xrfp)); - return -1; - } - - rc = do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, sizeof(xrfp)); - if (rc != 0) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to remove gnttab from d%d physmap " - "[errno=%d]\n", - __func__, domid, errno); - return -1; - } - - return 0; -} - -int xc_dom_gnttab_seed(xc_interface *xch, uint32_t guest_domid, - bool is_hvm, xen_pfn_t console_gfn, - xen_pfn_t xenstore_gfn, uint32_t console_domid, - uint32_t xenstore_domid) -{ - xenforeignmemory_handle* fmem = xch->fmem; - 
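
The grant-table seeding above boils down to filling two reserved v1 entries so the console and xenstore pages are grantable to their backends from the start. A sketch of just that step; the struct mirrors grant_entry_v1 from Xen's public grant_table.h, and the reserved indexes (console 0, xenstore 1) are my reading of those headers:

    #include <stdint.h>

    struct gnt_v1 {              /* mirrors grant_entry_v1 */
        uint16_t flags;          /* GTF_* */
        uint16_t domid;          /* backend allowed to map the frame */
        uint32_t frame;          /* guest frame number granted */
    };

    #define GTF_PERMIT_ACCESS 1
    #define RESV_CONSOLE      0
    #define RESV_XENSTORE     1

    static void seed_entry(struct gnt_v1 *tab, unsigned int idx,
                           uint16_t guest_domid, uint16_t backend_domid,
                           uint32_t gfn)
    {
        /* as in xc_dom_set_gnttab_entry: nothing to do for a local
         * backend or an invalid frame */
        if ( guest_domid == backend_domid || gfn == (uint32_t)-1 )
            return;
        tab[idx].flags = GTF_PERMIT_ACCESS;
        tab[idx].domid = backend_domid;
        tab[idx].frame = gfn;
    }
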
xenforeignmemory_resource_handle *fres; - void *addr = NULL; - - fres = xenforeignmemory_map_resource( - fmem, guest_domid, XENMEM_resource_grant_table, - XENMEM_resource_grant_table_id_shared, 0, 1, &addr, - PROT_READ | PROT_WRITE, 0); - if ( !fres ) - { - if ( errno == EOPNOTSUPP ) - return is_hvm ? - compat_gnttab_hvm_seed(xch, guest_domid, - console_gfn, xenstore_gfn, - console_domid, xenstore_domid) : - compat_gnttab_seed(xch, guest_domid, - console_gfn, xenstore_gfn, - console_domid, xenstore_domid); - - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: failed to acquire d%d grant table [errno=%d]\n", - __func__, guest_domid, errno); - return -1; - } - - xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_CONSOLE, - guest_domid, console_domid, console_gfn); - xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_XENSTORE, - guest_domid, xenstore_domid, xenstore_gfn); - - xenforeignmemory_unmap_resource(fmem, fres); - - return 0; -} - -int xc_dom_gnttab_init(struct xc_dom_image *dom) -{ - bool is_hvm = xc_dom_translated(dom); - xen_pfn_t console_gfn = xc_dom_p2m(dom, dom->console_pfn); - xen_pfn_t xenstore_gfn = xc_dom_p2m(dom, dom->xenstore_pfn); - - return xc_dom_gnttab_seed(dom->xch, dom->guest_domid, is_hvm, - console_gfn, xenstore_gfn, - dom->console_domid, dom->xenstore_domid); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_bzimageloader.c b/tools/libxc/xg_dom_bzimageloader.c deleted file mode 100644 index f959a77602..0000000000 --- a/tools/libxc/xg_dom_bzimageloader.c +++ /dev/null @@ -1,812 +0,0 @@ -/* - * Xen domain builder -- bzImage bits - * - * Parse and load bzImage kernel images. - * - * This relies on version 2.08 of the boot protocol, which contains an - * ELF file embedded in the bzImage. The loader extracts this ELF - * image and passes it off to the standard ELF loader. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . - * written 2007 by Jeremy Fitzhardinge - * written 2008 by Ian Campbell - * written 2009 by Chris Lalancette - * - */ - -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress.h" - -#include - -#ifndef __MINIOS__ - -#if defined(HAVE_BZLIB) - -#include - -static int xc_try_bzip2_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - bz_stream stream; - int ret; - char *out_buf; - char *tmp_buf; - int retval = -1; - unsigned int outsize; - uint64_t total; - - stream.bzalloc = NULL; - stream.bzfree = NULL; - stream.opaque = NULL; - - if ( dom->kernel_size == 0) - { - DOMPRINTF("BZIP2: Input is 0 size"); - return -1; - } - - ret = BZ2_bzDecompressInit(&stream, 0, 0); - if ( ret != BZ_OK ) - { - DOMPRINTF("BZIP2: Error initting stream"); - return -1; - } - - /* sigh. We don't know up-front how much memory we are going to need - * for the output buffer. 
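
Both decompressors below size their output buffer by geometric doubling, checking for arithmetic overflow and for the configured kernel-size cap before every growth step. The bare step, extracted as a sketch; grow is hypothetical and limit stands in for the xc_dom_kernel_check_size() cap:

    #include <stdlib.h>
    #include <limits.h>

    /* Double *cap; returns the new buffer, or NULL if doubling would
     * overflow or exceed 'limit' (the caller keeps the old buffer). */
    static char *grow(char *buf, unsigned int *cap, unsigned int limit)
    {
        char *nbuf;

        if ( *cap > UINT_MAX / 2 || *cap * 2 > limit )
            return NULL;
        nbuf = realloc(buf, *cap * 2);
        if ( nbuf != NULL )
            *cap *= 2;
        return nbuf;
    }
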
Allocate the output buffer to be equal - * the input buffer to start, and we'll realloc as needed. - */ - outsize = dom->kernel_size; - - /* - * stream.avail_in and outsize are unsigned int, while kernel_size - * is a size_t. Check we aren't overflowing. - */ - if ( outsize != dom->kernel_size ) - { - DOMPRINTF("BZIP2: Input too large"); - goto bzip2_cleanup; - } - - out_buf = malloc(outsize); - if ( out_buf == NULL ) - { - DOMPRINTF("BZIP2: Failed to alloc memory"); - goto bzip2_cleanup; - } - - stream.next_in = dom->kernel_blob; - stream.avail_in = dom->kernel_size; - - stream.next_out = out_buf; - stream.avail_out = dom->kernel_size; - - for ( ; ; ) - { - ret = BZ2_bzDecompress(&stream); - if ( ret == BZ_STREAM_END ) - { - DOMPRINTF("BZIP2: Saw data stream end"); - retval = 0; - break; - } - if ( ret != BZ_OK ) - { - DOMPRINTF("BZIP2: error %d", ret); - free(out_buf); - goto bzip2_cleanup; - } - - if ( stream.avail_out == 0 ) - { - /* Protect against output buffer overflow */ - if ( outsize > UINT_MAX / 2 ) - { - DOMPRINTF("BZIP2: output buffer overflow"); - free(out_buf); - goto bzip2_cleanup; - } - - if ( xc_dom_kernel_check_size(dom, outsize * 2) ) - { - DOMPRINTF("BZIP2: output too large"); - free(out_buf); - goto bzip2_cleanup; - } - - tmp_buf = realloc(out_buf, outsize * 2); - if ( tmp_buf == NULL ) - { - DOMPRINTF("BZIP2: Failed to realloc memory"); - free(out_buf); - goto bzip2_cleanup; - } - out_buf = tmp_buf; - - stream.next_out = out_buf + outsize; - stream.avail_out = (outsize * 2) - outsize; - outsize *= 2; - } - else if ( stream.avail_in == 0 ) - { - /* - * If there is output buffer available then this indicates - * that BZ2_bzDecompress would like more input data to be - * provided. However our complete input buffer is in - * memory and provided upfront so if avail_in is zero this - * actually indicates a truncated input. - */ - DOMPRINTF("BZIP2: not enough input"); - free(out_buf); - goto bzip2_cleanup; - } - } - - total = (((uint64_t)stream.total_out_hi32) << 32) | stream.total_out_lo32; - - if ( xc_dom_register_external(dom, out_buf, total) ) - { - DOMPRINTF("BZIP2: Error registering stream output"); - free(out_buf); - goto bzip2_cleanup; - } - - DOMPRINTF("%s: BZIP2 decompress OK, 0x%zx -> 0x%lx", - __FUNCTION__, *size, (long unsigned int) total); - - *blob = out_buf; - *size = total; - - bzip2_cleanup: - BZ2_bzDecompressEnd(&stream); - - return retval; -} - -#else /* !defined(HAVE_BZLIB) */ - -static int xc_try_bzip2_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: BZIP2 decompress support unavailable", - __FUNCTION__); - return -1; -} - -#endif - -#if defined(HAVE_LZMA) - -#include - -static int _xc_try_lzma_decode( - struct xc_dom_image *dom, void **blob, size_t *size, - lzma_stream *stream, const char *what) -{ - lzma_ret ret; - lzma_action action = LZMA_RUN; - unsigned char *out_buf; - unsigned char *tmp_buf; - int retval = -1; - size_t outsize; - const char *msg; - - if ( dom->kernel_size == 0) - { - DOMPRINTF("%s: Input is 0 size", what); - return -1; - } - - /* sigh. We don't know up-front how much memory we are going to need - * for the output buffer. Allocate the output buffer to be equal - * the input buffer to start, and we'll realloc as needed. 
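
One detail from the bzip2 path above: libbz2 reports the total output as two 32-bit halves, and the 64-bit cast before the shift matters, since shifting a 32-bit value by 32 bits is undefined behaviour. In isolation:

    #include <stdint.h>

    /* reassemble bz_stream's total_out_hi32 / total_out_lo32 */
    static uint64_t bz_total(uint32_t hi, uint32_t lo)
    {
        return ((uint64_t)hi << 32) | lo;
    }
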
- */ - outsize = dom->kernel_size; - out_buf = malloc(outsize); - if ( out_buf == NULL ) - { - DOMPRINTF("%s: Failed to alloc memory", what); - goto lzma_cleanup; - } - - stream->next_in = dom->kernel_blob; - stream->avail_in = dom->kernel_size; - - stream->next_out = out_buf; - stream->avail_out = dom->kernel_size; - - for ( ; ; ) - { - ret = lzma_code(stream, action); - if ( ret == LZMA_STREAM_END ) - { - DOMPRINTF("%s: Saw data stream end", what); - retval = 0; - break; - } - if ( ret != LZMA_OK ) - { - switch ( ret ) - { - case LZMA_MEM_ERROR: - msg = strerror(ENOMEM); - break; - - case LZMA_MEMLIMIT_ERROR: - msg = "Memory usage limit reached"; - break; - - case LZMA_FORMAT_ERROR: - msg = "File format not recognized"; - break; - - case LZMA_OPTIONS_ERROR: - // FIXME: Better message? - msg = "Unsupported compression options"; - break; - - case LZMA_DATA_ERROR: - msg = "File is corrupt"; - break; - - case LZMA_BUF_ERROR: - msg = "Unexpected end of input"; - break; - - default: - msg = "Internal program error (bug)"; - break; - } - DOMPRINTF("%s: %s decompression error: %s", - __FUNCTION__, what, msg); - free(out_buf); - goto lzma_cleanup; - } - - if ( stream->avail_out == 0 ) - { - /* Protect against output buffer overflow */ - if ( outsize > SIZE_MAX / 2 ) - { - DOMPRINTF("%s: output buffer overflow", what); - free(out_buf); - goto lzma_cleanup; - } - - if ( xc_dom_kernel_check_size(dom, outsize * 2) ) - { - DOMPRINTF("%s: output too large", what); - free(out_buf); - goto lzma_cleanup; - } - - tmp_buf = realloc(out_buf, outsize * 2); - if ( tmp_buf == NULL ) - { - DOMPRINTF("%s: Failed to realloc memory", what); - free(out_buf); - goto lzma_cleanup; - } - out_buf = tmp_buf; - - stream->next_out = out_buf + outsize; - stream->avail_out = (outsize * 2) - outsize; - outsize *= 2; - } - } - - if ( xc_dom_register_external(dom, out_buf, stream->total_out) ) - { - DOMPRINTF("%s: Error registering stream output", what); - free(out_buf); - goto lzma_cleanup; - } - - DOMPRINTF("%s: %s decompress OK, 0x%zx -> 0x%zx", - __FUNCTION__, what, *size, (size_t)stream->total_out); - - *blob = out_buf; - *size = stream->total_out; - - lzma_cleanup: - lzma_end(stream); - - return retval; -} - -/* 128 Mb is the minimum size (half-way) documented to work for all inputs. 
*/ -#define LZMA_BLOCK_SIZE (128*1024*1024) - -static int xc_try_xz_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - lzma_stream stream = LZMA_STREAM_INIT; - - if ( lzma_stream_decoder(&stream, LZMA_BLOCK_SIZE, 0) != LZMA_OK ) - { - DOMPRINTF("XZ: Failed to init decoder"); - return -1; - } - - return _xc_try_lzma_decode(dom, blob, size, &stream, "XZ"); -} - -static int xc_try_lzma_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - lzma_stream stream = LZMA_STREAM_INIT; - - if ( lzma_alone_decoder(&stream, LZMA_BLOCK_SIZE) != LZMA_OK ) - { - DOMPRINTF("LZMA: Failed to init decoder"); - return -1; - } - - return _xc_try_lzma_decode(dom, blob, size, &stream, "LZMA"); -} - -#else /* !defined(HAVE_LZMA) */ - -static int xc_try_xz_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: XZ decompress support unavailable", - __FUNCTION__); - return -1; -} - -static int xc_try_lzma_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: LZMA decompress support unavailable", - __FUNCTION__); - return -1; -} - -#endif - -#if defined(HAVE_LZO1X) - -#include - -#define LZOP_HEADER_HAS_FILTER 0x00000800 -#define LZOP_MAX_BLOCK_SIZE (64*1024*1024) - -static inline uint_fast16_t lzo_read_16(const unsigned char *buf) -{ - return buf[1] | (buf[0] << 8); -} - -static inline uint_fast32_t lzo_read_32(const unsigned char *buf) -{ - return lzo_read_16(buf + 2) | ((uint32_t)lzo_read_16(buf) << 16); -} - -static int xc_try_lzo1x_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - int ret; - const unsigned char *cur = dom->kernel_blob; - unsigned char *out_buf = NULL; - size_t left = dom->kernel_size; - const char *msg; - unsigned version; - static const unsigned char magic[] = { - 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a - }; - - /* - * lzo_uint should match size_t. Check that this is the case to be - * sure we won't overflow various lzo_uint fields. 
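
The lzop container stores its multi-byte header fields big-endian, which is all that lzo_read_16/lzo_read_32 above do. Equivalent standalone readers with a tiny self-check:

    #include <stdint.h>
    #include <assert.h>

    static uint16_t rd16be(const unsigned char *p)
    {
        return (uint16_t)((p[0] << 8) | p[1]);
    }

    static uint32_t rd32be(const unsigned char *p)
    {
        return ((uint32_t)rd16be(p) << 16) | rd16be(p + 2);
    }

    int main(void)
    {
        const unsigned char v[] = { 0x12, 0x34, 0x56, 0x78 };
        assert(rd32be(v) == 0x12345678u);
        return 0;
    }
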
- */ - BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t)); - - ret = lzo_init(); - if ( ret != LZO_E_OK ) - { - DOMPRINTF("LZO1x: Failed to init library (%d)\n", ret); - return -1; - } - - if ( left < 16 || memcmp(cur, magic, 9) ) - { - DOMPRINTF("LZO1x: Unrecognized magic\n"); - return -1; - } - - /* get version (2bytes), skip library version (2), - * 'need to be extracted' version (2) and method (1) */ - version = lzo_read_16(cur + 9); - cur += 16; - left -= 16; - - if ( version >= 0x0940 ) - { - /* skip level */ - ++cur; - if ( left ) - --left; - } - - if ( left >= 4 && (lzo_read_32(cur) & LZOP_HEADER_HAS_FILTER) ) - ret = 8; /* flags + filter info */ - else - ret = 4; /* flags */ - - /* skip mode and mtime_low */ - ret += 8; - if ( version >= 0x0940 ) - ret += 4; /* skip mtime_high */ - - /* don't care about the file name, and skip checksum */ - if ( left > ret ) - ret += 1 + cur[ret] + 4; - - if ( left < ret ) - { - DOMPRINTF("LZO1x: Incomplete header\n"); - return -1; - } - cur += ret; - left -= ret; - - for ( *size = 0; ; ) - { - lzo_uint src_len, dst_len, out_len; - unsigned char *tmp_buf; - - msg = "Short input"; - if ( left < 4 ) - break; - - dst_len = lzo_read_32(cur); - if ( !dst_len ) - { - msg = "Error registering stream output"; - if ( xc_dom_register_external(dom, out_buf, *size) ) - break; - - return 0; - } - - if ( dst_len > LZOP_MAX_BLOCK_SIZE ) - { - msg = "Block size too large"; - break; - } - - if ( left < 12 ) - break; - - src_len = lzo_read_32(cur + 4); - cur += 12; /* also skip block checksum info */ - left -= 12; - - msg = "Bad source length"; - if ( src_len <= 0 || src_len > dst_len || src_len > left ) - break; - - msg = "Output buffer overflow"; - if ( *size > SIZE_MAX - dst_len ) - break; - - msg = "Decompressed image too large"; - if ( xc_dom_kernel_check_size(dom, *size + dst_len) ) - break; - - msg = "Failed to (re)alloc memory"; - tmp_buf = realloc(out_buf, *size + dst_len); - if ( tmp_buf == NULL ) - break; - - out_buf = tmp_buf; - out_len = dst_len; - - ret = lzo1x_decompress_safe(cur, src_len, - out_buf + *size, &out_len, NULL); - switch ( ret ) - { - case LZO_E_OK: - msg = "Input underrun"; - if ( out_len != dst_len ) - break; - - *blob = out_buf; - *size += out_len; - cur += src_len; - left -= src_len; - continue; - - case LZO_E_INPUT_NOT_CONSUMED: - msg = "Unconsumed input"; - break; - - case LZO_E_OUTPUT_OVERRUN: - msg = "Output overrun"; - break; - - case LZO_E_INPUT_OVERRUN: - msg = "Input overrun"; - break; - - case LZO_E_LOOKBEHIND_OVERRUN: - msg = "Look-behind overrun"; - break; - - case LZO_E_EOF_NOT_FOUND: - msg = "No EOF marker"; - break; - - case LZO_E_ERROR: - msg = "General error"; - break; - - default: - msg = "Internal program error (bug)"; - break; - } - - break; - } - - free(out_buf); - DOMPRINTF("LZO1x decompression error: %s\n", msg); - - return -1; -} - -#else /* !defined(HAVE_LZO1X) */ - -static int xc_try_lzo1x_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: LZO1x decompress support unavailable\n", - __FUNCTION__); - return -1; -} - -#endif - -#else /* __MINIOS__ */ - -int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size); -int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size); -int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size); -int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size); - -#endif /* !__MINIOS__ */ - -struct setup_header { - uint8_t _pad0[0x1f1]; /* skip 
uninteresting stuff */ - uint8_t setup_sects; - uint16_t root_flags; - uint32_t syssize; - uint16_t ram_size; - uint16_t vid_mode; - uint16_t root_dev; - uint16_t boot_flag; - uint16_t jump; - uint32_t header; -#define HDR_MAGIC "HdrS" -#define HDR_MAGIC_SZ 4 - uint16_t version; -#define VERSION(h,l) (((h)<<8) | (l)) - uint32_t realmode_swtch; - uint16_t start_sys; - uint16_t kernel_version; - uint8_t type_of_loader; - uint8_t loadflags; - uint16_t setup_move_size; - uint32_t code32_start; - uint32_t ramdisk_image; - uint32_t ramdisk_size; - uint32_t bootsect_kludge; - uint16_t heap_end_ptr; - uint16_t _pad1; - uint32_t cmd_line_ptr; - uint32_t initrd_addr_max; - uint32_t kernel_alignment; - uint8_t relocatable_kernel; - uint8_t _pad2[3]; - uint32_t cmdline_size; - uint32_t hardware_subarch; - uint64_t hardware_subarch_data; - uint32_t payload_offset; - uint32_t payload_length; -} __attribute__((packed)); - -extern struct xc_dom_loader elf_loader; - -static int check_magic(struct xc_dom_image *dom, const void *magic, size_t len) -{ - if (len > dom->kernel_size) - return 0; - - return (memcmp(dom->kernel_blob, magic, len) == 0); -} - -static int xc_dom_probe_bzimage_kernel(struct xc_dom_image *dom) -{ - struct setup_header *hdr; - uint64_t payload_offset, payload_length; - int ret; - - if ( dom->kernel_blob == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: no kernel image loaded", __FUNCTION__); - return -EINVAL; - } - - if ( dom->kernel_size < sizeof(struct setup_header) ) - { - xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__); - return -EINVAL; - } - - hdr = dom->kernel_blob; - - if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 ) - { - xc_dom_printf(dom->xch, "%s: kernel is not a bzImage", __FUNCTION__); - return -EINVAL; - } - - if ( hdr->version < VERSION(2,8) ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: boot protocol" - " too old (%04x)", __FUNCTION__, hdr->version); - return -EINVAL; - } - - - /* upcast to 64 bits to avoid overflow */ - /* setup_sects is u8 and so cannot overflow */ - payload_offset = (hdr->setup_sects + 1) * 512; - payload_offset += hdr->payload_offset; - payload_length = hdr->payload_length; - - if ( payload_offset >= dom->kernel_size ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload offset overflow", - __FUNCTION__); - return -EINVAL; - } - if ( (payload_offset + payload_length) > dom->kernel_size ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload length overflow", - __FUNCTION__); - return -EINVAL; - } - - dom->kernel_blob = dom->kernel_blob + payload_offset; - dom->kernel_size = payload_length; - - if ( check_magic(dom, "\037\213", 2) ) - { - ret = xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); - if ( ret == -1 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: unable to" - " gzip decompress kernel", __FUNCTION__); - return -EINVAL; - } - } - else if ( check_magic(dom, "\102\132\150", 3) ) - { - ret = xc_try_bzip2_decode(dom, &dom->kernel_blob, &dom->kernel_size); - if ( ret < 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s unable to BZIP2 decompress kernel", - __FUNCTION__); - return -EINVAL; - } - } - else if ( check_magic(dom, "\3757zXZ", 6) ) - { - ret = xc_try_xz_decode(dom, &dom->kernel_blob, &dom->kernel_size); - if ( ret < 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s unable to XZ decompress kernel", - __FUNCTION__); - return -EINVAL; - } - } - else if ( check_magic(dom, "\135\000", 2) ) - { - ret = xc_try_lzma_decode(dom, 
&dom->kernel_blob, &dom->kernel_size); - if ( ret < 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s unable to LZMA decompress kernel", - __FUNCTION__); - return -EINVAL; - } - } - else if ( check_magic(dom, "\x89LZO", 5) ) - { - ret = xc_try_lzo1x_decode(dom, &dom->kernel_blob, &dom->kernel_size); - if ( ret < 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s unable to LZO decompress kernel\n", - __FUNCTION__); - return -EINVAL; - } - } - else if ( check_magic(dom, "\x02\x21", 2) ) - { - ret = xc_try_lz4_decode(dom, &dom->kernel_blob, &dom->kernel_size); - if ( ret < 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s unable to LZ4 decompress kernel\n", - __FUNCTION__); - return -EINVAL; - } - } - else - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: unknown compression format", __FUNCTION__); - return -EINVAL; - } - - return elf_loader.probe(dom); -} - -static int xc_dom_parse_bzimage_kernel(struct xc_dom_image *dom) -{ - return elf_loader.parser(dom); -} - -static int xc_dom_load_bzimage_kernel(struct xc_dom_image *dom) -{ - return elf_loader.loader(dom); -} - -static struct xc_dom_loader bzimage_loader = { - .name = "Linux bzImage", - .probe = xc_dom_probe_bzimage_kernel, - .parser = xc_dom_parse_bzimage_kernel, - .loader = xc_dom_load_bzimage_kernel, -}; - -static void __init register_loader(void) -{ - xc_dom_register_loader(&bzimage_loader); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_compat_linux.c b/tools/libxc/xg_dom_compat_linux.c deleted file mode 100644 index b645f0b14b..0000000000 --- a/tools/libxc/xg_dom_compat_linux.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Xen domain builder -- compatibility code. - * - * Replacements for xc_linux_build & friends, - * as example code and to make the new builder - * usable as drop-in replacement. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . 
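
The bzImage probe above recognises the embedded payload purely by magic bytes. The same dispatch rewritten as a data-driven sketch; the names are labels only, and the lengths match the check_magic() calls, including NUL bytes that are part of the match:

    #include <string.h>
    #include <stddef.h>

    static const struct { const char *name; const char *magic; size_t len; }
    fmts[] = {
        { "gzip",  "\037\213",     2 },
        { "bzip2", "\102\132\150", 3 },
        { "xz",    "\3757zXZ",     6 },
        { "lzma",  "\135\000",     2 },
        { "lzo",   "\x89LZO",      5 },
        { "lz4",   "\x02\x21",     2 },
    };

    static const char *detect(const void *blob, size_t size)
    {
        size_t i;

        for ( i = 0; i < sizeof(fmts) / sizeof(fmts[0]); i++ )
            if ( size >= fmts[i].len &&
                 !memcmp(blob, fmts[i].magic, fmts[i].len) )
                return fmts[i].name;
        return NULL;   /* unknown compression format */
    }
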
- * - */ - -#include -#include -#include -#include -#include - -#include "xenctrl.h" -#include "xg_private.h" -#include "xenctrl_dom.h" - -/* ------------------------------------------------------------------------ */ - -int xc_linux_build(xc_interface *xch, uint32_t domid, - unsigned int mem_mb, - const char *image_name, - const char *initrd_name, - const char *cmdline, - const char *features, - unsigned long flags, - unsigned int store_evtchn, - unsigned long *store_mfn, - unsigned int console_evtchn, - unsigned long *console_mfn) -{ - struct xc_dom_image *dom; - int rc; - - xc_dom_loginit(xch); - dom = xc_dom_allocate(xch, cmdline, features); - if (dom == NULL) - return -1; - if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 ) - goto out; - if ( initrd_name && strlen(initrd_name) && - ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) ) - goto out; - - dom->flags |= flags; - dom->console_evtchn = console_evtchn; - dom->xenstore_evtchn = store_evtchn; - - if ( (rc = xc_dom_boot_xen_init(dom, xch, domid)) != 0 ) - goto out; - if ( (rc = xc_dom_parse_image(dom)) != 0 ) - goto out; - if ( (rc = xc_dom_mem_init(dom, mem_mb)) != 0 ) - goto out; - if ( (rc = xc_dom_boot_mem_init(dom)) != 0 ) - goto out; - if ( (rc = xc_dom_build_image(dom)) != 0 ) - goto out; - if ( (rc = xc_dom_boot_image(dom)) != 0 ) - goto out; - if ( (rc = xc_dom_gnttab_init(dom)) != 0) - goto out; - - *console_mfn = xc_dom_p2m(dom, dom->console_pfn); - *store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn); - - out: - xc_dom_release(dom); - return rc; -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_core.c b/tools/libxc/xg_dom_core.c deleted file mode 100644 index 1c91cce315..0000000000 --- a/tools/libxc/xg_dom_core.c +++ /dev/null @@ -1,1272 +0,0 @@ -/* - * Xen domain builder -- core bits. - * - * The core code goes here: - * - allocate and release domain structs. - * - memory management functions. - * - misc helper functions. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . 
- * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" -#include "_paths.h" - -/* ------------------------------------------------------------------------ */ -/* debugging */ - - - -static const char *default_logfile = XEN_LOG_DIR "/domain-builder-ng.log"; - -int xc_dom_loginit(xc_interface *xch) { - if (xch->dombuild_logger) return 0; - - if (!xch->dombuild_logger_file) { - xch->dombuild_logger_file = fopen(default_logfile, "a"); - if (!xch->dombuild_logger_file) { - PERROR("Could not open logfile `%s'", default_logfile); - return -1; - } - } - - xch->dombuild_logger = xch->dombuild_logger_tofree = - (xentoollog_logger*) - xtl_createlogger_stdiostream(xch->dombuild_logger_file, XTL_DETAIL, - XTL_STDIOSTREAM_SHOW_DATE|XTL_STDIOSTREAM_SHOW_PID); - if (!xch->dombuild_logger) - return -1; - - xc_dom_printf(xch, "### ----- xc domain builder logfile opened -----"); - - return 0; -} - -void xc_dom_printf(xc_interface *xch, const char *fmt, ...) -{ - va_list args; - if (!xch->dombuild_logger) return; - va_start(args, fmt); - xtl_logv(xch->dombuild_logger, XTL_DETAIL, -1, "domainbuilder", fmt, args); - va_end(args); -} - -void xc_dom_panic_func(xc_interface *xch, - const char *file, int line, xc_error_code err, - const char *fmt, ...) -{ - va_list args; - char msg[XC_MAX_ERROR_MSG_LEN]; - - va_start(args, fmt); - vsnprintf(msg, sizeof(msg), fmt, args); - va_end(args); - msg[sizeof(msg)-1] = 0; - - xc_report(xch, - xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler, - XTL_ERROR, err, "panic: %s:%d: %s", - file, line, msg); -} - -static void print_mem(struct xc_dom_image *dom, const char *name, size_t mem) -{ - if ( mem > (32 * 1024 * 1024) ) - DOMPRINTF("%-24s : %zd MB", name, mem / (1024 * 1024)); - else if ( mem > (32 * 1024) ) - DOMPRINTF("%-24s : %zd kB", name, mem / 1024); - else - DOMPRINTF("%-24s : %zd bytes", name, mem); -} - -void xc_dom_log_memory_footprint(struct xc_dom_image *dom) -{ - DOMPRINTF("domain builder memory footprint"); - DOMPRINTF(" allocated"); - print_mem(dom, " malloc", dom->alloc_malloc); - print_mem(dom, " anon mmap", dom->alloc_mem_map); - DOMPRINTF(" mapped"); - print_mem(dom, " file mmap", dom->alloc_file_map); - print_mem(dom, " domU mmap", dom->alloc_domU_map); -} - -/* ------------------------------------------------------------------------ */ -/* simple memory pool */ - -void *xc_dom_malloc(struct xc_dom_image *dom, size_t size) -{ - struct xc_dom_mem *block; - - if ( size > SIZE_MAX - sizeof(*block) ) - { - DOMPRINTF("%s: unreasonable allocation size", __FUNCTION__); - return NULL; - } - block = malloc(sizeof(*block) + size); - if ( block == NULL ) - { - DOMPRINTF("%s: allocation failed", __FUNCTION__); - return NULL; - } - memset(block, 0, sizeof(*block) + size); - block->type = XC_DOM_MEM_TYPE_MALLOC_INTERNAL; - block->next = dom->memblocks; - dom->memblocks = block; - dom->alloc_malloc += sizeof(*block) + size; - if ( size > (100 * 1024) ) - print_mem(dom, __FUNCTION__, size); - return block->memory; -} - -void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size) -{ - struct xc_dom_mem *block; - - block = malloc(sizeof(*block)); - if ( block == NULL ) - { - DOMPRINTF("%s: allocation failed", __FUNCTION__); - return NULL; - } - memset(block, 0, sizeof(*block)); - block->len = size; - block->ptr = mmap(NULL, block->len, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, - -1, 0); - if ( block->ptr == MAP_FAILED ) - { - DOMPRINTF("%s: mmap failed", 
__FUNCTION__); - free(block); - return NULL; - } - block->type = XC_DOM_MEM_TYPE_MMAP; - block->next = dom->memblocks; - dom->memblocks = block; - dom->alloc_malloc += sizeof(*block); - dom->alloc_mem_map += block->len; - if ( size > (100 * 1024) ) - print_mem(dom, __FUNCTION__, size); - return block->ptr; -} - -int xc_dom_register_external(struct xc_dom_image *dom, void *ptr, size_t size) -{ - struct xc_dom_mem *block; - - block = malloc(sizeof(*block)); - if ( block == NULL ) - { - DOMPRINTF("%s: allocation failed", __FUNCTION__); - return -1; - } - memset(block, 0, sizeof(*block)); - block->ptr = ptr; - block->len = size; - block->type = XC_DOM_MEM_TYPE_MALLOC_EXTERNAL; - block->next = dom->memblocks; - dom->memblocks = block; - dom->alloc_malloc += sizeof(*block); - dom->alloc_mem_map += block->len; - return 0; -} - -void *xc_dom_malloc_filemap(struct xc_dom_image *dom, - const char *filename, size_t * size, - const size_t max_size) -{ - struct xc_dom_mem *block = NULL; - int fd = -1; - off_t offset; - - fd = open(filename, O_RDONLY); - if ( fd == -1 ) { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "failed to open file '%s': %s", - filename, strerror(errno)); - goto err; - } - - if ( (lseek(fd, 0, SEEK_SET) == -1) || - ((offset = lseek(fd, 0, SEEK_END)) == -1) ) { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "failed to seek on file '%s': %s", - filename, strerror(errno)); - goto err; - } - - *size = offset; - - if ( max_size && *size > max_size ) - { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "tried to map file which is too large"); - goto err; - } - - if ( !*size ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "'%s': zero length file", filename); - goto err; - } - - block = malloc(sizeof(*block)); - if ( block == NULL ) { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "failed to allocate block (%zu bytes)", - sizeof(*block)); - goto err; - } - - memset(block, 0, sizeof(*block)); - block->len = *size; - block->ptr = mmap(NULL, block->len, PROT_READ, - MAP_SHARED, fd, 0); - if ( block->ptr == MAP_FAILED ) { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "failed to mmap file '%s': %s", - filename, strerror(errno)); - goto err; - } - - block->type = XC_DOM_MEM_TYPE_MMAP; - block->next = dom->memblocks; - dom->memblocks = block; - dom->alloc_malloc += sizeof(*block); - dom->alloc_file_map += block->len; - close(fd); - if ( *size > (100 * 1024) ) - print_mem(dom, __FUNCTION__, *size); - return block->ptr; - - err: - if ( fd != -1 ) - close(fd); - free(block); - DOMPRINTF("%s: failed (on file `%s')", __FUNCTION__, filename); - return NULL; -} - -static void xc_dom_free_all(struct xc_dom_image *dom) -{ - struct xc_dom_mem *block; - - while ( (block = dom->memblocks) != NULL ) - { - dom->memblocks = block->next; - switch ( block->type ) - { - case XC_DOM_MEM_TYPE_MALLOC_INTERNAL: - break; - case XC_DOM_MEM_TYPE_MALLOC_EXTERNAL: - free(block->ptr); - break; - case XC_DOM_MEM_TYPE_MMAP: - munmap(block->ptr, block->len); - break; - } - free(block); - } -} - -char *xc_dom_strdup(struct xc_dom_image *dom, const char *str) -{ - size_t len = strlen(str) + 1; - char *nstr = xc_dom_malloc(dom, len); - - if ( nstr == NULL ) - return NULL; - memcpy(nstr, str, len); - return nstr; -} - -/* ------------------------------------------------------------------------ */ -/* decompression buffer sizing */ -int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz) -{ - /* No limit */ - if ( !dom->max_kernel_size ) - return 0; - - if ( sz > dom->max_kernel_size ) - { - xc_dom_panic(dom->xch, 
XC_INVALID_KERNEL, - "kernel image too large"); - return 1; - } - - return 0; -} - -/* ------------------------------------------------------------------------ */ -/* read files, copy memory blocks, with transparent gunzip */ - -size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen) -{ - unsigned char *gzlen; - size_t unziplen; - - if ( ziplen < 6 ) - /* Too small. We need (i.e. the subsequent code relies on) - * 2 bytes for the magic number plus 4 bytes length. */ - return 0; - - if ( strncmp(blob, "\037\213", 2) ) - /* not gzipped */ - return 0; - - gzlen = blob + ziplen - 4; - unziplen = (size_t)gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0]; - if ( unziplen > XC_DOM_DECOMPRESS_MAX ) - { - xc_dom_printf - (xch, - "%s: size (zip %zd, unzip %zd) looks insane, skip gunzip", - __FUNCTION__, ziplen, unziplen); - return 0; - } - - return unziplen + 16; -} - -int xc_dom_do_gunzip(xc_interface *xch, - void *src, size_t srclen, void *dst, size_t dstlen) -{ - z_stream zStream; - int rc; - - memset(&zStream, 0, sizeof(zStream)); - zStream.next_in = src; - zStream.avail_in = srclen; - zStream.next_out = dst; - zStream.avail_out = dstlen; - rc = inflateInit2(&zStream, (MAX_WBITS + 32)); /* +32 means "handle gzip" */ - if ( rc != Z_OK ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: inflateInit2 failed (rc=%d)", __FUNCTION__, rc); - return -1; - } - rc = inflate(&zStream, Z_FINISH); - inflateEnd(&zStream); - if ( rc != Z_STREAM_END ) - { - xc_dom_panic(xch, XC_INTERNAL_ERROR, - "%s: inflate failed (rc=%d)", __FUNCTION__, rc); - return -1; - } - - xc_dom_printf(xch, "%s: unzip ok, 0x%zx -> 0x%zx", - __FUNCTION__, srclen, dstlen); - return 0; -} - -int xc_dom_try_gunzip(struct xc_dom_image *dom, void **blob, size_t * size) -{ - void *unzip; - size_t unziplen; - - unziplen = xc_dom_check_gzip(dom->xch, *blob, *size); - if ( unziplen == 0 ) - return 0; - - if ( xc_dom_kernel_check_size(dom, unziplen) ) - return 0; - - unzip = xc_dom_malloc(dom, unziplen); - if ( unzip == NULL ) - return -1; - - if ( xc_dom_do_gunzip(dom->xch, *blob, *size, unzip, unziplen) == -1 ) - return -1; - - *blob = unzip; - *size = unziplen; - return 0; -} - -/* ------------------------------------------------------------------------ */ -/* domain memory */ - -void *xc_dom_pfn_to_ptr(struct xc_dom_image *dom, xen_pfn_t pfn, - xen_pfn_t count) -{ - xen_pfn_t count_out_dummy; - return xc_dom_pfn_to_ptr_retcount(dom, pfn, count, &count_out_dummy); -} - -void *xc_dom_pfn_to_ptr_retcount(struct xc_dom_image *dom, xen_pfn_t pfn, - xen_pfn_t count, xen_pfn_t *count_out) -{ - struct xc_dom_phys *phys; - xen_pfn_t offset; - unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom); - char *mode = "unset"; - - *count_out = 0; - - offset = pfn - dom->rambase_pfn; - if ( offset > dom->total_pages || /* multiple checks to avoid overflows */ - count > dom->total_pages || - offset > dom->total_pages - count ) - { - DOMPRINTF("%s: pfn %"PRI_xen_pfn" out of range (0x%" PRIpfn " > 0x%" PRIpfn ")", - __FUNCTION__, pfn, offset, dom->total_pages); - return NULL; - } - - /* already allocated? 
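
xc_dom_check_gzip above relies on the gzip trailer: the last four bytes of a gzip stream hold the uncompressed length, little-endian, modulo 2^32, which is why the code treats it only as an estimate (note the + 16 slack) and bounds it by XC_DOM_DECOMPRESS_MAX. The trailer read by itself:

    #include <stdint.h>
    #include <stddef.h>

    /* ISIZE field: uncompressed size mod 2^32, or 0 if input too short */
    static uint32_t gzip_isize(const unsigned char *zip, size_t ziplen)
    {
        const unsigned char *p;

        if ( ziplen < 6 )      /* 2 magic bytes + 4 length bytes minimum */
            return 0;
        p = zip + ziplen - 4;
        return ((uint32_t)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
    }
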
*/ - for ( phys = dom->phys_pages; phys != NULL; phys = phys->next ) - { - if ( pfn >= (phys->first + phys->count) ) - continue; - if ( count ) - { - /* size given: must be completely within the already allocated block */ - if ( (pfn + count) <= phys->first ) - continue; - if ( (pfn < phys->first) || - ((pfn + count) > (phys->first + phys->count)) ) - { - DOMPRINTF("%s: request overlaps allocated block" - " (req 0x%" PRIpfn "+0x%" PRIpfn "," - " blk 0x%" PRIpfn "+0x%" PRIpfn ")", - __FUNCTION__, pfn, count, phys->first, - phys->count); - return NULL; - } - *count_out = count; - } - else - { - /* no size given: block must be allocated already, - just hand out a pointer to it */ - if ( pfn < phys->first ) - continue; - if ( pfn >= phys->first + phys->count ) - continue; - *count_out = phys->count - (pfn - phys->first); - } - return phys->ptr + ((pfn - phys->first) << page_shift); - } - - /* allocating is allowed with size specified only */ - if ( count == 0 ) - { - DOMPRINTF("%s: no block found, no size given," - " can't malloc (pfn 0x%" PRIpfn ")", - __FUNCTION__, pfn); - return NULL; - } - - /* not found, no overlap => allocate */ - phys = xc_dom_malloc(dom, sizeof(*phys)); - if ( phys == NULL ) - return NULL; - memset(phys, 0, sizeof(*phys)); - phys->first = pfn; - phys->count = count; - - if ( dom->guest_domid ) - { - mode = "domU mapping"; - phys->ptr = xc_dom_boot_domU_map(dom, phys->first, phys->count); - if ( phys->ptr == NULL ) - return NULL; - dom->alloc_domU_map += phys->count << page_shift; - } - else - { - int err; - - mode = "anonymous memory"; - phys->ptr = mmap(NULL, phys->count << page_shift, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, - -1, 0); - if ( phys->ptr == MAP_FAILED ) - { - err = errno; - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "%s: oom: can't allocate 0x%" PRIpfn " pages" - " [mmap, errno=%i (%s)]", - __FUNCTION__, count, err, strerror(err)); - return NULL; - } - dom->alloc_mem_map += phys->count << page_shift; - } - -#if 1 - DOMPRINTF("%s: %s: pfn 0x%" PRIpfn "+0x%" PRIpfn " at %p", - __FUNCTION__, mode, phys->first, phys->count, phys->ptr); -#endif - phys->next = dom->phys_pages; - dom->phys_pages = phys; - return phys->ptr; -} - -static int xc_dom_chk_alloc_pages(struct xc_dom_image *dom, char *name, - xen_pfn_t pages) -{ - unsigned int page_size = XC_DOM_PAGE_SIZE(dom); - - if ( pages > dom->total_pages || /* multiple test avoids overflow probs */ - dom->pfn_alloc_end - dom->rambase_pfn > dom->total_pages || - pages > dom->total_pages - dom->pfn_alloc_end + dom->rambase_pfn ) - { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "%s: segment %s too large (0x%"PRIpfn" > " - "0x%"PRIpfn" - 0x%"PRIpfn" pages)", __FUNCTION__, name, - pages, dom->total_pages, - dom->pfn_alloc_end - dom->rambase_pfn); - return -1; - } - - dom->pfn_alloc_end += pages; - dom->virt_alloc_end += pages * page_size; - - if ( dom->allocate ) - dom->allocate(dom); - - return 0; -} - -static int xc_dom_alloc_pad(struct xc_dom_image *dom, xen_vaddr_t boundary) -{ - unsigned int page_size = XC_DOM_PAGE_SIZE(dom); - xen_pfn_t pages; - - if ( boundary & (page_size - 1) ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: segment boundary isn't page aligned (0x%" PRIx64 ")", - __FUNCTION__, boundary); - return -1; - } - if ( boundary < dom->virt_alloc_end ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: segment boundary too low (0x%" PRIx64 " < 0x%" PRIx64 - ")", __FUNCTION__, boundary, dom->virt_alloc_end); - return -1; - } - pages = (boundary - dom->virt_alloc_end) / page_size; 
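
The padding computed above, together with the round-up in xc_dom_alloc_segment that follows, keeps every segment page-aligned. Both roundings side by side, as a sketch with PAGE_SIZE standing in for XC_DOM_PAGE_SIZE(dom):

    #include <stdint.h>

    #define PAGE_SIZE 4096u

    /* pages of padding from 'cur' up to a page-aligned 'boundary' */
    static uint64_t pad_pages(uint64_t cur, uint64_t boundary)
    {
        return (boundary - cur) / PAGE_SIZE;
    }

    /* pages covering a 'size'-byte segment, rounded up */
    static uint64_t seg_pages(uint64_t size)
    {
        return (size + PAGE_SIZE - 1) / PAGE_SIZE;
    }
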
- - return xc_dom_chk_alloc_pages(dom, "padding", pages); -} - -int xc_dom_alloc_segment(struct xc_dom_image *dom, - struct xc_dom_seg *seg, char *name, - xen_vaddr_t start, xen_vaddr_t size) -{ - unsigned int page_size = XC_DOM_PAGE_SIZE(dom); - xen_pfn_t pages; - void *ptr; - - if ( start && xc_dom_alloc_pad(dom, start) ) - return -1; - - pages = (size + page_size - 1) / page_size; - start = dom->virt_alloc_end; - - seg->pfn = dom->pfn_alloc_end; - seg->pages = pages; - - if ( xc_dom_chk_alloc_pages(dom, name, pages) ) - return -1; - - /* map and clear pages */ - ptr = xc_dom_seg_to_ptr(dom, seg); - if ( ptr == NULL ) - return -1; - memset(ptr, 0, pages * page_size); - - seg->vstart = start; - seg->vend = dom->virt_alloc_end; - - DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " -> 0x%" PRIx64 - " (pfn 0x%" PRIpfn " + 0x%" PRIpfn " pages)", - __FUNCTION__, name, seg->vstart, seg->vend, seg->pfn, pages); - - return 0; -} - -xen_pfn_t xc_dom_alloc_page(struct xc_dom_image *dom, char *name) -{ - xen_vaddr_t start; - xen_pfn_t pfn; - - start = dom->virt_alloc_end; - pfn = dom->pfn_alloc_end - dom->rambase_pfn; - - if ( xc_dom_chk_alloc_pages(dom, name, 1) ) - return INVALID_PFN; - - DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " (pfn 0x%" PRIpfn ")", - __FUNCTION__, name, start, pfn); - return pfn; -} - -void xc_dom_unmap_one(struct xc_dom_image *dom, xen_pfn_t pfn) -{ - unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom); - struct xc_dom_phys *phys, *prev = NULL; - - for ( phys = dom->phys_pages; phys != NULL; phys = phys->next ) - { - if ( (pfn >= phys->first) && (pfn < (phys->first + phys->count)) ) - break; - prev = phys; - } - if ( !phys ) - { - DOMPRINTF("%s: Huh? no mapping with pfn 0x%" PRIpfn "", - __FUNCTION__, pfn); - return; - } - - munmap(phys->ptr, phys->count << page_shift); - if ( prev ) - prev->next = phys->next; - else - dom->phys_pages = phys->next; - - xc_domain_cacheflush(dom->xch, dom->guest_domid, phys->first, phys->count); -} - -void xc_dom_unmap_all(struct xc_dom_image *dom) -{ - while ( dom->phys_pages ) - xc_dom_unmap_one(dom, dom->phys_pages->first); -} - -/* ------------------------------------------------------------------------ */ -/* pluggable kernel loaders */ - -static struct xc_dom_loader *first_loader = NULL; -static struct xc_dom_arch *first_hook = NULL; - -void xc_dom_register_loader(struct xc_dom_loader *loader) -{ - loader->next = first_loader; - first_loader = loader; -} - -static struct xc_dom_loader *xc_dom_find_loader(struct xc_dom_image *dom) -{ - struct xc_dom_loader *loader = first_loader; - - while ( loader != NULL ) - { - DOMPRINTF("%s: trying %s loader ... 
", __FUNCTION__, loader->name); - if ( loader->probe(dom) == 0 ) - { - DOMPRINTF("loader probe OK"); - return loader; - } - DOMPRINTF("loader probe failed"); - loader = loader->next; - } - xc_dom_panic(dom->xch, - XC_INVALID_KERNEL, "%s: no loader found", __FUNCTION__); - return NULL; -} - -void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks) -{ - hooks->next = first_hook; - first_hook = hooks; -} - -int xc_dom_set_arch_hooks(struct xc_dom_image *dom) -{ - struct xc_dom_arch *hooks = first_hook; - - while ( hooks != NULL ) - { - if ( !strcmp(hooks->guest_type, dom->guest_type) ) - { - if ( hooks->arch_private_size ) - { - dom->arch_private = malloc(hooks->arch_private_size); - if ( dom->arch_private == NULL ) - return -1; - memset(dom->arch_private, 0, hooks->arch_private_size); - dom->alloc_malloc += hooks->arch_private_size; - } - dom->arch_hooks = hooks; - return 0; - } - hooks = hooks->next; - } - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: not found (type %s)", __FUNCTION__, dom->guest_type); - return -1; -} - -/* ------------------------------------------------------------------------ */ -/* public interface */ - -void xc_dom_release(struct xc_dom_image *dom) -{ - DOMPRINTF_CALLED(dom->xch); - if ( dom->phys_pages ) - xc_dom_unmap_all(dom); - xc_dom_free_all(dom); - free(dom->arch_private); - free(dom); -} - -struct xc_dom_image *xc_dom_allocate(xc_interface *xch, - const char *cmdline, const char *features) -{ - struct xc_dom_image *dom; - - xc_dom_printf(xch, "%s: cmdline=\"%s\", features=\"%s\"", - __FUNCTION__, cmdline ? cmdline : "", - features ? features : ""); - dom = malloc(sizeof(*dom)); - if ( !dom ) - goto err; - - memset(dom, 0, sizeof(*dom)); - dom->xch = xch; - - dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX; - dom->max_module_size = XC_DOM_DECOMPRESS_MAX; - dom->max_devicetree_size = XC_DOM_DECOMPRESS_MAX; - - if ( cmdline ) - dom->cmdline = xc_dom_strdup(dom, cmdline); - if ( features ) - elf_xen_parse_features(features, dom->f_requested, NULL); - - dom->parms.virt_base = UNSET_ADDR; - dom->parms.virt_entry = UNSET_ADDR; - dom->parms.virt_hypercall = UNSET_ADDR; - dom->parms.virt_hv_start_low = UNSET_ADDR; - dom->parms.elf_paddr_offset = UNSET_ADDR; - dom->parms.p2m_base = UNSET_ADDR; - - dom->flags = SIF_VIRT_P2M_4TOOLS; - - dom->alloc_malloc += sizeof(*dom); - return dom; - - err: - if ( dom ) - xc_dom_release(dom); - return NULL; -} - -int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz) -{ - DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz); - dom->max_kernel_size = sz; - return 0; -} - -int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz) -{ - DOMPRINTF("%s: module_max_size=%zx", __FUNCTION__, sz); - dom->max_module_size = sz; - return 0; -} - -int xc_dom_devicetree_max_size(struct xc_dom_image *dom, size_t sz) -{ - DOMPRINTF("%s: devicetree_max_size=%zx", __FUNCTION__, sz); - dom->max_devicetree_size = sz; - return 0; -} - -int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename) -{ - DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); - dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size, - dom->max_kernel_size); - if ( dom->kernel_blob == NULL ) - return -1; - return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); -} - -int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, const char *cmdline) -{ - unsigned int mod = dom->num_modules++; - - DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); - dom->modules[mod].blob = - xc_dom_malloc_filemap(dom, 
filename, &dom->modules[mod].size, - dom->max_module_size); - - if ( dom->modules[mod].blob == NULL ) - return -1; - - if ( cmdline ) - { - dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); - - if ( dom->modules[mod].cmdline == NULL ) - return -1; - } - else - { - dom->modules[mod].cmdline = NULL; - } - - return 0; -} - -int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename) -{ -#if defined (__arm__) || defined(__aarch64__) - DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename); - dom->devicetree_blob = - xc_dom_malloc_filemap(dom, filename, &dom->devicetree_size, - dom->max_devicetree_size); - - if ( dom->devicetree_blob == NULL ) - return -1; - return 0; -#else - errno = -EINVAL; - return -1; -#endif -} - -int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, size_t memsize) -{ - DOMPRINTF_CALLED(dom->xch); - dom->kernel_blob = (void *)mem; - dom->kernel_size = memsize; - return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size); -} - -int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem, - size_t memsize, const char *cmdline) -{ - unsigned int mod = dom->num_modules++; - - DOMPRINTF_CALLED(dom->xch); - - dom->modules[mod].blob = (void *)mem; - dom->modules[mod].size = memsize; - - if ( cmdline ) - { - dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline); - - if ( dom->modules[mod].cmdline == NULL ) - return -1; - } - else - { - dom->modules[mod].cmdline = NULL; - } - - return 0; -} - -int xc_dom_devicetree_mem(struct xc_dom_image *dom, const void *mem, - size_t memsize) -{ - DOMPRINTF_CALLED(dom->xch); - dom->devicetree_blob = (void *)mem; - dom->devicetree_size = memsize; - return 0; -} - -int xc_dom_parse_image(struct xc_dom_image *dom) -{ - int i; - - DOMPRINTF_CALLED(dom->xch); - - /* parse kernel image */ - dom->kernel_loader = xc_dom_find_loader(dom); - if ( dom->kernel_loader == NULL ) - goto err; - if ( dom->kernel_loader->parser(dom) != 0 ) - goto err; - if ( dom->guest_type == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: guest_type not set", __FUNCTION__); - goto err; - } - - /* check features */ - for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ ) - { - dom->f_active[i] |= dom->f_requested[i]; /* cmd line */ - dom->f_active[i] |= dom->parms.f_required[i]; /* kernel */ - if ( (dom->f_active[i] & dom->parms.f_supported[i]) != - dom->f_active[i] ) - { - xc_dom_panic(dom->xch, XC_INVALID_PARAM, - "%s: unsupported feature requested", __FUNCTION__); - goto err; - } - } - return 0; - - err: - return -1; -} - -int xc_dom_rambase_init(struct xc_dom_image *dom, uint64_t rambase) -{ - dom->rambase_pfn = rambase >> XC_PAGE_SHIFT; - dom->pfn_alloc_end = dom->rambase_pfn; - DOMPRINTF("%s: RAM starts at %"PRI_xen_pfn, - __FUNCTION__, dom->rambase_pfn); - return 0; -} - -int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb) -{ - unsigned int page_shift; - xen_pfn_t nr_pages; - - if ( xc_dom_set_arch_hooks(dom) ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set", - __FUNCTION__); - return -1; - } - - page_shift = XC_DOM_PAGE_SHIFT(dom); - nr_pages = mem_mb << (20 - page_shift); - - DOMPRINTF("%s: mem %d MB, pages 0x%" PRIpfn " pages, %dk each", - __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10)); - dom->total_pages = nr_pages; - - DOMPRINTF("%s: 0x%" PRIpfn " pages", - __FUNCTION__, dom->total_pages); - - return 0; -} - -static int xc_dom_build_module(struct xc_dom_image *dom, unsigned int mod) -{ - size_t unziplen, modulelen; - void *modulemap; - char name[10]; - - if ( 
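/* An aside on the feature check in xc_dom_parse_image above: a kernel is
 * accepted only when every active feature bit is also a supported bit,
 * i.e. f_active must be a subset of f_supported.  As a one-line predicate
 * (a sketch; 'active' and 'supported' stand in for dom->f_active[i] and
 * dom->parms.f_supported[i]):
 *
 *     static int is_subset(uint32_t active, uint32_t supported)
 *     {
 *         return (active & supported) == active;
 *     }
 */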
!dom->modules[mod].seg.vstart ) - unziplen = xc_dom_check_gzip(dom->xch, - dom->modules[mod].blob, dom->modules[mod].size); - else - unziplen = 0; - - modulelen = max(unziplen, dom->modules[mod].size); - if ( dom->max_module_size ) - { - if ( unziplen && modulelen > dom->max_module_size ) - { - modulelen = min(unziplen, dom->modules[mod].size); - if ( unziplen > modulelen ) - unziplen = 0; - } - if ( modulelen > dom->max_module_size ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "module %u image too large", mod); - goto err; - } - } - - snprintf(name, sizeof(name), "module%u", mod); - if ( xc_dom_alloc_segment(dom, &dom->modules[mod].seg, name, - dom->modules[mod].seg.vstart, modulelen) != 0 ) - goto err; - modulemap = xc_dom_seg_to_ptr(dom, &dom->modules[mod].seg); - if ( modulemap == NULL ) - { - DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->modules[%u].seg) => NULL", - __FUNCTION__, mod); - goto err; - } - if ( unziplen ) - { - if ( xc_dom_do_gunzip(dom->xch, dom->modules[mod].blob, dom->modules[mod].size, - modulemap, unziplen) != -1 ) - return 0; - if ( dom->modules[mod].size > modulelen ) - goto err; - } - - /* Fall back to handing over the raw blob. */ - memcpy(modulemap, dom->modules[mod].blob, dom->modules[mod].size); - /* If an unzip attempt was made, the buffer may no longer be all zero. */ - if ( unziplen > dom->modules[mod].size ) - memset(modulemap + dom->modules[mod].size, 0, - unziplen - dom->modules[mod].size); - - return 0; - - err: - return -1; -} - -static int populate_acpi_pages(struct xc_dom_image *dom, - xen_pfn_t *extents, - unsigned int num_pages) -{ - int rc; - xc_interface *xch = dom->xch; - uint32_t domid = dom->guest_domid; - unsigned long idx; - unsigned long first_high_idx = 4UL << (30 - PAGE_SHIFT); /* 4GB */ - - for ( ; num_pages; num_pages--, extents++ ) - { - - if ( xc_domain_populate_physmap(xch, domid, 1, 0, 0, extents) == 1 ) - continue; - - if ( dom->highmem_end ) - { - idx = --dom->highmem_end; - if ( idx == first_high_idx ) - dom->highmem_end = 0; - } - else - { - idx = --dom->lowmem_end; - } - - rc = xc_domain_add_to_physmap(xch, domid, - XENMAPSPACE_gmfn, - idx, *extents); - if ( rc ) - return rc; - } - - return 0; -} - -static int xc_dom_load_acpi(struct xc_dom_image *dom) -{ - int j, i = 0; - unsigned num_pages; - xen_pfn_t *extents, base; - void *ptr; - - while ( (i < MAX_ACPI_MODULES) && dom->acpi_modules[i].length ) - { - DOMPRINTF("%s: %d bytes at address %" PRIx64, __FUNCTION__, - dom->acpi_modules[i].length, - dom->acpi_modules[i].guest_addr_out); - - num_pages = (dom->acpi_modules[i].length + - (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK) + - (XC_PAGE_SIZE - 1)) >> XC_PAGE_SHIFT; - extents = malloc(num_pages * sizeof(*extents)); - if ( !extents ) - { - DOMPRINTF("%s: Out of memory", __FUNCTION__); - goto err; - } - - base = dom->acpi_modules[i].guest_addr_out >> XC_PAGE_SHIFT; - for ( j = 0; j < num_pages; j++ ) - extents[j] = base + j; - if ( populate_acpi_pages(dom, extents, num_pages) ) - { - DOMPRINTF("%s: Can populate ACPI pages", __FUNCTION__); - goto err; - } - - ptr = xc_map_foreign_range(dom->xch, dom->guest_domid, - XC_PAGE_SIZE * num_pages, - PROT_READ | PROT_WRITE, base); - if ( !ptr ) - { - DOMPRINTF("%s: Can't map %d pages at 0x%"PRI_xen_pfn, - __FUNCTION__, num_pages, base); - goto err; - } - - memcpy((uint8_t *)ptr + - (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK), - dom->acpi_modules[i].data, dom->acpi_modules[i].length); - munmap(ptr, XC_PAGE_SIZE * num_pages); - - free(extents); - i++; - } - - return 
0; - -err: - free(extents); - return -1; -} - -int xc_dom_build_image(struct xc_dom_image *dom) -{ - unsigned int page_size; - bool unmapped_initrd; - unsigned int mod; - - DOMPRINTF_CALLED(dom->xch); - - /* check for arch hooks */ - if ( dom->arch_hooks == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set", - __FUNCTION__); - goto err; - } - page_size = XC_DOM_PAGE_SIZE(dom); - if ( dom->parms.virt_base != UNSET_ADDR ) - dom->virt_alloc_end = dom->parms.virt_base; - - /* load kernel */ - if ( xc_dom_alloc_segment(dom, &dom->kernel_seg, "kernel", - dom->kernel_seg.vstart, - dom->kernel_seg.vend - - dom->kernel_seg.vstart) != 0 ) - goto err; - if ( dom->kernel_loader->loader(dom) != 0 ) - goto err; - - /* Don't load ramdisk / other modules now if no initial mapping required. */ - for ( mod = 0; mod < dom->num_modules; mod++ ) - { - unmapped_initrd = (dom->parms.unmapped_initrd && - !dom->modules[mod].seg.vstart); - - if ( dom->modules[mod].blob && !unmapped_initrd ) - { - if ( xc_dom_build_module(dom, mod) != 0 ) - goto err; - - if ( mod == 0 ) - { - dom->initrd_start = dom->modules[mod].seg.vstart; - dom->initrd_len = - dom->modules[mod].seg.vend - dom->modules[mod].seg.vstart; - } - } - } - - /* load devicetree */ - if ( dom->devicetree_blob ) - { - void *devicetreemap; - - if ( xc_dom_alloc_segment(dom, &dom->devicetree_seg, "devicetree", - dom->devicetree_seg.vstart, - dom->devicetree_size) != 0 ) - goto err; - devicetreemap = xc_dom_seg_to_ptr(dom, &dom->devicetree_seg); - if ( devicetreemap == NULL ) - { - DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->devicetree_seg) => NULL", - __FUNCTION__); - goto err; - } - memcpy(devicetreemap, dom->devicetree_blob, dom->devicetree_size); - } - - /* load ACPI tables */ - if ( xc_dom_load_acpi(dom) != 0 ) - goto err; - - /* allocate other pages */ - if ( !dom->arch_hooks->p2m_base_supported || - dom->parms.p2m_base >= dom->parms.virt_base || - (dom->parms.p2m_base & (XC_DOM_PAGE_SIZE(dom) - 1)) ) - dom->parms.p2m_base = UNSET_ADDR; - if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base == UNSET_ADDR && - dom->arch_hooks->alloc_p2m_list(dom) != 0 ) - goto err; - if ( dom->arch_hooks->alloc_magic_pages(dom) != 0 ) - goto err; - if ( dom->arch_hooks->alloc_pgtables && - dom->arch_hooks->alloc_pgtables(dom) != 0 ) - goto err; - if ( dom->alloc_bootstack ) - { - dom->bootstack_pfn = xc_dom_alloc_page(dom, "boot stack"); - if ( dom->bootstack_pfn == INVALID_PFN ) - goto err; - } - - DOMPRINTF("%-20s: virt_alloc_end : 0x%" PRIx64 "", - __FUNCTION__, dom->virt_alloc_end); - DOMPRINTF("%-20s: virt_pgtab_end : 0x%" PRIx64 "", - __FUNCTION__, dom->virt_pgtab_end); - - /* Make sure all memory mapped by initial page tables is available */ - if ( dom->virt_pgtab_end && xc_dom_alloc_pad(dom, dom->virt_pgtab_end) ) - return -1; - - for ( mod = 0; mod < dom->num_modules; mod++ ) - { - unmapped_initrd = (dom->parms.unmapped_initrd && - !dom->modules[mod].seg.vstart); - - /* Load ramdisk / other modules if no initial mapping required. */ - if ( dom->modules[mod].blob && unmapped_initrd ) - { - if ( xc_dom_build_module(dom, mod) != 0 ) - goto err; - - if ( mod == 0 ) - { - dom->flags |= SIF_MOD_START_PFN; - dom->initrd_start = dom->modules[mod].seg.pfn; - dom->initrd_len = page_size * dom->modules[mod].seg.pages; - } - } - } - - /* Allocate p2m list if outside of initial kernel mapping. 
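 * (Editor's note: for 64-bit PV guests the kernel can request exactly this
 * by advertising an init-p2m ELF note; dom->parms.p2m_base then holds the
 * kernel-chosen virtual address, and the test below,
 *
 *     dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base != UNSET_ADDR
 *
 * routes the p2m allocation here instead of into the initial mapping.
 * Summary added for illustration, not part of this patch.)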
*/ - if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base != UNSET_ADDR ) - { - if ( dom->arch_hooks->alloc_p2m_list(dom) != 0 ) - goto err; - dom->p2m_seg.vstart = dom->parms.p2m_base; - } - - return 0; - - err: - return -1; -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_decompress.h b/tools/libxc/xg_dom_decompress.h deleted file mode 100644 index c5ab2e59eb..0000000000 --- a/tools/libxc/xg_dom_decompress.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __MINIOS__ -# include "xenctrl_dom.h" -#else -# include "xg_dom_decompress_unsafe.h" -#endif - -int xc_try_lz4_decode(struct xc_dom_image *dom, void **blob, size_t *size); - diff --git a/tools/libxc/xg_dom_decompress_lz4.c b/tools/libxc/xg_dom_decompress_lz4.c deleted file mode 100644 index 97ba620d86..0000000000 --- a/tools/libxc/xg_dom_decompress_lz4.c +++ /dev/null @@ -1,141 +0,0 @@ -#include -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress.h" - -#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#define likely(a) a -#define unlikely(a) a - -static inline uint_fast16_t le16_to_cpup(const unsigned char *buf) -{ - return buf[0] | (buf[1] << 8); -} - -static inline uint_fast32_t le32_to_cpup(const unsigned char *buf) -{ - return le16_to_cpup(buf) | ((uint32_t)le16_to_cpup(buf + 2) << 16); -} - -#include "../../xen/include/xen/lz4.h" -#include "../../xen/common/decompress.h" - -#ifndef __MINIOS__ - -#include "../../xen/common/lz4/decompress.c" - -#define ARCHIVE_MAGICNUMBER 0x184C2102 - -int xc_try_lz4_decode( - struct xc_dom_image *dom, void **blob, size_t *psize) -{ - int ret = -1; - unsigned char *inp = *blob, *output, *outp; - ssize_t size = *psize - 4; - size_t out_len, dest_len, chunksize; - const char *msg; - - if (size < 4) { - msg = "input too small"; - goto exit_0; - } - - out_len = get_unaligned_le32(inp + size); - if (xc_dom_kernel_check_size(dom, out_len)) { - msg = "Decompressed image too large"; - goto exit_0; - } - - output = malloc(out_len); - if (!output) { - msg = "Could not allocate output buffer"; - goto exit_0; - } - outp = output; - - chunksize = get_unaligned_le32(inp); - if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; - } else { - msg = "invalid header"; - goto exit_2; - } - - for (;;) { - if (size < 4) { - msg = "missing data"; - goto exit_2; - } - chunksize = get_unaligned_le32(inp); - if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; - continue; - } - inp += 4; - size -= 4; - if (chunksize > size) { - msg = "insufficient input data"; - goto exit_2; - } - - dest_len = out_len - (outp - output); - ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, - &dest_len); - if (ret < 0) { - msg = "decoding failed"; - goto exit_2; - } - - ret = -1; - outp += dest_len; - size -= chunksize; - - if (size == 0) - { - if ( xc_dom_register_external(dom, output, out_len) ) - { - msg = "Error registering stream output"; - goto exit_2; - } - *blob = output; - *psize = out_len; - return 0; - } - - if (size < 0) { - msg = "data corrupted"; - goto exit_2; - } - - inp += chunksize; - } - -exit_2: - free(output); -exit_0: - DOMPRINTF("LZ4 decompression error: %s\n", msg); - return ret; -} - -#else /* __MINIOS__ */ - -#include "../../xen/common/unlz4.c" - -int xc_try_lz4_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - return 
xc_dom_decompress_unsafe(unlz4, dom, blob, size); -} - -#endif diff --git a/tools/libxc/xg_dom_decompress_unsafe.c b/tools/libxc/xg_dom_decompress_unsafe.c deleted file mode 100644 index 21d964787d..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe.c +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress_unsafe.h" - -static struct xc_dom_image *unsafe_dom; -static unsigned char *output_blob; -static unsigned int output_size; - -static void unsafe_error(const char *msg) -{ - xc_dom_panic(unsafe_dom->xch, XC_INVALID_KERNEL, "%s", msg); -} - -static int unsafe_flush(void *src, unsigned int size) -{ - void *n = realloc(output_blob, output_size + size); - if (!n) - return -1; - output_blob = n; - - memcpy(&output_blob[output_size], src, size); - output_size += size; - return size; -} - -int xc_dom_decompress_unsafe( - decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size) -{ - int ret; - - unsafe_dom = dom; - output_blob = NULL; - output_size = 0; - - ret = fn(dom->kernel_blob, dom->kernel_size, NULL, unsafe_flush, NULL, NULL, unsafe_error); - - if (ret) - free(output_blob); - else { - *blob = output_blob; - *size = output_size; - } - - return ret; -} diff --git a/tools/libxc/xg_dom_decompress_unsafe.h b/tools/libxc/xg_dom_decompress_unsafe.h deleted file mode 100644 index fb84b6add8..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "xenctrl_dom.h" - -typedef int decompress_fn(unsigned char *inbuf, unsigned int len, - int (*fill)(void*, unsigned int), - int (*flush)(void*, unsigned int), - unsigned char *outbuf, unsigned int *posp, - void (*error)(const char *x)); - -int xc_dom_decompress_unsafe( - decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size) - __attribute__((visibility("internal"))); - -int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size) - __attribute__((visibility("internal"))); -int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size) - __attribute__((visibility("internal"))); -int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size) - __attribute__((visibility("internal"))); -int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size) - __attribute__((visibility("internal"))); diff --git a/tools/libxc/xg_dom_decompress_unsafe_bzip2.c b/tools/libxc/xg_dom_decompress_unsafe_bzip2.c deleted file mode 100644 index 9d3709e6cc..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe_bzip2.c +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress_unsafe.h" - -#include "../../xen/common/bunzip2.c" - -int xc_try_bzip2_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - return xc_dom_decompress_unsafe(bunzip2, dom, blob, size); -} diff --git a/tools/libxc/xg_dom_decompress_unsafe_lzma.c b/tools/libxc/xg_dom_decompress_unsafe_lzma.c deleted file mode 100644 index 5d178f0c43..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe_lzma.c +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress_unsafe.h" - -#include "../../xen/common/unlzma.c" - -int xc_try_lzma_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - return xc_dom_decompress_unsafe(unlzma, dom, blob, size); -} diff --git a/tools/libxc/xg_dom_decompress_unsafe_lzo1x.c b/tools/libxc/xg_dom_decompress_unsafe_lzo1x.c deleted file mode 100644 index 
a4f8ebd42d..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe_lzo1x.c +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress_unsafe.h" - -typedef uint8_t u8; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint64_t u64; - -#define likely(a) a -#define noinline -#define unlikely(a) a - -static inline u16 be16_to_cpup(const u16 *p) -{ - u16 v = *p; -#if BYTE_ORDER == LITTLE_ENDIAN - return (((v & 0x00ffU) << 8) | - ((v & 0xff00U) >> 8)); -#else - return v; -#endif -} - -static inline u32 be32_to_cpup(const u32 *p) -{ - u32 v = *p; -#if BYTE_ORDER == LITTLE_ENDIAN - return (((v & 0x000000ffUL) << 24) | - ((v & 0x0000ff00UL) << 8) | - ((v & 0x00ff0000UL) >> 8) | - ((v & 0xff000000UL) >> 24)); -#else - return v; -#endif -} - -#include "../../xen/common/lzo.c" -#include "../../xen/common/unlzo.c" - -int xc_try_lzo1x_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - return xc_dom_decompress_unsafe(unlzo, dom, blob, size); -} diff --git a/tools/libxc/xg_dom_decompress_unsafe_xz.c b/tools/libxc/xg_dom_decompress_unsafe_xz.c deleted file mode 100644 index ff6824b38d..0000000000 --- a/tools/libxc/xg_dom_decompress_unsafe_xz.c +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "xg_private.h" -#include "xg_dom_decompress_unsafe.h" - -// TODO -#define XZ_DEC_X86 - -typedef char bool_t; -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint32_t __le32; - -static inline u32 cpu_to_le32(const u32 v) -{ -#if BYTE_ORDER == BIG_ENDIAN - return (((v & 0x000000ffUL) << 24) | - ((v & 0x0000ff00UL) << 8) | - ((v & 0x00ff0000UL) >> 8) | - ((v & 0xff000000UL) >> 24)); -#else - return v; -#endif -} - -static inline u32 le32_to_cpup(const u32 *p) -{ - return cpu_to_le32(*p); -} - -#define __force -#define always_inline - -#include "../../xen/common/unxz.c" - -int xc_try_xz_decode( - struct xc_dom_image *dom, void **blob, size_t *size) -{ - return xc_dom_decompress_unsafe(unxz, dom, blob, size); -} diff --git a/tools/libxc/xg_dom_elfloader.c b/tools/libxc/xg_dom_elfloader.c deleted file mode 100644 index 7043c3bbba..0000000000 --- a/tools/libxc/xg_dom_elfloader.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Xen domain builder -- ELF bits. - * - * Parse and load ELF kernel images. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . - * - */ - -#include -#include -#include -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" -#include "xc_bitops.h" - -#define XEN_VER "xen-3.0" - -/* ------------------------------------------------------------------------ */ - -static void log_callback(struct elf_binary *elf, void *caller_data, - bool iserr, const char *fmt, va_list al) { - xc_interface *xch = caller_data; - - xc_reportv(xch, - xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler, - iserr ? XTL_ERROR : XTL_DETAIL, - iserr ? 
XC_INVALID_KERNEL : XC_ERROR_NONE, - fmt, al); -} - -void xc_elf_set_logfile(xc_interface *xch, struct elf_binary *elf, - int verbose) { - elf_set_log(elf, log_callback, xch, verbose /* convert to bool */); -} - -/* ------------------------------------------------------------------------ */ - -static char *xc_dom_guest_type(struct xc_dom_image *dom, - struct elf_binary *elf) -{ - uint64_t machine = elf_uval(elf, elf->ehdr, e_machine); - - if ( dom->container_type == XC_DOM_HVM_CONTAINER && - dom->parms.phys_entry != UNSET_ADDR32 ) - return "hvm-3.0-x86_32"; - if ( dom->container_type == XC_DOM_HVM_CONTAINER ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: image not capable of booting inside a HVM container", - __FUNCTION__); - return NULL; - } - - switch ( machine ) - { - case EM_386: - switch ( dom->parms.pae ) - { - case XEN_PAE_BIMODAL: - if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") ) - return "xen-3.0-x86_32p"; - return "xen-3.0-x86_32"; - case XEN_PAE_EXTCR3: - case XEN_PAE_YES: - return "xen-3.0-x86_32p"; - case XEN_PAE_NO: - default: - return "xen-3.0-x86_32"; - } - case EM_X86_64: - return "xen-3.0-x86_64"; - default: - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, - "%s: unknown image type %"PRIu64, - __FUNCTION__, machine); - return NULL; - } -} - -/* ------------------------------------------------------------------------ */ -/* parse elf binary */ - -static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose) -{ - if ( dom->kernel_blob == NULL ) - { - if ( verbose ) - xc_dom_panic(dom->xch, - XC_INTERNAL_ERROR, "%s: no kernel image loaded", - __FUNCTION__); - return -EINVAL; - } - - if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) ) - { - if ( verbose ) - xc_dom_panic(dom->xch, - XC_INVALID_KERNEL, "%s: kernel is not an ELF image", - __FUNCTION__); - return -EINVAL; - } - return 0; -} - -static elf_negerrnoval xc_dom_probe_elf_kernel(struct xc_dom_image *dom) -{ - struct elf_binary elf; - int rc; - - rc = check_elf_kernel(dom, 0); - if ( rc != 0 ) - return rc; - - rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size); - if ( rc != 0 ) - return rc; - - /* - * We need to check that it contains Xen ELFNOTES, - * or else we might be trying to load a plain ELF. - */ - elf_parse_binary(&elf); - rc = elf_xen_parse(&elf, &dom->parms); - if ( rc != 0 ) - return rc; - - return 0; -} - -static elf_negerrnoval xc_dom_parse_elf_kernel(struct xc_dom_image *dom) -{ - struct elf_binary *elf; - elf_negerrnoval rc; - - rc = check_elf_kernel(dom, 1); - if ( rc != 0 ) - return rc; - - elf = xc_dom_malloc(dom, sizeof(*elf)); - if ( elf == NULL ) - return -ENOMEM; - dom->private_loader = elf; - rc = elf_init(elf, dom->kernel_blob, dom->kernel_size) != 0 ? 
-EINVAL : 0; - xc_elf_set_logfile(dom->xch, elf, 1); - if ( rc != 0 ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image", - __FUNCTION__); - return rc; - } - - /* parse binary and get xen meta info */ - elf_parse_binary(elf); - if ( elf_xen_parse(elf, &dom->parms) != 0 ) - { - rc = -EINVAL; - goto out; - } - - if ( elf_xen_feature_get(XENFEAT_dom0, dom->parms.f_required) ) - { - xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Kernel does not" - " support unprivileged (DomU) operation", __FUNCTION__); - rc = -EINVAL; - goto out; - } - - /* find kernel segment */ - dom->kernel_seg.vstart = dom->parms.virt_kstart; - dom->kernel_seg.vend = dom->parms.virt_kend; - - dom->guest_type = xc_dom_guest_type(dom, elf); - if ( dom->guest_type == NULL ) - return -EINVAL; - DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "", - __FUNCTION__, dom->guest_type, - dom->kernel_seg.vstart, dom->kernel_seg.vend); - rc = 0; -out: - if ( elf_check_broken(elf) ) - DOMPRINTF("%s: ELF broken: %s", __FUNCTION__, - elf_check_broken(elf)); - - return rc; -} - -static elf_errorstatus xc_dom_load_elf_kernel(struct xc_dom_image *dom) -{ - struct elf_binary *elf = dom->private_loader; - elf_errorstatus rc; - xen_pfn_t pages; - - elf->dest_base = xc_dom_seg_to_ptr_pages(dom, &dom->kernel_seg, &pages); - if ( elf->dest_base == NULL ) - { - DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom,dom->kernel_seg)" - " => NULL", __FUNCTION__); - return -1; - } - elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom); - - rc = elf_load_binary(elf); - if ( rc < 0 ) - { - DOMPRINTF("%s: failed to load elf binary", __FUNCTION__); - return rc; - } - return 0; -} - -/* ------------------------------------------------------------------------ */ - -struct xc_dom_loader elf_loader = { - .name = "ELF-generic", - .probe = xc_dom_probe_elf_kernel, - .parser = xc_dom_parse_elf_kernel, - .loader = xc_dom_load_elf_kernel, -}; - -static void __init register_loader(void) -{ - xc_dom_register_loader(&elf_loader); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_hvmloader.c b/tools/libxc/xg_dom_hvmloader.c deleted file mode 100644 index 995a0f3dc3..0000000000 --- a/tools/libxc/xg_dom_hvmloader.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Xen domain builder -- HVM specific bits. - * - * Parse and load ELF firmware images for HVM domains. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
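(Editor's aside on the loader interface deleted above: kernel-image formats plug into the domain builder through struct xc_dom_loader and a constructor-time xc_dom_register_loader() call; probe must return 0 to claim the image, after which parser and loader run in that order. A minimal sketch of a hypothetical extra loader follows -- the "raw-example" name and its parse/load callbacks are invented for illustration and are not part of this patch:

    static elf_negerrnoval xc_dom_probe_raw(struct xc_dom_image *dom)
    {
        /* Claim any non-empty kernel blob; real probes are stricter. */
        return (dom->kernel_blob != NULL && dom->kernel_size > 0) ? 0 : -EINVAL;
    }

    static struct xc_dom_loader raw_loader = {
        .name   = "raw-example",
        .probe  = xc_dom_probe_raw,
        .parser = xc_dom_parse_raw,  /* hypothetical: set dom->guest_type etc. */
        .loader = xc_dom_load_raw,   /* hypothetical: copy blob into kernel_seg */
    };

    static void __init register_raw_loader(void)
    {
        xc_dom_register_loader(&raw_loader);  /* prepends to the probe list */
    }

Registration prepends to the list, so loaders registered later are probed first. End of aside.)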
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "xc_bitops.h"
-
-/* ------------------------------------------------------------------------ */
-/* parse elf binary */
-
-static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose)
-{
-    if ( dom->kernel_blob == NULL )
-    {
-        if ( verbose )
-            xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
-                         "%s: no kernel image loaded", __func__);
-        return -EINVAL;
-    }
-
-    if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) )
-    {
-        if ( verbose )
-            xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
-                         "%s: kernel is not an ELF image", __func__);
-        return -EINVAL;
-    }
-    return 0;
-}
-
-static elf_negerrnoval xc_dom_probe_hvm_kernel(struct xc_dom_image *dom)
-{
-    struct elf_binary elf;
-    int rc;
-
-    /* This loader is designed for HVM guest firmware. */
-    if ( dom->container_type != XC_DOM_HVM_CONTAINER )
-        return -EINVAL;
-
-    rc = check_elf_kernel(dom, 0);
-    if ( rc != 0 )
-        return rc;
-
-    rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size);
-    if ( rc != 0 )
-        return rc;
-
-    /*
-     * We need to check that there are no Xen ELFNOTES, or
-     * else we might be trying to load a PV kernel.
-     */
-    elf_parse_binary(&elf);
-    rc = elf_xen_parse(&elf, &dom->parms);
-    if ( rc == 0 )
-        return -EINVAL;
-
-    return 0;
-}
-
-static elf_errorstatus xc_dom_parse_hvm_kernel(struct xc_dom_image *dom)
-    /*
-     * This function sometimes returns -1 for error and sometimes
-     * an errno value. ?!?!
-     */
-{
-    struct elf_binary *elf;
-    elf_errorstatus rc;
-
-    rc = check_elf_kernel(dom, 1);
-    if ( rc != 0 )
-        return rc;
-
-    elf = xc_dom_malloc(dom, sizeof(*elf));
-    if ( elf == NULL )
-        return -1;
-    dom->private_loader = elf;
-    rc = elf_init(elf, dom->kernel_blob, dom->kernel_size);
-    xc_elf_set_logfile(dom->xch, elf, 1);
-    if ( rc != 0 )
-    {
-        xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image",
-                     __func__);
-        return rc;
-    }
-
-    if ( !elf_32bit(elf) )
-    {
-        xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: ELF image is not 32bit",
-                     __func__);
-        return -EINVAL;
-    }
-
-    /* parse binary and get xen meta info */
-    elf_parse_binary(elf);
-
-    /* find kernel segment */
-    dom->kernel_seg.vstart = elf->pstart;
-    dom->kernel_seg.vend = elf->pend;
-
-    dom->guest_type = "hvm-3.0-x86_32";
-
-    if ( elf_check_broken(elf) )
-        DOMPRINTF("%s: ELF broken: %s", __func__, elf_check_broken(elf));
-
-    return rc;
-}
-
-static int module_init_one(struct xc_dom_image *dom,
-                           struct xc_hvm_firmware_module *module,
-                           char *name)
-{
-    struct xc_dom_seg seg;
-    void *dest;
-
-    if ( module->length && !module->guest_addr_out )
-    {
-        if ( xc_dom_alloc_segment(dom, &seg, name, 0, module->length) )
-            goto err;
-        dest = xc_dom_seg_to_ptr(dom, &seg);
-        if ( dest == NULL )
-        {
-            DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &seg) => NULL",
-                      __FUNCTION__);
-            goto err;
-        }
-        memcpy(dest, module->data, module->length);
-        module->guest_addr_out = seg.vstart;
-
-        assert(dom->mmio_start > 0 && dom->mmio_start < UINT32_MAX);
-        if ( module->guest_addr_out > dom->mmio_start ||
-             module->guest_addr_out + module->length > dom->mmio_start )
-        {
-            DOMPRINTF("%s: Module %s would be loaded above 4GB",
-                      __FUNCTION__, name);
-            goto err;
-        }
-    }
-
-    return 0;
-err:
-    return -1;
-}
-
-static int modules_init(struct xc_dom_image
*dom) -{ - int rc; - - rc = module_init_one(dom, &dom->system_firmware_module, - "System Firmware module"); - if ( rc ) goto err; - /* Only one module can be added */ - rc = module_init_one(dom, &dom->acpi_modules[0], "ACPI module"); - if ( rc ) goto err; - rc = module_init_one(dom, &dom->smbios_module, "SMBIOS module"); - if ( rc ) goto err; - - return 0; -err: - return -1; -} - -static elf_errorstatus xc_dom_load_hvm_kernel(struct xc_dom_image *dom) -{ - struct elf_binary *elf = dom->private_loader; - privcmd_mmap_entry_t *entries = NULL; - size_t pages = (elf->pend - elf->pstart + PAGE_SIZE - 1) >> PAGE_SHIFT; - elf_errorstatus rc; - int i; - - /* Map address space for initial elf image. */ - entries = calloc(pages, sizeof(privcmd_mmap_entry_t)); - if ( entries == NULL ) - return -ENOMEM; - - for ( i = 0; i < pages; i++ ) - entries[i].mfn = (elf->pstart >> PAGE_SHIFT) + i; - - elf->dest_base = xc_map_foreign_ranges( - dom->xch, dom->guest_domid, pages << PAGE_SHIFT, - PROT_READ | PROT_WRITE, 1 << PAGE_SHIFT, - entries, pages); - if ( elf->dest_base == NULL ) - { - DOMPRINTF("%s: unable to map guest memory space", __func__); - rc = -EFAULT; - goto error; - } - - elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom); - - rc = elf_load_binary(elf); - if ( rc < 0 ) - { - DOMPRINTF("%s: failed to load elf binary", __func__); - goto error; - } - - munmap(elf->dest_base, elf->dest_size); - - rc = modules_init(dom); - if ( rc != 0 ) - { - DOMPRINTF("%s: unable to load modules.", __func__); - goto error; - } - - dom->parms.phys_entry = elf_uval(elf, elf->ehdr, e_entry); - - free(entries); - return 0; - - error: - assert(rc != 0); - free(entries); - return rc; -} - -/* ------------------------------------------------------------------------ */ - -struct xc_dom_loader hvm_loader = { - .name = "HVM-generic", - .probe = xc_dom_probe_hvm_kernel, - .parser = xc_dom_parse_hvm_kernel, - .loader = xc_dom_load_hvm_kernel, -}; - -static void __init register_loader(void) -{ - xc_dom_register_loader(&hvm_loader); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_dom_x86.c b/tools/libxc/xg_dom_x86.c deleted file mode 100644 index 842dbcccdd..0000000000 --- a/tools/libxc/xg_dom_x86.c +++ /dev/null @@ -1,1945 +0,0 @@ -/* - * Xen domain builder -- i386 and x86_64 bits. - * - * Most architecture-specific code for x86 goes here. - * - prepare page tables. - * - fill architecture-specific structs. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * written 2006 by Gerd Hoffmann . 
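 * (Editor's worked example, illustrative only: xc_dom_load_hvm_kernel()
 * above maps the firmware's physical footprint with one privcmd entry per
 * page, using the usual round-up-to-page idiom. For an image with
 * pstart = 0x100000 and pend = 0x102001 (invented numbers):
 *
 *     pages = (0x102001 - 0x100000 + 0xfff) >> 12   ->  3
 *
 * so three entries are populated before elf_load_binary() writes the
 * segments into the mapping.)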
- * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "xg_private.h" -#include "xenctrl_dom.h" -#include "xenctrl.h" - -/* ------------------------------------------------------------------------ */ - -#define SUPERPAGE_BATCH_SIZE 512 - -#define SUPERPAGE_2MB_SHIFT 9 -#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) -#define SUPERPAGE_1GB_SHIFT 18 -#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) - -#define X86_CR0_PE 0x01 -#define X86_CR0_ET 0x10 - -#define X86_DR6_DEFAULT 0xffff0ff0u -#define X86_DR7_DEFAULT 0x00000400u - -#define MTRR_TYPE_WRBACK 6 -#define MTRR_DEF_TYPE_ENABLE (1u << 11) - -#define SPECIALPAGE_PAGING 0 -#define SPECIALPAGE_ACCESS 1 -#define SPECIALPAGE_SHARING 2 -#define SPECIALPAGE_BUFIOREQ 3 -#define SPECIALPAGE_XENSTORE 4 -#define SPECIALPAGE_IOREQ 5 -#define SPECIALPAGE_IDENT_PT 6 -#define SPECIALPAGE_CONSOLE 7 -#define special_pfn(x) \ - (X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x)) - -#define NR_IOREQ_SERVER_PAGES 8 -#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x)) - -#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1) -#define round_down(addr, mask) ((addr) & ~(mask)) -#define round_up(addr, mask) ((addr) | (mask)) -#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1)) - -#define HVMLOADER_MODULE_MAX_COUNT 2 -#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE - -struct xc_dom_params { - unsigned levels; - xen_vaddr_t vaddr_mask; - x86_pgentry_t lvl_prot[4]; -}; - -struct xc_dom_x86_mapping_lvl { - xen_vaddr_t from; - xen_vaddr_t to; - xen_pfn_t pfn; - unsigned int pgtables; -}; - -struct xc_dom_x86_mapping { - struct xc_dom_x86_mapping_lvl area; - struct xc_dom_x86_mapping_lvl lvls[4]; -}; - -struct xc_dom_image_x86 { - unsigned n_mappings; -#define MAPPING_MAX 2 - struct xc_dom_x86_mapping maps[MAPPING_MAX]; - const struct xc_dom_params *params; - - /* PV: Pointer to the in-guest P2M. 
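 * (Editor's note: this pointer is set by alloc_p2m_list() further down and
 * consumed by the setup_pgtables_*() helpers, which copy the dom->pv_p2m[]
 * array into the guest at the guest's own width -- for a 32-bit guest,
 * roughly:
 *
 *     uint32_t *p2m_guest = domx86->p2m_guest;
 *     p2m_guest[pfn] = (dom->pv_p2m[pfn] != INVALID_PFN) ? dom->pv_p2m[pfn] : -1;
 *
 * an illustrative condensation of the copy loops below, not code from this
 * patch.)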
*/ - void *p2m_guest; -}; - -/* get guest IO ABI protocol */ -const char *xc_domain_get_native_protocol(xc_interface *xch, - uint32_t domid) -{ - int ret; - uint32_t guest_width; - const char *protocol; - - ret = xc_domain_get_guest_width(xch, domid, &guest_width); - - if ( ret ) - return NULL; - - switch (guest_width) { - case 4: /* 32 bit guest */ - protocol = XEN_IO_PROTO_ABI_X86_32; - break; - case 8: /* 64 bit guest */ - protocol = XEN_IO_PROTO_ABI_X86_64; - break; - default: - protocol = NULL; - } - - return protocol; -} - -static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from, - xen_vaddr_t to, xen_pfn_t pfn) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map, *map_cmp; - xen_pfn_t pfn_end; - xen_vaddr_t mask; - unsigned bits; - int l, m; - - if ( domx86->n_mappings == MAPPING_MAX ) - { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "%s: too many mappings\n", __FUNCTION__); - return -ENOMEM; - } - map = domx86->maps + domx86->n_mappings; - - pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86); - if ( pfn_end >= dom->p2m_size ) - { - xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY, - "%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")", - __FUNCTION__, pfn_end, dom->p2m_size); - return -ENOMEM; - } - for ( m = 0; m < domx86->n_mappings; m++ ) - { - map_cmp = domx86->maps + m; - if ( from < map_cmp->area.to && to > map_cmp->area.from ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: overlapping mappings\n", __FUNCTION__); - return -EINVAL; - } - } - - memset(map, 0, sizeof(*map)); - map->area.from = from & domx86->params->vaddr_mask; - map->area.to = to & domx86->params->vaddr_mask; - - for ( l = domx86->params->levels - 1; l >= 0; l-- ) - { - map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables; - if ( l == domx86->params->levels - 1 ) - { - /* Top level page table in first mapping only. 
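 * (Editor's worked example, with invented numbers: for a 4-level 64-bit
 * guest mapping 512 MiB starting at virt_base 0, each L1 table covers
 * 2 MiB (bits = 12 + 9 = 21), so the per-level formula below,
 *
 *     pgtables = ((to - from) >> bits) + 1;   // (0x1fffffff >> 21) + 1 = 256
 *
 * yields 256 L1 tables, plus one table each at L2, L3 and L4: 259 pages of
 * page tables in total.)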
*/ - if ( domx86->n_mappings == 0 ) - { - map->lvls[l].from = 0; - map->lvls[l].to = domx86->params->vaddr_mask; - map->lvls[l].pgtables = 1; - map->area.pgtables++; - } - continue; - } - - bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86; - mask = bits_to_mask(bits); - map->lvls[l].from = map->area.from & ~mask; - map->lvls[l].to = map->area.to | mask; - - if ( domx86->params->levels == PGTBL_LEVELS_I386 && - domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 ) - { - DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__); - map->lvls[l].to = domx86->params->vaddr_mask; - } - - for ( m = 0; m < domx86->n_mappings; m++ ) - { - map_cmp = domx86->maps + m; - if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to ) - continue; - if ( map->lvls[l].from >= map_cmp->lvls[l].from && - map->lvls[l].to <= map_cmp->lvls[l].to ) - { - map->lvls[l].from = 0; - map->lvls[l].to = 0; - break; - } - assert(map->lvls[l].from >= map_cmp->lvls[l].from || - map->lvls[l].to <= map_cmp->lvls[l].to); - if ( map->lvls[l].from >= map_cmp->lvls[l].from && - map->lvls[l].from <= map_cmp->lvls[l].to ) - map->lvls[l].from = map_cmp->lvls[l].to + 1; - if ( map->lvls[l].to >= map_cmp->lvls[l].from && - map->lvls[l].to <= map_cmp->lvls[l].to ) - map->lvls[l].to = map_cmp->lvls[l].from - 1; - } - if ( map->lvls[l].from < map->lvls[l].to ) - map->lvls[l].pgtables = - ((map->lvls[l].to - map->lvls[l].from) >> bits) + 1; - DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64 - ", %d table(s)", __FUNCTION__, mask, bits, - map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables); - map->area.pgtables += map->lvls[l].pgtables; - } - - return 0; -} - -static int alloc_pgtables_pv(struct xc_dom_image *dom) -{ - int pages, extra_pages; - xen_vaddr_t try_virt_end; - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings; - - extra_pages = dom->alloc_bootstack ? 1 : 0; - extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */ - pages = extra_pages; - for ( ; ; ) - { - try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86, - bits_to_mask(22)); /* 4MB alignment */ - - if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) ) - return -1; - - pages = map->area.pgtables + extra_pages; - if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 ) - break; - } - map->area.pfn = 0; - domx86->n_mappings++; - dom->virt_pgtab_end = try_virt_end + 1; - - return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0, - map->area.pgtables * PAGE_SIZE_X86); -} - -/* ------------------------------------------------------------------------ */ -/* i386 pagetables */ - -static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom) -{ - static const struct xc_dom_params x86_32_params = { - .levels = PGTBL_LEVELS_I386, - .vaddr_mask = bits_to_mask(VIRT_BITS_I386), - .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED, - /* - * 64bit Xen runs 32bit PV guests with the PAE entries in an L3 - * pagetable. They don't behave exactly like native PAE paging. - */ - .lvl_prot[1 ... 
2] = - _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER, - }; - struct xc_dom_image_x86 *domx86 = dom->arch_private; - - domx86->params = &x86_32_params; - - return alloc_pgtables_pv(dom); -} - -#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86) -#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86)) - -/* - * Move the l3 page table page below 4G for guests which do not - * support the extended-cr3 format. The l3 is currently empty so we - * do not need to preserve the current contents. - */ -static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom, - xen_pfn_t l3pfn, - xen_pfn_t l3mfn) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - uint32_t *p2m_guest = domx86->p2m_guest; - xen_pfn_t new_l3mfn; - struct xc_mmu *mmu; - void *l3tab; - - mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid); - if ( mmu == NULL ) - { - DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__); - return l3mfn; - } - - xc_dom_unmap_one(dom, l3pfn); - - new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn); - if ( !new_l3mfn ) - goto out; - - p2m_guest[l3pfn] = dom->pv_p2m[l3pfn] = new_l3mfn; - - if ( xc_add_mmu_update(dom->xch, mmu, - (((unsigned long long)new_l3mfn) - << XC_DOM_PAGE_SHIFT(dom)) | - MMU_MACHPHYS_UPDATE, l3pfn) ) - goto out; - - if ( xc_flush_mmu_updates(dom->xch, mmu) ) - goto out; - - /* - * This ensures that the entire pgtables_seg is mapped by a single - * mmap region. arch_setup_bootlate() relies on this to be able to - * unmap and pin the pagetables. - */ - if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL ) - goto out; - - l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1); - if ( l3tab == NULL ) - { - DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL", - __FUNCTION__); - goto out; /* our one call site will call xc_dom_panic and fail */ - } - memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom)); - - DOMPRINTF("%s: successfully relocated L3 below 4G. 
" - "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")", - __FUNCTION__, l3pfn, l3mfn, new_l3mfn); - - l3mfn = new_l3mfn; - - out: - free(mmu); - - return l3mfn; -} - -static x86_pgentry_t *get_pg_table(struct xc_dom_image *dom, int m, int l) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map; - x86_pgentry_t *pg; - - map = domx86->maps + m; - pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0); - if ( pg ) - return pg; - - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: xc_dom_pfn_to_ptr failed", __FUNCTION__); - return NULL; -} - -static x86_pgentry_t get_pg_prot(struct xc_dom_image *dom, int l, xen_pfn_t pfn) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map; - xen_pfn_t pfn_s, pfn_e; - x86_pgentry_t prot; - unsigned m; - - prot = domx86->params->lvl_prot[l]; - if ( l > 0 ) - return prot; - - for ( m = 0; m < domx86->n_mappings; m++ ) - { - map = domx86->maps + m; - pfn_s = map->lvls[domx86->params->levels - 1].pfn; - pfn_e = map->area.pgtables + pfn_s; - if ( pfn >= pfn_s && pfn < pfn_e ) - return prot & ~_PAGE_RW; - } - - return prot; -} - -static int setup_pgtables_pv(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map1, *map2; - struct xc_dom_x86_mapping_lvl *lvl; - xen_vaddr_t from, to; - xen_pfn_t pfn, p, p_s, p_e; - x86_pgentry_t *pg; - unsigned m1, m2; - int l; - - for ( l = domx86->params->levels - 1; l >= 0; l-- ) - for ( m1 = 0; m1 < domx86->n_mappings; m1++ ) - { - map1 = domx86->maps + m1; - from = map1->lvls[l].from; - to = map1->lvls[l].to; - pg = get_pg_table(dom, m1, l); - if ( !pg ) - return -1; - for ( m2 = 0; m2 < domx86->n_mappings; m2++ ) - { - map2 = domx86->maps + m2; - lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area; - if ( l > 0 && lvl->pgtables == 0 ) - continue; - if ( lvl->from >= to || lvl->to <= from ) - continue; - p_s = (max(from, lvl->from) - from) >> - (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86); - p_e = (min(to, lvl->to) - from) >> - (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86); - pfn = ((max(from, lvl->from) - lvl->from) >> - (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) + lvl->pfn; - for ( p = p_s; p <= p_e; p++ ) - { - pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) | - get_pg_prot(dom, l, pfn); - pfn++; - } - } - } - - return 0; -} - -static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - uint32_t *p2m_guest = domx86->p2m_guest; - xen_pfn_t l3mfn, l3pfn, i; - - /* Copy dom->pv_p2m[] into the guest. */ - for ( i = 0; i < dom->p2m_size; ++i ) - { - if ( dom->pv_p2m[i] != INVALID_PFN ) - p2m_guest[i] = dom->pv_p2m[i]; - else - p2m_guest[i] = -1; - } - - l3pfn = domx86->maps[0].lvls[2].pfn; - l3mfn = xc_dom_p2m(dom, l3pfn); - if ( dom->parms.pae == XEN_PAE_YES ) - { - if ( l3mfn >= 0x100000 ) - l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn); - - if ( l3mfn >= 0x100000 ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,"%s: cannot move L3" - " below 4G. extended-cr3 not supported by guest. 
" - "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn")", - __FUNCTION__, l3pfn, l3mfn); - return -EINVAL; - } - } - - return setup_pgtables_pv(dom); -} - -/* ------------------------------------------------------------------------ */ -/* x86_64 pagetables */ - -static int alloc_pgtables_x86_64(struct xc_dom_image *dom) -{ - const static struct xc_dom_params x86_64_params = { - .levels = PGTBL_LEVELS_X86_64, - .vaddr_mask = bits_to_mask(VIRT_BITS_X86_64), - .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED, - .lvl_prot[1 ... 3] = - _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER, - }; - struct xc_dom_image_x86 *domx86 = dom->arch_private; - - domx86->params = &x86_64_params; - - return alloc_pgtables_pv(dom); -} - -static int setup_pgtables_x86_64(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - uint64_t *p2m_guest = domx86->p2m_guest; - xen_pfn_t i; - - /* Copy dom->pv_p2m[] into the guest. */ - for ( i = 0; i < dom->p2m_size; ++i ) - { - if ( dom->pv_p2m[i] != INVALID_PFN ) - p2m_guest[i] = dom->pv_p2m[i]; - else - p2m_guest[i] = -1; - } - - return setup_pgtables_pv(dom); -} - -/* ------------------------------------------------------------------------ */ - -static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - - if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach", - 0, p2m_alloc_size) ) - return -1; - - domx86->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg); - if ( domx86->p2m_guest == NULL ) - return -1; - - return 0; -} - -static int alloc_p2m_list_x86_32(struct xc_dom_image *dom) -{ - size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn; - - p2m_alloc_size = round_pg_up(p2m_alloc_size); - return alloc_p2m_list(dom, p2m_alloc_size); -} - -static int alloc_p2m_list_x86_64(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings; - size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn; - xen_vaddr_t from, to; - unsigned lvl; - - p2m_alloc_size = round_pg_up(p2m_alloc_size); - if ( dom->parms.p2m_base != UNSET_ADDR ) - { - from = dom->parms.p2m_base; - to = from + p2m_alloc_size - 1; - if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) ) - return -1; - - map->area.pfn = dom->pfn_alloc_end; - for ( lvl = 0; lvl < 4; lvl++ ) - map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86; - domx86->n_mappings++; - p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86; - } - - return alloc_p2m_list(dom, p2m_alloc_size); -} - -/* ------------------------------------------------------------------------ */ - -static int alloc_magic_pages_pv(struct xc_dom_image *dom) -{ - dom->start_info_pfn = xc_dom_alloc_page(dom, "start info"); - if ( dom->start_info_pfn == INVALID_PFN ) - return -1; - - dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore"); - if ( dom->xenstore_pfn == INVALID_PFN ) - return -1; - xc_clear_domain_page(dom->xch, dom->guest_domid, - xc_dom_p2m(dom, dom->xenstore_pfn)); - - dom->console_pfn = xc_dom_alloc_page(dom, "console"); - if ( dom->console_pfn == INVALID_PFN ) - return -1; - xc_clear_domain_page(dom->xch, dom->guest_domid, - xc_dom_p2m(dom, dom->console_pfn)); - - dom->alloc_bootstack = 1; - - return 0; -} - -static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom) -{ - struct hvm_info_table *hvm_info = (struct hvm_info_table *) - (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET); - uint8_t sum; - int i; 
- - memset(hvm_info_page, 0, PAGE_SIZE); - - /* Fill in the header. */ - memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature)); - hvm_info->length = sizeof(struct hvm_info_table); - - /* Sensible defaults: these can be overridden by the caller. */ - hvm_info->apic_mode = 1; - hvm_info->nr_vcpus = 1; - memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online)); - - /* Memory parameters. */ - hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT; - hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT; - hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0); - - /* Finish with the checksum. */ - for ( i = 0, sum = 0; i < hvm_info->length; i++ ) - sum += ((uint8_t *)hvm_info)[i]; - hvm_info->checksum = -sum; -} - -static int alloc_magic_pages_hvm(struct xc_dom_image *dom) -{ - unsigned long i; - uint32_t *ident_pt, domid = dom->guest_domid; - int rc; - xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES]; - xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES]; - xc_interface *xch = dom->xch; - size_t start_info_size = sizeof(struct hvm_start_info); - - /* Allocate and clear special pages. */ - for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ ) - special_array[i] = special_pfn(i); - - rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES, - 0, 0, special_array); - if ( rc != 0 ) - { - DOMPRINTF("Could not allocate special pages."); - goto error_out; - } - - if ( xc_clear_domain_pages(xch, domid, special_pfn(0), - X86_HVM_NR_SPECIAL_PAGES) ) - goto error_out; - - xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN, - special_pfn(SPECIALPAGE_XENSTORE)); - xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN, - special_pfn(SPECIALPAGE_BUFIOREQ)); - xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN, - special_pfn(SPECIALPAGE_IOREQ)); - xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN, - special_pfn(SPECIALPAGE_CONSOLE)); - xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN, - special_pfn(SPECIALPAGE_PAGING)); - xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN, - special_pfn(SPECIALPAGE_ACCESS)); - xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN, - special_pfn(SPECIALPAGE_SHARING)); - - start_info_size += - sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; - - start_info_size += - HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT; - - start_info_size += - dom->e820_entries * sizeof(struct hvm_memmap_table_entry); - - if ( !dom->device_model ) - { - if ( dom->cmdline ) - { - dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8); - start_info_size += dom->cmdline_size; - } - } - else - { - /* - * Allocate and clear additional ioreq server pages. The default - * server will use the IOREQ and BUFIOREQ special pages above. 
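 * (Editor's note: a device model later looks these allocations up through
 * the HVM-param interface, e.g.
 *
 *     uint64_t pfn;
 *     xc_hvm_param_get(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN, &pfn);
 *
 * shown for illustration; the lookup itself is not part of this patch.)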
- */ - for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ ) - ioreq_server_array[i] = ioreq_server_pfn(i); - - rc = xc_domain_populate_physmap_exact(xch, domid, NR_IOREQ_SERVER_PAGES, 0, - 0, ioreq_server_array); - if ( rc != 0 ) - { - DOMPRINTF("Could not allocate ioreq server pages."); - goto error_out; - } - - if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0), - NR_IOREQ_SERVER_PAGES) ) - goto error_out; - - /* Tell the domain where the pages are and how many there are */ - xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN, - ioreq_server_pfn(0)); - xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES, - NR_IOREQ_SERVER_PAGES); - } - - rc = xc_dom_alloc_segment(dom, &dom->start_info_seg, - "HVM start info", 0, start_info_size); - if ( rc != 0 ) - { - DOMPRINTF("Unable to reserve memory for the start info"); - goto out; - } - - /* - * Identity-map page table is required for running with CR0.PG=0 when - * using Intel EPT. Create a 32-bit non-PAE page directory of superpages. - */ - if ( (ident_pt = xc_map_foreign_range( - xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE, - special_pfn(SPECIALPAGE_IDENT_PT))) == NULL ) - goto error_out; - for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ ) - ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - munmap(ident_pt, PAGE_SIZE); - xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT, - special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT); - - dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE); - xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn); - - dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE); - xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn); - - dom->parms.virt_hypercall = -1; - - rc = 0; - goto out; - error_out: - rc = -1; - out: - - return rc; -} - -/* ------------------------------------------------------------------------ */ - -static int start_info_x86_32(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - start_info_x86_32_t *start_info = - xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1); - xen_pfn_t shinfo = - xc_dom_translated(dom) ? 
dom->shared_info_pfn : dom->shared_info_mfn; - - DOMPRINTF_CALLED(dom->xch); - - if ( start_info == NULL ) - { - DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__); - return -1; /* our caller throws away our return value :-/ */ - } - - memset(start_info, 0, sizeof(*start_info)); - strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic)); - start_info->magic[sizeof(start_info->magic) - 1] = '\0'; - start_info->nr_pages = dom->total_pages; - start_info->shared_info = shinfo << PAGE_SHIFT_X86; - start_info->pt_base = dom->pgtables_seg.vstart; - start_info->nr_pt_frames = domx86->maps[0].area.pgtables; - start_info->mfn_list = dom->p2m_seg.vstart; - - start_info->flags = dom->flags; - start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn); - start_info->store_evtchn = dom->xenstore_evtchn; - start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); - start_info->console.domU.evtchn = dom->console_evtchn; - - if ( dom->modules[0].blob ) - { - start_info->mod_start = dom->initrd_start; - start_info->mod_len = dom->initrd_len; - } - - if ( dom->cmdline ) - { - strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE); - start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0'; - } - - return 0; -} - -static int start_info_x86_64(struct xc_dom_image *dom) -{ - struct xc_dom_image_x86 *domx86 = dom->arch_private; - start_info_x86_64_t *start_info = - xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1); - xen_pfn_t shinfo = - xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn; - - DOMPRINTF_CALLED(dom->xch); - - if ( start_info == NULL ) - { - DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__); - return -1; /* our caller throws away our return value :-/ */ - } - - memset(start_info, 0, sizeof(*start_info)); - strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic)); - start_info->magic[sizeof(start_info->magic) - 1] = '\0'; - start_info->nr_pages = dom->total_pages; - start_info->shared_info = shinfo << PAGE_SHIFT_X86; - start_info->pt_base = dom->pgtables_seg.vstart; - start_info->nr_pt_frames = domx86->maps[0].area.pgtables; - start_info->mfn_list = dom->p2m_seg.vstart; - if ( dom->parms.p2m_base != UNSET_ADDR ) - { - start_info->first_p2m_pfn = dom->p2m_seg.pfn; - start_info->nr_p2m_frames = dom->p2m_seg.pages; - } - - start_info->flags = dom->flags; - start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn); - start_info->store_evtchn = dom->xenstore_evtchn; - start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn); - start_info->console.domU.evtchn = dom->console_evtchn; - - if ( dom->modules[0].blob ) - { - start_info->mod_start = dom->initrd_start; - start_info->mod_len = dom->initrd_len; - } - - if ( dom->cmdline ) - { - strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE); - start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0'; - } - - return 0; -} - -static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr) -{ - shared_info_x86_32_t *shared_info = ptr; - int i; - - DOMPRINTF_CALLED(dom->xch); - - memset(shared_info, 0, sizeof(*shared_info)); - for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) - shared_info->vcpu_info[i].evtchn_upcall_mask = 1; - return 0; -} - -static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr) -{ - shared_info_x86_64_t *shared_info = ptr; - int i; - - DOMPRINTF_CALLED(dom->xch); - - memset(shared_info, 0, sizeof(*shared_info)); - for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) - shared_info->vcpu_info[i].evtchn_upcall_mask = 1; - 
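    /*
     * Editor's note (illustrative sketch, not part of this patch):
     * starting every vCPU with evtchn_upcall_mask = 1 keeps event-channel
     * upcalls disabled until the guest is ready; a guest typically unmasks
     * once its handlers are installed, along the lines of:
     *
     *     vcpu->evtchn_upcall_mask = 0;
     *     wmb();
     *     if ( vcpu->evtchn_upcall_pending )
     *         force_evtchn_callback();   // hypothetical guest-side helper
     */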
return 0; -} - -/* ------------------------------------------------------------------------ */ - -static int vcpu_x86_32(struct xc_dom_image *dom) -{ - vcpu_guest_context_any_t any_ctx; - vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32; - xen_pfn_t cr3_pfn; - int rc; - - DOMPRINTF_CALLED(dom->xch); - - /* clear everything */ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->user_regs.eip = dom->parms.virt_entry; - ctxt->user_regs.esp = - dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86; - ctxt->user_regs.esi = - dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86; - ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */ - - ctxt->debugreg[6] = X86_DR6_DEFAULT; - ctxt->debugreg[7] = X86_DR7_DEFAULT; - - ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32; - if ( dom->parms.pae == XEN_PAE_EXTCR3 || - dom->parms.pae == XEN_PAE_BIMODAL ) - ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3); - - cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn); - ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn); - DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "", - __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn); - - ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32; - ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32; - ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32; - ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32; - ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32; - ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32; - - ctxt->kernel_ss = ctxt->user_regs.ss; - ctxt->kernel_sp = ctxt->user_regs.esp; - - rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); - - return rc; -} - -static int vcpu_x86_64(struct xc_dom_image *dom) -{ - vcpu_guest_context_any_t any_ctx; - vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64; - xen_pfn_t cr3_pfn; - int rc; - - DOMPRINTF_CALLED(dom->xch); - - /* clear everything */ - memset(ctxt, 0, sizeof(*ctxt)); - - ctxt->user_regs.rip = dom->parms.virt_entry; - ctxt->user_regs.rsp = - dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86; - ctxt->user_regs.rsi = - dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86; - ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */ - - ctxt->debugreg[6] = X86_DR6_DEFAULT; - ctxt->debugreg[7] = X86_DR7_DEFAULT; - - ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64; - cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn); - ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn); - DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "", - __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn); - - ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64; - ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64; - ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64; - ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64; - ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64; - ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64; - - ctxt->kernel_ss = ctxt->user_regs.ss; - ctxt->kernel_sp = ctxt->user_regs.esp; - - rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc); - - return rc; -} - -const static void *hvm_get_save_record(const void *ctx, unsigned int type, - unsigned int instance) -{ - const struct hvm_save_descriptor *header; - - for ( header = ctx; - header->typecode != HVM_SAVE_CODE(END); - ctx += sizeof(*header) + header->length, header = ctx ) - if ( header->typecode == type && header->instance == instance ) - return ctx + 
sizeof(*header); - - return NULL; -} - -static int vcpu_hvm(struct xc_dom_image *dom) -{ - struct { - struct hvm_save_descriptor header_d; - HVM_SAVE_TYPE(HEADER) header; - struct hvm_save_descriptor cpu_d; - HVM_SAVE_TYPE(CPU) cpu; - struct hvm_save_descriptor end_d; - HVM_SAVE_TYPE(END) end; - } bsp_ctx; - uint8_t *full_ctx = NULL; - int rc; - - DOMPRINTF_CALLED(dom->xch); - - assert(dom->max_vcpus); - - /* - * Get the full HVM context in order to have the header, it is not - * possible to get the header with getcontext_partial, and crafting one - * from userspace is also not an option since cpuid is trapped and - * modified by Xen. - */ - - rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0); - if ( rc <= 0 ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: unable to fetch HVM context size (rc=%d)", - __func__, rc); - goto out; - } - - full_ctx = calloc(1, rc); - if ( full_ctx == NULL ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: unable to allocate memory for HVM context (rc=%d)", - __func__, rc); - rc = -ENOMEM; - goto out; - } - - rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc); - if ( rc <= 0 ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: unable to fetch HVM context (rc=%d)", - __func__, rc); - goto out; - } - - /* Copy the header to our partial context. */ - memset(&bsp_ctx, 0, sizeof(bsp_ctx)); - memcpy(&bsp_ctx, full_ctx, - sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER)); - - /* Set the CPU descriptor. */ - bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU); - bsp_ctx.cpu_d.instance = 0; - bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU); - - /* Set the cached part of the relevant segment registers. */ - bsp_ctx.cpu.cs_base = 0; - bsp_ctx.cpu.ds_base = 0; - bsp_ctx.cpu.es_base = 0; - bsp_ctx.cpu.ss_base = 0; - bsp_ctx.cpu.tr_base = 0; - bsp_ctx.cpu.cs_limit = ~0u; - bsp_ctx.cpu.ds_limit = ~0u; - bsp_ctx.cpu.es_limit = ~0u; - bsp_ctx.cpu.ss_limit = ~0u; - bsp_ctx.cpu.tr_limit = 0x67; - bsp_ctx.cpu.cs_arbytes = 0xc9b; - bsp_ctx.cpu.ds_arbytes = 0xc93; - bsp_ctx.cpu.es_arbytes = 0xc93; - bsp_ctx.cpu.ss_arbytes = 0xc93; - bsp_ctx.cpu.tr_arbytes = 0x8b; - - /* Set the control registers. */ - bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET; - - /* Set the IP. */ - bsp_ctx.cpu.rip = dom->parms.phys_entry; - - bsp_ctx.cpu.dr6 = X86_DR6_DEFAULT; - bsp_ctx.cpu.dr7 = X86_DR7_DEFAULT; - - if ( dom->start_info_seg.pfn ) - bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT; - - /* Set the end descriptor. */ - bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END); - bsp_ctx.end_d.instance = 0; - bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END); - - /* TODO: maybe this should be a firmware option instead? */ - if ( !dom->device_model ) - { - struct { - struct hvm_save_descriptor header_d; - HVM_SAVE_TYPE(HEADER) header; - struct hvm_save_descriptor mtrr_d; - HVM_SAVE_TYPE(MTRR) mtrr; - struct hvm_save_descriptor end_d; - HVM_SAVE_TYPE(END) end; - } mtrr = { - .header_d = bsp_ctx.header_d, - .header = bsp_ctx.header, - .mtrr_d.typecode = HVM_SAVE_CODE(MTRR), - .mtrr_d.length = HVM_SAVE_LENGTH(MTRR), - .end_d = bsp_ctx.end_d, - .end = bsp_ctx.end, - }; - const HVM_SAVE_TYPE(MTRR) *mtrr_record = - hvm_get_save_record(full_ctx, HVM_SAVE_CODE(MTRR), 0); - unsigned int i; - - if ( !mtrr_record ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: unable to get MTRR save record", __func__); - goto out; - } - - memcpy(&mtrr.mtrr, mtrr_record, sizeof(mtrr.mtrr)); - - /* - * Enable MTRR, set default type to WB. 
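 * (Worked example, relying only on the architectural IA32_MTRR_DEF_TYPE
 * layout - default type in bits 7:0, enable in bit 11 - so that
 * MTRR_TYPE_WRBACK == 6 and MTRR_DEF_TYPE_ENABLE == (1 << 11):
 *
 *     msr_mtrr_def_type = MTRR_TYPE_WRBACK | MTRR_DEF_TYPE_ENABLE
 *                       = 0x006           | 0x800
 *                       = 0x806;
 *
 * Every guest access is then cached write-back until variable-range
 * MTRRs are programmed to say otherwise.)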
- * TODO: add MMIO areas as UC when passthrough is supported. - */ - mtrr.mtrr.msr_mtrr_def_type = MTRR_TYPE_WRBACK | MTRR_DEF_TYPE_ENABLE; - - for ( i = 0; i < dom->max_vcpus; i++ ) - { - mtrr.mtrr_d.instance = i; - rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid, - (uint8_t *)&mtrr, sizeof(mtrr)); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc); - } - } - - /* - * Loading the BSP context should be done in the last call to setcontext, - * since each setcontext call will put all vCPUs down. - */ - rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid, - (uint8_t *)&bsp_ctx, sizeof(bsp_ctx)); - if ( rc != 0 ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc); - - out: - free(full_ctx); - return rc; -} - -/* ------------------------------------------------------------------------ */ - -static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type) -{ - static const struct { - char *guest; - uint32_t size; - } types[] = { - { "xen-3.0-x86_32p", 32 }, - { "xen-3.0-x86_64", 64 }, - }; - DECLARE_DOMCTL; - int i,rc; - - memset(&domctl, 0, sizeof(domctl)); - domctl.domain = domid; - domctl.cmd = XEN_DOMCTL_set_address_size; - for ( i = 0; i < ARRAY_SIZE(types); i++ ) - if ( !strcmp(types[i].guest, guest_type) ) - domctl.u.address_size.size = types[i].size; - if ( domctl.u.address_size.size == 0 ) - /* nothing to do */ - return 0; - - xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__, - guest_type, domctl.u.address_size.size); - rc = do_domctl(xch, &domctl); - if ( rc != 0 ) - xc_dom_printf(xch, "%s: warning: failed (rc=%d)", - __FUNCTION__, rc); - return rc; -} - -static int meminit_pv(struct xc_dom_image *dom) -{ - int rc; - xen_pfn_t pfn, allocsz, mfn, total, pfn_base; - int i, j, k; - xen_vmemrange_t dummy_vmemrange[1]; - unsigned int dummy_vnode_to_pnode[1]; - xen_vmemrange_t *vmemranges; - unsigned int *vnode_to_pnode; - unsigned int nr_vmemranges, nr_vnodes; - - rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type); - if ( rc ) - return rc; - - /* try to claim pages for early warning of insufficient memory avail */ - if ( dom->claim_enabled ) - { - rc = xc_domain_claim_pages(dom->xch, dom->guest_domid, - dom->total_pages); - if ( rc ) - return rc; - } - - /* Setup dummy vNUMA information if it's not provided. Note - * that this is a valid state if libxl doesn't provide any - * vNUMA information. - * - * The dummy values make libxc allocate all pages from - * arbitrary physical nodes. This is the expected behaviour if - * no vNUMA configuration is provided to libxc. - * - * Note that the following hunk is just for the convenience of - * allocation code. No defaulting happens in libxc. 
- */ - if ( dom->nr_vmemranges == 0 ) - { - nr_vmemranges = 1; - vmemranges = dummy_vmemrange; - vmemranges[0].start = 0; - vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT; - vmemranges[0].flags = 0; - vmemranges[0].nid = 0; - - nr_vnodes = 1; - vnode_to_pnode = dummy_vnode_to_pnode; - vnode_to_pnode[0] = XC_NUMA_NO_NODE; - } - else - { - nr_vmemranges = dom->nr_vmemranges; - nr_vnodes = dom->nr_vnodes; - vmemranges = dom->vmemranges; - vnode_to_pnode = dom->vnode_to_pnode; - } - - total = dom->p2m_size = 0; - for ( i = 0; i < nr_vmemranges; i++ ) - { - total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT); - dom->p2m_size = max(dom->p2m_size, - (xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT)); - } - if ( total != dom->total_pages ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")", - __func__, total, dom->total_pages); - return -EINVAL; - } - - dom->pv_p2m = xc_dom_malloc(dom, sizeof(*dom->pv_p2m) * dom->p2m_size); - if ( dom->pv_p2m == NULL ) - return -EINVAL; - for ( pfn = 0; pfn < dom->p2m_size; pfn++ ) - dom->pv_p2m[pfn] = INVALID_PFN; - - /* allocate guest memory */ - for ( i = 0; i < nr_vmemranges; i++ ) - { - unsigned int memflags; - uint64_t pages, super_pages; - unsigned int pnode = vnode_to_pnode[vmemranges[i].nid]; - xen_pfn_t extents[SUPERPAGE_BATCH_SIZE]; - xen_pfn_t pfn_base_idx; - - memflags = 0; - if ( pnode != XC_NUMA_NO_NODE ) - memflags |= XENMEMF_exact_node(pnode); - - pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT; - super_pages = pages >> SUPERPAGE_2MB_SHIFT; - pfn_base = vmemranges[i].start >> PAGE_SHIFT; - - for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ ) - dom->pv_p2m[pfn] = pfn; - - pfn_base_idx = pfn_base; - while ( super_pages ) { - uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE); - super_pages -= count; - - for ( pfn = pfn_base_idx, j = 0; - pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT); - pfn += SUPERPAGE_2MB_NR_PFNS, j++ ) - extents[j] = dom->pv_p2m[pfn]; - rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count, - SUPERPAGE_2MB_SHIFT, memflags, - extents); - if ( rc < 0 ) - return rc; - - /* Expand the returned mfns into the p2m array. */ - pfn = pfn_base_idx; - for ( j = 0; j < rc; j++ ) - { - mfn = extents[j]; - for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ ) - dom->pv_p2m[pfn] = mfn + k; - } - pfn_base_idx = pfn; - } - - for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz ) - { - allocsz = min_t(uint64_t, 1024 * 1024, pages - j); - rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid, - allocsz, 0, memflags, &dom->pv_p2m[pfn_base + j]); - - if ( rc ) - { - if ( pnode != XC_NUMA_NO_NODE ) - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)", - __func__, pages, i, pnode); - else - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: failed to allocate 0x%"PRIx64" pages", - __func__, pages); - return rc; - } - } - rc = 0; - } - - /* Ensure no unclaimed pages are left unused. - * OK to call if hadn't done the earlier claim call. */ - xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */); - - return rc; -} - -/* - * Check whether there exists mmio hole in the specified memory range. - * Returns 1 if exists, else returns 0. 
- */
-static int check_mmio_hole(uint64_t start, uint64_t memsize,
-                           uint64_t mmio_start, uint64_t mmio_size)
-{
-    if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size )
-        return 0;
-    else
-        return 1;
-}
-
-static int meminit_hvm(struct xc_dom_image *dom)
-{
-    unsigned long i, vmemid, nr_pages = dom->total_pages;
-    unsigned long p2m_size;
-    unsigned long target_pages = dom->target_pages;
-    unsigned long cur_pages, cur_pfn;
-    int rc;
-    unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
-        stat_1gb_pages = 0;
-    unsigned int memflags = 0;
-    int claim_enabled = dom->claim_enabled;
-    uint64_t total_pages;
-    xen_vmemrange_t dummy_vmemrange[2];
-    unsigned int dummy_vnode_to_pnode[1];
-    xen_vmemrange_t *vmemranges;
-    unsigned int *vnode_to_pnode;
-    unsigned int nr_vmemranges, nr_vnodes;
-    xc_interface *xch = dom->xch;
-    uint32_t domid = dom->guest_domid;
-
-    if ( nr_pages > target_pages )
-        memflags |= XENMEMF_populate_on_demand;
-
-    if ( dom->nr_vmemranges == 0 )
-    {
-        /* Build dummy vnode information
-         *
-         * Guest physical address space layout:
-         * [0, hole_start) [hole_start, 4G) [4G, highmem_end)
-         *
-         * Of course if there is no high memory, the second vmemrange
-         * has no effect on the actual result.
-         */
-
-        dummy_vmemrange[0].start = 0;
-        dummy_vmemrange[0].end   = dom->lowmem_end;
-        dummy_vmemrange[0].flags = 0;
-        dummy_vmemrange[0].nid   = 0;
-        nr_vmemranges = 1;
-
-        if ( dom->highmem_end > (1ULL << 32) )
-        {
-            dummy_vmemrange[1].start = 1ULL << 32;
-            dummy_vmemrange[1].end   = dom->highmem_end;
-            dummy_vmemrange[1].flags = 0;
-            dummy_vmemrange[1].nid   = 0;
-
-            nr_vmemranges++;
-        }
-
-        dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
-        nr_vnodes = 1;
-        vmemranges = dummy_vmemrange;
-        vnode_to_pnode = dummy_vnode_to_pnode;
-    }
-    else
-    {
-        if ( nr_pages > target_pages )
-        {
-            DOMPRINTF("Cannot enable vNUMA and PoD at the same time");
-            goto error_out;
-        }
-
-        nr_vmemranges = dom->nr_vmemranges;
-        nr_vnodes = dom->nr_vnodes;
-        vmemranges = dom->vmemranges;
-        vnode_to_pnode = dom->vnode_to_pnode;
-    }
-
-    total_pages = 0;
-    p2m_size = 0;
-    for ( i = 0; i < nr_vmemranges; i++ )
-    {
-        DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64,
-                  vmemranges[i].start, vmemranges[i].end);
-
-        total_pages += ((vmemranges[i].end - vmemranges[i].start)
-                        >> PAGE_SHIFT);
-        p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
-            p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
-    }
-
-    if ( total_pages != nr_pages )
-    {
-        DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)",
-                  total_pages, nr_pages);
-        goto error_out;
-    }
-
-    dom->p2m_size = p2m_size;
-
-    /*
-     * Try to claim pages for early warning of insufficient memory available.
-     * This should go before xc_domain_set_pod_target, because that function
-     * actually allocates memory for the guest. Claiming after memory has been
-     * allocated is pointless.
-     */
-    if ( claim_enabled ) {
-        rc = xc_domain_claim_pages(xch, domid,
-                                   target_pages - dom->vga_hole_size);
-        if ( rc != 0 )
-        {
-            DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!");
-            goto error_out;
-        }
-    }
-
-    if ( memflags & XENMEMF_populate_on_demand )
-    {
-        /*
-         * Subtract VGA_HOLE_SIZE from target_pages for the VGA
-         * "hole". Xen will adjust the PoD cache size so that domain
-         * tot_pages will be target_pages - VGA_HOLE_SIZE after
-         * this call.
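         *
         * Illustrative numbers only: maxmem 4GiB with target 2GiB gives
         * nr_pages = 0x100000 and target_pages = 0x80000, so
         * XENMEMF_populate_on_demand was set above and the PoD target
         * below becomes 0x80000 minus dom->vga_hole_size (0x20 pages
         * when a device model is in use); the remaining 2GiB of the p2m
         * stays unpopulated until the guest's balloon driver returns the
         * difference.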
-         */
-        rc = xc_domain_set_pod_target(xch, domid,
-                                      target_pages - dom->vga_hole_size,
-                                      NULL, NULL, NULL);
-        if ( rc != 0 )
-        {
-            DOMPRINTF("Could not set PoD target for HVM guest.\n");
-            goto error_out;
-        }
-    }
-
-    /*
-     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     *
-     * We attempt to allocate 1GB pages if possible. It falls back on 2MB
-     * pages if 1GB allocation fails. 4KB pages will be used eventually if
-     * both fail.
-     */
-    if ( dom->device_model )
-    {
-        xen_pfn_t extents[0xa0];
-
-        for ( i = 0; i < ARRAY_SIZE(extents); ++i )
-            extents[i] = i;
-
-        rc = xc_domain_populate_physmap_exact(
-            xch, domid, 0xa0, 0, memflags, extents);
-        if ( rc != 0 )
-        {
-            DOMPRINTF("Could not populate low memory (< 0xA0).\n");
-            goto error_out;
-        }
-    }
-
-    stat_normal_pages = 0;
-    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
-    {
-        unsigned int new_memflags = memflags;
-        uint64_t end_pages;
-        unsigned int vnode = vmemranges[vmemid].nid;
-        unsigned int pnode = vnode_to_pnode[vnode];
-
-        if ( pnode != XC_NUMA_NO_NODE )
-            new_memflags |= XENMEMF_exact_node(pnode);
-
-        end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
-        /*
-         * Consider the VGA hole as belonging to the vmemrange that covers
-         * 0xA0000-0xC0000. Note that 0x00000-0xA0000 is populated just
-         * before this loop.
-         */
-        if ( vmemranges[vmemid].start == 0 && dom->device_model )
-        {
-            cur_pages = 0xc0;
-            stat_normal_pages += 0xc0;
-        }
-        else
-            cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;
-
-        rc = 0;
-        while ( (rc == 0) && (end_pages > cur_pages) )
-        {
-            /* Clip count to maximum 1GB extent. */
-            unsigned long count = end_pages - cur_pages;
-            unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
-
-            if ( count > max_pages )
-                count = max_pages;
-
-            cur_pfn = cur_pages;
-
-            /* Take care of the corner cases of superpage tails */
-            if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
-                 (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
-                count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
-            else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
-                      (count > SUPERPAGE_1GB_NR_PFNS) )
-                count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
-
-            /* Attempt to allocate a 1GB super page. Because in each pass
-             * we only allocate at most 1GB, we don't have to clip
-             * super page boundaries.
-             */
-            if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
-                 /* Check if there exists MMIO hole in the 1GB memory
-                  * range */
-                 !check_mmio_hole(cur_pfn << PAGE_SHIFT,
-                                  SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
-                                  dom->mmio_start, dom->mmio_size) )
-            {
-                long done;
-                unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
-                xen_pfn_t sp_extents[nr_extents];
-
-                for ( i = 0; i < nr_extents; i++ )
-                    sp_extents[i] = cur_pages + (i << SUPERPAGE_1GB_SHIFT);
-
-                done = xc_domain_populate_physmap(xch, domid, nr_extents,
-                                                  SUPERPAGE_1GB_SHIFT,
-                                                  new_memflags, sp_extents);
-
-                if ( done > 0 )
-                {
-                    stat_1gb_pages += done;
-                    done <<= SUPERPAGE_1GB_SHIFT;
-                    cur_pages += done;
-                    count -= done;
-                }
-            }
-
-            if ( count != 0 )
-            {
-                /* Clip count to maximum 8MB extent. */
-                max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
-                if ( count > max_pages )
-                    count = max_pages;
-
-                /* Clip partial superpage extents to superpage
-                 * boundaries. */
-                if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
-                     (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
-                    count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
-                else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
-                          (count > SUPERPAGE_2MB_NR_PFNS) )
-                    count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
-
-                /* Attempt to allocate superpage extents.
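                 *
                 * The clipping arithmetic above is easiest to check with
                 * numbers: with SUPERPAGE_2MB_NR_PFNS == 512 and
                 * cur_pfn == 499,
                 *
                 *     -cur_pfn & (SUPERPAGE_2MB_NR_PFNS - 1) == -499 & 511
                 *                                            == 13,
                 *
                 * so count is clipped to 13 single pages, which lands
                 * cur_pfn exactly on the 2MB boundary at pfn 512; from
                 * then on whole 512-pfn extents can be handed to
                 * xc_domain_populate_physmap() in one go.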
*/ - if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 ) - { - long done; - unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT; - xen_pfn_t sp_extents[nr_extents]; - - for ( i = 0; i < nr_extents; i++ ) - sp_extents[i] = cur_pages + (i << SUPERPAGE_2MB_SHIFT); - - done = xc_domain_populate_physmap(xch, domid, nr_extents, - SUPERPAGE_2MB_SHIFT, - new_memflags, sp_extents); - - if ( done > 0 ) - { - stat_2mb_pages += done; - done <<= SUPERPAGE_2MB_SHIFT; - cur_pages += done; - count -= done; - } - } - } - - /* Fall back to 4kB extents. */ - if ( count != 0 ) - { - xen_pfn_t extents[count]; - - for ( i = 0; i < count; ++i ) - extents[i] = cur_pages + i; - - rc = xc_domain_populate_physmap_exact( - xch, domid, count, 0, new_memflags, extents); - cur_pages += count; - stat_normal_pages += count; - } - } - - if ( rc != 0 ) - { - DOMPRINTF("Could not allocate memory for HVM guest."); - goto error_out; - } - } - - DPRINTF("PHYSICAL MEMORY ALLOCATION:\n"); - DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages); - DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages); - DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages); - - rc = 0; - goto out; - error_out: - rc = -1; - out: - - /* ensure no unclaimed pages are left unused */ - xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */); - - return rc; -} - -/* ------------------------------------------------------------------------ */ - -static int bootearly(struct xc_dom_image *dom) -{ - if ( dom->container_type == XC_DOM_PV_CONTAINER && - elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active) ) - { - DOMPRINTF("PV Autotranslate guests no longer supported"); - errno = EOPNOTSUPP; - return -1; - } - - return 0; -} - -static int bootlate_pv(struct xc_dom_image *dom) -{ - static const struct { - char *guest; - unsigned long pgd_type; - } types[] = { - { "xen-3.0-x86_32", MMUEXT_PIN_L2_TABLE}, - { "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE}, - { "xen-3.0-x86_64", MMUEXT_PIN_L4_TABLE}, - }; - unsigned long pgd_type = 0; - shared_info_t *shared_info; - xen_pfn_t shinfo; - int i, rc; - - for ( i = 0; i < ARRAY_SIZE(types); i++ ) - if ( !strcmp(types[i].guest, dom->guest_type) ) - pgd_type = types[i].pgd_type; - - /* Drop references to all initial page tables before pinning. 
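     *
     * (Xen will not validate a frame as a page table while the toolstack
     * still holds a writable foreign mapping of it, so the unmaps below
     * must come first.) A sketch of what the pin_table() helper, defined
     * later in xg_private.c, boils down to for a PAE guest, where
     * pgd_type is MMUEXT_PIN_L3_TABLE:
     *
     *     struct mmuext_op op = {
     *         .cmd      = MMUEXT_PIN_L3_TABLE,
     *         .arg1.mfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn),
     *     };
     *     rc = xc_mmuext_op(dom->xch, &op, 1, dom->guest_domid);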
*/ - xc_dom_unmap_one(dom, dom->pgtables_seg.pfn); - xc_dom_unmap_one(dom, dom->p2m_seg.pfn); - rc = pin_table(dom->xch, pgd_type, - xc_dom_p2m(dom, dom->pgtables_seg.pfn), - dom->guest_domid); - if ( rc != 0 ) - { - xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, - "%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)", - __FUNCTION__, dom->pgtables_seg.pfn, rc); - return rc; - } - shinfo = dom->shared_info_mfn; - - /* setup shared_info page */ - DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "", - __FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn); - shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid, - PAGE_SIZE_X86, - PROT_READ | PROT_WRITE, - shinfo); - if ( shared_info == NULL ) - return -1; - dom->arch_hooks->shared_info(dom, shared_info); - munmap(shared_info, PAGE_SIZE_X86); - - return 0; -} - -/* - * The memory layout of the start_info page and the modules, and where the - * addresses are stored: - * - * /----------------------------------\ - * | struct hvm_start_info | - * +----------------------------------+ <- start_info->modlist_paddr - * | struct hvm_modlist_entry[0] | - * +----------------------------------+ - * | struct hvm_modlist_entry[1] | - * +----------------------------------+ <- modlist[0].cmdline_paddr - * | cmdline of module 0 | - * | char[HVMLOADER_MODULE_NAME_SIZE] | - * +----------------------------------+ <- modlist[1].cmdline_paddr - * | cmdline of module 1 | - * +----------------------------------+ - */ -static void add_module_to_list(struct xc_dom_image *dom, - struct xc_hvm_firmware_module *module, - const char *cmdline, - struct hvm_modlist_entry *modlist, - struct hvm_start_info *start_info) -{ - uint32_t index = start_info->nr_modules; - void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT; - uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + - ((uintptr_t)modlist - (uintptr_t)start_info); - uint64_t modules_cmdline_paddr = modlist_paddr + - sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT; - - if ( module->length == 0 ) - return; - - assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT); - - modlist[index].paddr = module->guest_addr_out; - modlist[index].size = module->length; - - if ( cmdline ) - { - assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE) - < HVMLOADER_MODULE_CMDLINE_SIZE); - strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index, - cmdline, HVMLOADER_MODULE_CMDLINE_SIZE); - modlist[index].cmdline_paddr = modules_cmdline_paddr + - HVMLOADER_MODULE_CMDLINE_SIZE * index; - } - - start_info->nr_modules++; -} - -static int bootlate_hvm(struct xc_dom_image *dom) -{ - uint32_t domid = dom->guest_domid; - xc_interface *xch = dom->xch; - struct hvm_start_info *start_info; - size_t modsize; - struct hvm_modlist_entry *modlist; - struct hvm_memmap_table_entry *memmap; - unsigned int i; - - start_info = xc_map_foreign_range(xch, domid, dom->start_info_seg.pages << - XC_DOM_PAGE_SHIFT(dom), - PROT_READ | PROT_WRITE, - dom->start_info_seg.pfn); - if ( start_info == NULL ) - { - DOMPRINTF("Unable to map HVM start info page"); - return -1; - } - - modlist = (void*)(start_info + 1) + dom->cmdline_size; - - if ( !dom->device_model ) - { - if ( dom->cmdline ) - { - char *cmdline = (void*)(start_info + 1); - - strncpy(cmdline, dom->cmdline, dom->cmdline_size); - start_info->cmdline_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + - ((uintptr_t)cmdline - (uintptr_t)start_info); - } - - /* ACPI module 0 is the RSDP */ - start_info->rsdp_paddr = 
dom->acpi_modules[0].guest_addr_out ? : 0; - } - else - { - add_module_to_list(dom, &dom->system_firmware_module, "firmware", - modlist, start_info); - } - - for ( i = 0; i < dom->num_modules; i++ ) - { - struct xc_hvm_firmware_module mod; - uint64_t base = dom->parms.virt_base != UNSET_ADDR ? - dom->parms.virt_base : 0; - - mod.guest_addr_out = - dom->modules[i].seg.vstart - base; - mod.length = - dom->modules[i].seg.vend - dom->modules[i].seg.vstart; - - DOMPRINTF("Adding module %u guest_addr %"PRIx64" len %u", - i, mod.guest_addr_out, mod.length); - - add_module_to_list(dom, &mod, dom->modules[i].cmdline, - modlist, start_info); - } - - if ( start_info->nr_modules ) - { - start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + - ((uintptr_t)modlist - (uintptr_t)start_info); - } - - /* - * Check a couple of XEN_HVM_MEMMAP_TYPEs to verify consistency with - * their corresponding e820 numerical values. - */ - BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_RAM != E820_RAM); - BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_ACPI != E820_ACPI); - - modsize = HVMLOADER_MODULE_MAX_COUNT * - (sizeof(*modlist) + HVMLOADER_MODULE_CMDLINE_SIZE); - memmap = (void*)modlist + modsize; - - start_info->memmap_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) + - ((uintptr_t)modlist - (uintptr_t)start_info) + modsize; - start_info->memmap_entries = dom->e820_entries; - for ( i = 0; i < dom->e820_entries; i++ ) - { - memmap[i].addr = dom->e820[i].addr; - memmap[i].size = dom->e820[i].size; - memmap[i].type = dom->e820[i].type; - } - - start_info->magic = XEN_HVM_START_MAGIC_VALUE; - start_info->version = 1; - - munmap(start_info, dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom)); - - if ( dom->device_model ) - { - void *hvm_info_page; - - if ( (hvm_info_page = xc_map_foreign_range( - xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE, - HVM_INFO_PFN)) == NULL ) - return -1; - build_hvm_info(hvm_info_page, dom); - munmap(hvm_info_page, PAGE_SIZE); - } - - return 0; -} - -bool xc_dom_translated(const struct xc_dom_image *dom) -{ - /* HVM guests are translated. PV guests are not. 
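 *
 * "Translated" means Xen itself maintains the guest-physical to machine
 * mapping, so tools can use pfns directly; for PV the mapping lives in the
 * pv_p2m array built by meminit_pv(). A simplified sketch of how the
 * xc_dom_p2m() helper used throughout this file consumes this predicate
 * (not the verbatim implementation):
 *
 *     xen_pfn_t xc_dom_p2m(struct xc_dom_image *dom, xen_pfn_t pfn)
 *     {
 *         if ( xc_dom_translated(dom) )
 *             return pfn;              ...HVM: identity...
 *         if ( pfn >= dom->p2m_size )
 *             return INVALID_MFN;
 *         return dom->pv_p2m[pfn];     ...PV: real mfn lookup...
 *     }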
*/ - return dom->container_type == XC_DOM_HVM_CONTAINER; -} - -/* ------------------------------------------------------------------------ */ - -static struct xc_dom_arch xc_dom_32_pae = { - .guest_type = "xen-3.0-x86_32p", - .native_protocol = XEN_IO_PROTO_ABI_X86_32, - .page_shift = PAGE_SHIFT_X86, - .sizeof_pfn = 4, - .p2m_base_supported = 0, - .arch_private_size = sizeof(struct xc_dom_image_x86), - .alloc_magic_pages = alloc_magic_pages_pv, - .alloc_pgtables = alloc_pgtables_x86_32_pae, - .alloc_p2m_list = alloc_p2m_list_x86_32, - .setup_pgtables = setup_pgtables_x86_32_pae, - .start_info = start_info_x86_32, - .shared_info = shared_info_x86_32, - .vcpu = vcpu_x86_32, - .meminit = meminit_pv, - .bootearly = bootearly, - .bootlate = bootlate_pv, -}; - -static struct xc_dom_arch xc_dom_64 = { - .guest_type = "xen-3.0-x86_64", - .native_protocol = XEN_IO_PROTO_ABI_X86_64, - .page_shift = PAGE_SHIFT_X86, - .sizeof_pfn = 8, - .p2m_base_supported = 1, - .arch_private_size = sizeof(struct xc_dom_image_x86), - .alloc_magic_pages = alloc_magic_pages_pv, - .alloc_pgtables = alloc_pgtables_x86_64, - .alloc_p2m_list = alloc_p2m_list_x86_64, - .setup_pgtables = setup_pgtables_x86_64, - .start_info = start_info_x86_64, - .shared_info = shared_info_x86_64, - .vcpu = vcpu_x86_64, - .meminit = meminit_pv, - .bootearly = bootearly, - .bootlate = bootlate_pv, -}; - -static struct xc_dom_arch xc_hvm_32 = { - .guest_type = "hvm-3.0-x86_32", - .native_protocol = XEN_IO_PROTO_ABI_X86_32, - .page_shift = PAGE_SHIFT_X86, - .sizeof_pfn = 4, - .alloc_magic_pages = alloc_magic_pages_hvm, - .vcpu = vcpu_hvm, - .meminit = meminit_hvm, - .bootearly = bootearly, - .bootlate = bootlate_hvm, -}; - -static void __init register_arch_hooks(void) -{ - xc_dom_register_arch_hooks(&xc_dom_32_pae); - xc_dom_register_arch_hooks(&xc_dom_64); - xc_dom_register_arch_hooks(&xc_hvm_32); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_domain.c b/tools/libxc/xg_domain.c deleted file mode 100644 index 58713cd35d..0000000000 --- a/tools/libxc/xg_domain.c +++ /dev/null @@ -1,149 +0,0 @@ -/****************************************************************************** - * xg_domain.c - * - * API for manipulating and obtaining information on domains. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - * - * Copyright (c) 2003, K A Fraser. 
- */ - -#include "xg_private.h" -#include "xc_core.h" - -int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *minfo) -{ - struct domain_info_context _di = { .guest_width = minfo->guest_width, - .p2m_size = minfo->p2m_size}; - struct domain_info_context *dinfo = &_di; - - free(minfo->pfn_type); - if ( minfo->p2m_table ) - munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE); - minfo->p2m_table = NULL; - - return 0; -} - -int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid, - struct xc_domain_meminfo *minfo) -{ - struct domain_info_context _di; - struct domain_info_context *dinfo = &_di; - - xc_dominfo_t info; - shared_info_any_t *live_shinfo; - xen_capabilities_info_t xen_caps = ""; - int i; - - /* Only be initialized once */ - if ( minfo->pfn_type || minfo->p2m_table ) - { - errno = EINVAL; - return -1; - } - - if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ) - { - PERROR("Could not get domain info"); - return -1; - } - - if ( xc_domain_get_guest_width(xch, domid, &minfo->guest_width) ) - { - PERROR("Could not get domain address size"); - return -1; - } - _di.guest_width = minfo->guest_width; - - /* Get page table levels (see get_platform_info() in xg_save_restore.h */ - if ( xc_version(xch, XENVER_capabilities, &xen_caps) ) - { - PERROR("Could not get Xen capabilities (for page table levels)"); - return -1; - } - if ( strstr(xen_caps, "xen-3.0-x86_64") ) - /* Depends on whether it's a compat 32-on-64 guest */ - minfo->pt_levels = ( (minfo->guest_width == 8) ? 4 : 3 ); - else if ( strstr(xen_caps, "xen-3.0-x86_32p") ) - minfo->pt_levels = 3; - else if ( strstr(xen_caps, "xen-3.0-x86_32") ) - minfo->pt_levels = 2; - else - { - errno = EFAULT; - return -1; - } - - /* We need the shared info page for mapping the P2M */ - live_shinfo = xc_map_foreign_range(xch, domid, PAGE_SIZE, PROT_READ, - info.shared_info_frame); - if ( !live_shinfo ) - { - PERROR("Could not map the shared info frame (MFN 0x%lx)", - info.shared_info_frame); - return -1; - } - - if ( xc_core_arch_map_p2m_writable(xch, minfo->guest_width, &info, - live_shinfo, &minfo->p2m_table, - &minfo->p2m_size) ) - { - PERROR("Could not map the P2M table"); - munmap(live_shinfo, PAGE_SIZE); - return -1; - } - munmap(live_shinfo, PAGE_SIZE); - _di.p2m_size = minfo->p2m_size; - - /* Make space and prepare for getting the PFN types */ - minfo->pfn_type = calloc(sizeof(*minfo->pfn_type), minfo->p2m_size); - if ( !minfo->pfn_type ) - { - PERROR("Could not allocate memory for the PFN types"); - goto failed; - } - for ( i = 0; i < minfo->p2m_size; i++ ) - minfo->pfn_type[i] = xc_pfn_to_mfn(i, minfo->p2m_table, - minfo->guest_width); - - /* Retrieve PFN types in batches */ - for ( i = 0; i < minfo->p2m_size ; i+=1024 ) - { - int count = ((minfo->p2m_size - i ) > 1024 ) ? 
-                      1024: (minfo->p2m_size - i);
-
-        if ( xc_get_pfn_type_batch(xch, domid, count, minfo->pfn_type + i) )
-        {
-            PERROR("Could not get %d-th batch of PFN types", (i+1)/1024);
-            goto failed;
-        }
-    }
-
-    return 0;
-
-failed:
-    if ( minfo->pfn_type )
-    {
-        free(minfo->pfn_type);
-        minfo->pfn_type = NULL;
-    }
-    if ( minfo->p2m_table )
-    {
-        munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
-        minfo->p2m_table = NULL;
-    }
-
-    return -1;
-}
diff --git a/tools/libxc/xg_nomigrate.c b/tools/libxc/xg_nomigrate.c
deleted file mode 100644
index 6795c62ddc..0000000000
--- a/tools/libxc/xg_nomigrate.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/******************************************************************************
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * Copyright (c) 2011, Citrix Systems
- */
-
-#include <inttypes.h>
-#include <errno.h>
-#include <xenctrl.h>
-#include <xenguest.h>
-
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t flags,
-                   struct save_callbacks *callbacks,
-                   xc_stream_type_t stream_type, int recv_fd)
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
-                      unsigned int store_evtchn, unsigned long *store_mfn,
-                      uint32_t store_domid, unsigned int console_evtchn,
-                      unsigned long *console_mfn, uint32_t console_domid,
-                      xc_stream_type_t stream_type,
-                      struct restore_callbacks *callbacks, int send_back_fd)
-{
-    errno = ENOSYS;
-    return -1;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/tools/libxc/xg_offline_page.c b/tools/libxc/xg_offline_page.c
deleted file mode 100644
index 77e8889b11..0000000000
--- a/tools/libxc/xg_offline_page.c
+++ /dev/null
@@ -1,708 +0,0 @@
-/******************************************************************************
- * xc_offline_page.c
- *
- * Helper functions to offline/online one page
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2009, Intel Corporation.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "xc_private.h"
-#include "xenctrl_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-struct pte_backup_entry
-{
-    xen_pfn_t table_mfn;
-    int offset;
-};
-
-#define DEFAULT_BACKUP_COUNT 1024
-struct pte_backup
-{
-    struct pte_backup_entry *entries;
-    int max;
-    int cur;
-};
-
-static struct domain_info_context _dinfo;
-static struct domain_info_context *dinfo = &_dinfo;
-
-int xc_mark_page_online(xc_interface *xch, unsigned long start,
-                        unsigned long end, uint32_t *status)
-{
-    DECLARE_SYSCTL;
-    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
-    int ret = -1;
-
-    if ( !status || (end < start) )
-    {
-        errno = EINVAL;
-        return -1;
-    }
-    if ( xc_hypercall_bounce_pre(xch, status) )
-    {
-        ERROR("Could not bounce memory for xc_mark_page_online\n");
-        return -1;
-    }
-
-    sysctl.cmd = XEN_SYSCTL_page_offline_op;
-    sysctl.u.page_offline.start = start;
-    sysctl.u.page_offline.cmd = sysctl_page_online;
-    sysctl.u.page_offline.end = end;
-    set_xen_guest_handle(sysctl.u.page_offline.status, status);
-    ret = xc_sysctl(xch, &sysctl);
-
-    xc_hypercall_bounce_post(xch, status);
-
-    return ret;
-}
-
-int xc_mark_page_offline(xc_interface *xch, unsigned long start,
-                         unsigned long end, uint32_t *status)
-{
-    DECLARE_SYSCTL;
-    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
-    int ret = -1;
-
-    if ( !status || (end < start) )
-    {
-        errno = EINVAL;
-        return -1;
-    }
-    if ( xc_hypercall_bounce_pre(xch, status) )
-    {
-        ERROR("Could not bounce memory for xc_mark_page_offline");
-        return -1;
-    }
-
-    sysctl.cmd = XEN_SYSCTL_page_offline_op;
-    sysctl.u.page_offline.start = start;
-    sysctl.u.page_offline.cmd = sysctl_page_offline;
-    sysctl.u.page_offline.end = end;
-    set_xen_guest_handle(sysctl.u.page_offline.status, status);
-    ret = xc_sysctl(xch, &sysctl);
-
-    xc_hypercall_bounce_post(xch, status);
-
-    return ret;
-}
-
-int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
-                                 unsigned long end, uint32_t *status)
-{
-    DECLARE_SYSCTL;
-    DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
-    int ret = -1;
-
-    if ( !status || (end < start) )
-    {
-        errno = EINVAL;
-        return -1;
-    }
-    if ( xc_hypercall_bounce_pre(xch, status) )
-    {
-        ERROR("Could not bounce memory for xc_query_page_offline_status\n");
-        return -1;
-    }
-
-    sysctl.cmd = XEN_SYSCTL_page_offline_op;
-    sysctl.u.page_offline.start = start;
-    sysctl.u.page_offline.cmd = sysctl_query_page_offline;
-    sysctl.u.page_offline.end = end;
-    set_xen_guest_handle(sysctl.u.page_offline.status, status);
-    ret = xc_sysctl(xch, &sysctl);
-
-    xc_hypercall_bounce_post(xch, status);
-
-    return ret;
-}
-
- /*
-  * There should be no updates to the grant table while the domain
-  * is paused.
-  */
-static int xc_is_page_granted_v1(xc_interface *xch, xen_pfn_t gpfn,
-                                 grant_entry_v1_t *gnttab, int gnt_num)
-{
-    int i = 0;
-
-    if (!gnttab)
-        return 0;
-
-    for (i = 0; i < gnt_num; i++)
-        if ( ((gnttab[i].flags & GTF_type_mask) != GTF_invalid) &&
-             (gnttab[i].frame == gpfn) )
-            break;
-
-    return (i != gnt_num);
-}
-
-static int xc_is_page_granted_v2(xc_interface *xch, xen_pfn_t gpfn,
-                                 grant_entry_v2_t *gnttab, int gnt_num)
-{
-    int i = 0;
-
-    if (!gnttab)
-        return 0;
-
-    for (i = 0; i < gnt_num; i++)
-        if ( ((gnttab[i].hdr.flags & GTF_type_mask) != GTF_invalid) &&
-             (gnttab[i].full_page.frame == gpfn) )
-            break;
-
-    return (i != gnt_num);
-}
-
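/*
 * For context, a typical caller of the three sysctl wrappers above walks
 * the offline state machine like this (minimal sketch, single pfn, error
 * handling elided):
 *
 *     uint32_t status;
 *
 *     xc_mark_page_offline(xch, mfn, mfn, &status);
 *     xc_query_page_offline_status(xch, mfn, mfn, &status);
 *
 *     if ( status & PG_OFFLINE_STATUS_OFFLINE_PENDING )
 *         ...the frame is still referenced; xc_exchange_page() below
 *            breaks the reference by swapping in a fresh frame...
 *     else if ( status & PG_OFFLINE_STATUS_OFFLINED )
 *         ...the frame is fully isolated...
 *
 * xc_mark_page_online() reverses a request that is still pending.
 */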
-static int backup_ptes(xen_pfn_t table_mfn, int offset,
-                       struct pte_backup *backup)
-{
-    if (!backup)
-        return -EINVAL;
-
-    if (backup->max == backup->cur)
-    {
-        backup->entries = realloc(backup->entries,
-                                  backup->max * 2 * sizeof(struct pte_backup_entry));
-        if (backup->entries == NULL)
-            return -1;
-        else
-            backup->max *= 2;
-    }
-
-    backup->entries[backup->cur].table_mfn = table_mfn;
-    backup->entries[backup->cur++].offset = offset;
-
-    return 0;
-}
-
-/*
- * return:
- * 1 when MMU update is required
- * 0 when no changes
- * <0 when an error happens
- */
-typedef int (*pte_func)(xc_interface *xch,
-                        uint64_t pte, uint64_t *new_pte,
-                        unsigned long table_mfn, int table_offset,
-                        struct pte_backup *backup,
-                        unsigned long no_use);
-
-static int __clear_pte(xc_interface *xch,
-                       uint64_t pte, uint64_t *new_pte,
-                       unsigned long table_mfn, int table_offset,
-                       struct pte_backup *backup,
-                       unsigned long mfn)
-{
-    /* If no new_pte pointer, same as no changes needed */
-    if (!new_pte || !backup)
-        return -EINVAL;
-
-    if ( !(pte & _PAGE_PRESENT))
-        return 0;
-
-    /* XXX Check for PSE bit here */
-    /* Hit one entry */
-    if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
-    {
-        *new_pte = pte & ~_PAGE_PRESENT;
-        if (!backup_ptes(table_mfn, table_offset, backup))
-            return 1;
-    }
-
-    return 0;
-}
-
-static int __update_pte(xc_interface *xch,
-                        uint64_t pte, uint64_t *new_pte,
-                        unsigned long table_mfn, int table_offset,
-                        struct pte_backup *backup,
-                        unsigned long new_mfn)
-{
-    int index;
-
-    if (!new_pte)
-        return 0;
-
-    for (index = 0; index < backup->cur; index ++)
-        if ( (backup->entries[index].table_mfn == table_mfn) &&
-             (backup->entries[index].offset == table_offset) )
-            break;
-
-    if (index != backup->cur)
-    {
-        if (pte & _PAGE_PRESENT)
-            ERROR("Page present while in backup ptes\n");
-        pte &= ~MFN_MASK_X86;
-        pte |= (new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
-        *new_pte = pte;
-        return 1;
-    }
-
-    return 0;
-}
-
-static int change_pte(xc_interface *xch, uint32_t domid,
-                      struct xc_domain_meminfo *minfo,
-                      struct pte_backup *backup,
-                      struct xc_mmu *mmu,
-                      pte_func func,
-                      unsigned long data)
-{
-    int pte_num, rc;
-    uint64_t i;
-    void *content = NULL;
-
-    pte_num = PAGE_SIZE / ((minfo->pt_levels == 2) ? 4 : 8);
-
-    for (i = 0; i < minfo->p2m_size; i++)
-    {
-        xen_pfn_t table_mfn = xc_pfn_to_mfn(i, minfo->p2m_table,
-                                            minfo->guest_width);
-        uint64_t pte, new_pte;
-        int j;
-
-        if ( (table_mfn == INVALID_PFN) ||
-             ((minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
-              XEN_DOMCTL_PFINFO_XTAB) )
-            continue;
-
-        if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
-        {
-            content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
-                                           PROT_READ, table_mfn);
-            if (!content)
-                goto failed;
-
-            for (j = 0; j < pte_num; j++)
-            {
-                if ( minfo->pt_levels == 2 )
-                    pte = ((const uint32_t*)content)[j];
-                else
-                    pte = ((const uint64_t*)content)[j];
-
-                rc = func(xch, pte, &new_pte, table_mfn, j, backup, data);
-
-                switch (rc)
-                {
-                case 1:
-                    if ( xc_add_mmu_update(xch, mmu,
-                                           table_mfn << PAGE_SHIFT |
-                                           j * ( (minfo->pt_levels == 2) ?
-                                                 sizeof(uint32_t): sizeof(uint64_t)) |
-                                           MMU_PT_UPDATE_PRESERVE_AD,
-                                           new_pte) )
-                        goto failed;
-                    break;
-
-                case 0:
-                    break;
-
-                default:
-                    goto failed;
-                }
-            }
-
-            munmap(content, PAGE_SIZE);
-            content = NULL;
-        }
-    }
-
-    if ( xc_flush_mmu_updates(xch, mmu) )
-        goto failed;
-
-    return 0;
-failed:
-    /* XXX Shall we take action if we fail to swap?
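     *
     * (For reference: each update queued by change_pte() packs the whole
     * target into the "ptr" word of the mmu_update interface. For a
     * 64-bit guest, entry j of the table at table_mfn is addressed as
     *
     *     ptr = (table_mfn << PAGE_SHIFT) | (j * sizeof(uint64_t))
     *           | MMU_PT_UPDATE_PRESERVE_AD;
     *
     * i.e. the machine address of the PTE with the preserve-accessed/dirty
     * command in the low bits; xc_flush_mmu_updates() then submits the
     * whole batch in a single hypercall.)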
*/ - if (content) - munmap(content, PAGE_SIZE); - - return -1; -} - -static int update_pte(xc_interface *xch, uint32_t domid, - struct xc_domain_meminfo *minfo, - struct pte_backup *backup, - struct xc_mmu *mmu, - unsigned long new_mfn) -{ - return change_pte(xch, domid, minfo, backup, mmu, - __update_pte, new_mfn); -} - -static int clear_pte(xc_interface *xch, uint32_t domid, - struct xc_domain_meminfo *minfo, - struct pte_backup *backup, - struct xc_mmu *mmu, - xen_pfn_t mfn) -{ - return change_pte(xch, domid, minfo, backup, mmu, - __clear_pte, mfn); -} - -/* - * Check if a page can be exchanged successfully - */ - -static int is_page_exchangable(xc_interface *xch, uint32_t domid, xen_pfn_t mfn, - xc_dominfo_t *info) -{ - uint32_t status; - int rc; - - /* domain checking */ - if ( !domid || (domid > DOMID_FIRST_RESERVED) ) - { - DPRINTF("Dom0's page can't be LM"); - return 0; - } - if (info->hvm) - { - DPRINTF("Currently we can only live change PV guest's page\n"); - return 0; - } - - /* Check if pages are offline pending or not */ - rc = xc_query_page_offline_status(xch, mfn, mfn, &status); - - if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) ) - { - ERROR("Page %lx is not offline pending %x\n", - mfn, status); - return 0; - } - - return 1; -} - -xen_pfn_t *xc_map_m2p(xc_interface *xch, - unsigned long max_mfn, - int prot, - unsigned long *mfn0) -{ - privcmd_mmap_entry_t *entries; - unsigned long m2p_chunks, m2p_size; - xen_pfn_t *m2p; - xen_pfn_t *extent_start; - int i; - - m2p = NULL; - m2p_size = M2P_SIZE(max_mfn); - m2p_chunks = M2P_CHUNKS(max_mfn); - - extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t)); - if ( !extent_start ) - { - ERROR("failed to allocate space for m2p mfns"); - goto err0; - } - - if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) ) - { - PERROR("xc_get_m2p_mfns"); - goto err1; - } - - entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t)); - if (entries == NULL) - { - ERROR("failed to allocate space for mmap entries"); - goto err1; - } - - for ( i = 0; i < m2p_chunks; i++ ) - entries[i].mfn = extent_start[i]; - - m2p = xc_map_foreign_ranges(xch, DOMID_XEN, - m2p_size, prot, M2P_CHUNK_SIZE, - entries, m2p_chunks); - if (m2p == NULL) - { - PERROR("xc_mmap_foreign_ranges failed"); - goto err2; - } - - if (mfn0) - *mfn0 = entries[0].mfn; - -err2: - free(entries); -err1: - free(extent_start); - -err0: - return m2p; -} - -/* The domain should be suspended when called here */ -int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn) -{ - xc_dominfo_t info; - struct xc_domain_meminfo minfo; - struct xc_mmu *mmu = NULL; - struct pte_backup old_ptes = {NULL, 0, 0}; - grant_entry_v1_t *gnttab_v1 = NULL; - grant_entry_v2_t *gnttab_v2 = NULL; - struct mmuext_op mops; - int gnt_num, unpined = 0; - void *old_p, *backup = NULL; - int rc, result = -1; - uint32_t status; - xen_pfn_t new_mfn, gpfn; - xen_pfn_t *m2p_table; - unsigned long max_mfn; - - if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 ) - { - ERROR("Could not get domain info"); - return -1; - } - - if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend) - { - errno = EINVAL; - ERROR("Can't exchange page unless domain is suspended\n"); - return -1; - } - if (!is_page_exchangable(xch, domid, mfn, &info)) - { - ERROR("Could not exchange page\n"); - return -1; - } - - /* Map M2P and obtain gpfn */ - rc = xc_maximum_ram_page(xch, &max_mfn); - if ( rc || !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) ) - { - PERROR("Failed to map live M2P table"); - return -1; - } - gpfn 
= m2p_table[mfn];
-
-    /* Map domain's memory information */
-    memset(&minfo, 0, sizeof(minfo));
-    if ( xc_map_domain_meminfo(xch, domid, &minfo) )
-    {
-        PERROR("Could not map domain's memory information\n");
-        goto failed;
-    }
-
-    /* For translation macros */
-    dinfo->guest_width = minfo.guest_width;
-    dinfo->p2m_size = minfo.p2m_size;
-
-    /* Don't exchange CR3 for PAE guest in PAE host environment */
-    if (minfo.guest_width > sizeof(long))
-    {
-        if ( (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
-             XEN_DOMCTL_PFINFO_L3TAB )
-            goto failed;
-    }
-
-    gnttab_v2 = xc_gnttab_map_table_v2(xch, domid, &gnt_num);
-    if (!gnttab_v2)
-    {
-        gnttab_v1 = xc_gnttab_map_table_v1(xch, domid, &gnt_num);
-        if (!gnttab_v1)
-        {
-            ERROR("Failed to map grant table\n");
-            goto failed;
-        }
-    }
-
-    if (gnttab_v1
-        ? xc_is_page_granted_v1(xch, mfn, gnttab_v1, gnt_num)
-        : xc_is_page_granted_v2(xch, mfn, gnttab_v2, gnt_num))
-    {
-        ERROR("Page %lx is granted now\n", mfn);
-        goto failed;
-    }
-
-    /* allocate required data structure */
-    backup = malloc(PAGE_SIZE);
-    if (!backup)
-    {
-        ERROR("Failed to allocate backup pages pointer\n");
-        goto failed;
-    }
-
-    old_ptes.max = DEFAULT_BACKUP_COUNT;
-    old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
-                              DEFAULT_BACKUP_COUNT);
-
-    if (!old_ptes.entries)
-    {
-        ERROR("Failed to allocate backup\n");
-        goto failed;
-    }
-    old_ptes.cur = 0;
-
-    /* Unpin the page if it is pinned */
-    if (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB)
-    {
-        mops.cmd = MMUEXT_UNPIN_TABLE;
-        mops.arg1.mfn = mfn;
-
-        if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
-        {
-            ERROR("Failed to unpin page %lx", mfn);
-            goto failed;
-        }
-        mops.arg1.mfn = mfn;
-        unpined = 1;
-    }
-
-    /* backup the content */
-    old_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
-                                 PROT_READ, mfn);
-    if (!old_p)
-    {
-        ERROR("Failed to map foreign page %lx\n", mfn);
-        goto failed;
-    }
-
-    memcpy(backup, old_p, PAGE_SIZE);
-    munmap(old_p, PAGE_SIZE);
-
-    mmu = xc_alloc_mmu_updates(xch, domid);
-    if ( mmu == NULL )
-    {
-        ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
-        goto failed;
-    }
-
-    /* First update all ptes to be invalid to remove the references */
-    rc = clear_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
-
-    if (rc)
-    {
-        ERROR("clear pte failed\n");
-        goto failed;
-    }
-
-    rc = xc_domain_memory_exchange_pages(xch, domid,
-                                         1, 0, &mfn,
-                                         1, 0, &new_mfn);
-
-    if (rc)
-    {
-        ERROR("Exchange the page failed\n");
-        /* Exchange failure means there are still references to the page */
-        rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
-        if (rc)
-            result = -2;
-        goto failed;
-    }
-
-    rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, new_mfn);
-
-    if (rc)
-    {
-        ERROR("update pte failed, guest may be broken now\n");
-        /* No recovery action now for swap failure */
-        result = -2;
-        goto failed;
-    }
-
-    /* Check if pages are offlined already */
-    rc = xc_query_page_offline_status(xch, mfn, mfn,
-                                      &status);
-
-    if (rc)
-    {
-        ERROR("Failed to query offline status\n");
-    } else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
-    {
-        ERROR("page is still online or pending\n");
-        goto failed;
-    }
-    else
-    {
-        void *new_p;
-        IPRINTF("Now page is offlined %lx\n", mfn);
-        /* Update the p2m table */
-        minfo.p2m_table[gpfn] = new_mfn;
-
-        new_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
-                                     PROT_READ|PROT_WRITE, new_mfn);
-        if ( new_p == NULL )
-        {
-            ERROR("failed to map new_p for copy, guest may be broken?");
-            goto failed;
-        }
-        memcpy(new_p, backup, PAGE_SIZE);
-        munmap(new_p, PAGE_SIZE);
-        mops.arg1.mfn = new_mfn;
-        result = 0;
-    }
-
-failed:
-
-    if (unpined && (minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LPINTAB))
-    {
-        switch ( minfo.pfn_type[mfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
-        {
-        case XEN_DOMCTL_PFINFO_L1TAB:
-            mops.cmd = MMUEXT_PIN_L1_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L2TAB:
-            mops.cmd = MMUEXT_PIN_L2_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L3TAB:
-            mops.cmd = MMUEXT_PIN_L3_TABLE;
-            break;
-
-        case XEN_DOMCTL_PFINFO_L4TAB:
-            mops.cmd = MMUEXT_PIN_L4_TABLE;
-            break;
-
-        default:
-            ERROR("Unpinned for non page table page\n");
-            break;
-        }
-
-        if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
-        {
-            ERROR("failed to pin the mfn again\n");
-            result = -2;
-        }
-    }
-
-    free(mmu);
-
-    free(old_ptes.entries);
-
-    free(backup);
-
-    if (gnttab_v1)
-        munmap(gnttab_v1, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v1_t)));
-    if (gnttab_v2)
-        munmap(gnttab_v2, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v2_t)));
-
-    xc_unmap_domain_meminfo(xch, &minfo);
-    munmap(m2p_table, M2P_SIZE(max_mfn));
-
-    return result;
-}
diff --git a/tools/libxc/xg_private.c b/tools/libxc/xg_private.c
deleted file mode 100644
index 2073dba2ef..0000000000
--- a/tools/libxc/xg_private.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/******************************************************************************
- * xg_private.c
- *
- * Helper functions for the rest of the library.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <zlib.h>
-
-#include "xg_private.h"
-
-char *xc_read_image(xc_interface *xch,
-                    const char *filename, unsigned long *size)
-{
-    int kernel_fd = -1;
-    gzFile kernel_gfd = NULL;
-    char *image = NULL, *tmp;
-    unsigned int bytes;
-
-    if ( (filename == NULL) || (size == NULL) )
-        return NULL;
-
-    if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
-    {
-        PERROR("Could not open kernel image '%s'", filename);
-        goto out;
-    }
-
-    if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
-    {
-        PERROR("Could not allocate decompression state for state file");
-        goto out;
-    }
-
-    *size = 0;
-
-#define CHUNK 1*1024*1024
-    while(1)
-    {
-        if ( (tmp = realloc(image, *size + CHUNK)) == NULL )
-        {
-            PERROR("Could not allocate memory for kernel image");
-            free(image);
-            image = NULL;
-            goto out;
-        }
-        image = tmp;
-
-        bytes = gzread(kernel_gfd, image + *size, CHUNK);
-        switch (bytes)
-        {
-        case -1:
-            PERROR("Error reading kernel image");
-            free(image);
-            image = NULL;
-            goto out;
-        case 0: /* EOF */
-            if ( *size == 0 )
-            {
-                PERROR("Could not read kernel image");
-                free(image);
-                image = NULL;
-            }
-            goto out;
-        default:
-            *size += bytes;
-            break;
-        }
-    }
-#undef CHUNK
-
- out:
-    if ( image )
-    {
-        /* Shrink allocation to fit image.
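         *
         * (On success the caller gets a tight buffer; if the shrinking
         * realloc fails, the oversized buffer is still perfectly usable.)
         * Typical use of this helper, with a hypothetical kernel path:
         *
         *     unsigned long size;
         *     char *image = xc_read_image(xch, "/boot/guest-vmlinuz", &size);
         *
         *     if ( image != NULL )
         *     {
         *         ... feed (image, size) to the ELF loader ...
         *         free(image);
         *     }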
*/ - tmp = realloc(image, *size); - if ( tmp ) - image = tmp; - } - - if ( kernel_gfd != NULL ) - gzclose(kernel_gfd); - else if ( kernel_fd >= 0 ) - close(kernel_fd); - return image; -} - -char *xc_inflate_buffer(xc_interface *xch, - const char *in_buf, unsigned long in_size, - unsigned long *out_size) -{ - int sts; - z_stream zStream; - unsigned long out_len; - char *out_buf; - - /* Not compressed? Then return the original buffer. */ - if ( ((unsigned char)in_buf[0] != 0x1F) || - ((unsigned char)in_buf[1] != 0x8B) ) - { - if ( out_size != NULL ) - *out_size = in_size; - return (char *)in_buf; - } - - out_len = (unsigned char)in_buf[in_size-4] + - (256 * ((unsigned char)in_buf[in_size-3] + - (256 * ((unsigned char)in_buf[in_size-2] + - (256 * (unsigned char)in_buf[in_size-1]))))); - - memset(&zStream, 0, sizeof(zStream)); - out_buf = malloc(out_len + 16); /* Leave a little extra space */ - if ( out_buf == NULL ) - { - ERROR("Error mallocing buffer\n"); - return NULL; - } - - zStream.next_in = (unsigned char *)in_buf; - zStream.avail_in = in_size; - zStream.next_out = (unsigned char *)out_buf; - zStream.avail_out = out_len+16; - sts = inflateInit2(&zStream, (MAX_WBITS+32)); /* +32 means "handle gzip" */ - if ( sts != Z_OK ) - { - ERROR("inflateInit failed, sts %d\n", sts); - free(out_buf); - return NULL; - } - - /* Inflate in one pass/call */ - sts = inflate(&zStream, Z_FINISH); - inflateEnd(&zStream); - if ( sts != Z_STREAM_END ) - { - ERROR("inflate failed, sts %d\n", sts); - free(out_buf); - return NULL; - } - - if ( out_size != NULL ) - *out_size = out_len; - - return out_buf; -} - -/*******************/ - -int pin_table( - xc_interface *xch, unsigned int type, unsigned long mfn, uint32_t dom) -{ - struct mmuext_op op; - - op.cmd = type; - op.arg1.mfn = mfn; - - if ( xc_mmuext_op(xch, &op, 1, dom) < 0 ) - return 1; - - return 0; -} - -/* This is shared between save and restore, and may generally be useful. */ -unsigned long csum_page(void *page) -{ - int i; - unsigned long *p = page; - unsigned long long sum=0; - - for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ ) - sum += p[i]; - - return sum ^ (sum>>32); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_private.h b/tools/libxc/xg_private.h deleted file mode 100644 index 0000b2b9b6..0000000000 --- a/tools/libxc/xg_private.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . 
- */ - -#ifndef XG_PRIVATE_H -#define XG_PRIVATE_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xc_private.h" -#include "xenguest.h" - -#include -#include - -#ifndef ELFSIZE -#include -#if UINT_MAX == ULONG_MAX -#define ELFSIZE 32 -#else -#define ELFSIZE 64 -#endif -#endif - -char *xc_read_image(xc_interface *xch, - const char *filename, unsigned long *size); -char *xc_inflate_buffer(xc_interface *xch, - const char *in_buf, - unsigned long in_size, - unsigned long *out_size); - -unsigned long csum_page (void * page); - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PAT 0x080 -#define _PAGE_PSE 0x080 -#define _PAGE_GLOBAL 0x100 - -#define VIRT_BITS_I386 32 -#define VIRT_BITS_X86_64 48 - -#define PGTBL_LEVELS_I386 3 -#define PGTBL_LEVELS_X86_64 4 - -#define PGTBL_LEVEL_SHIFT_X86 9 - -#define L1_PAGETABLE_SHIFT_PAE 12 -#define L2_PAGETABLE_SHIFT_PAE 21 -#define L3_PAGETABLE_SHIFT_PAE 30 -#define L1_PAGETABLE_ENTRIES_PAE 512 -#define L2_PAGETABLE_ENTRIES_PAE 512 -#define L3_PAGETABLE_ENTRIES_PAE 4 - -#define L1_PAGETABLE_SHIFT_X86_64 12 -#define L2_PAGETABLE_SHIFT_X86_64 21 -#define L3_PAGETABLE_SHIFT_X86_64 30 -#define L4_PAGETABLE_SHIFT_X86_64 39 -#define L1_PAGETABLE_ENTRIES_X86_64 512 -#define L2_PAGETABLE_ENTRIES_X86_64 512 -#define L3_PAGETABLE_ENTRIES_X86_64 512 -#define L4_PAGETABLE_ENTRIES_X86_64 512 - -typedef uint64_t x86_pgentry_t; - -#define PAGE_SHIFT_ARM 12 -#define PAGE_SIZE_ARM (1UL << PAGE_SHIFT_ARM) -#define PAGE_MASK_ARM (~(PAGE_SIZE_ARM-1)) - -#define PAGE_SHIFT_X86 12 -#define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86) -#define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1)) - -#define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT) - -static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m, - unsigned gwidth) -{ - if ( gwidth == sizeof(uint64_t) ) - /* 64 bit guest. Need to truncate their pfns for 32 bit toolstacks. */ - return ((uint64_t *)p2m)[pfn]; - else - { - /* 32 bit guest. Need to expand INVALID_MFN for 64 bit toolstacks. */ - uint32_t mfn = ((uint32_t *)p2m)[pfn]; - - return mfn == ~0U ? INVALID_MFN : mfn; - } -} - - -/* Masks for PTE<->PFN conversions */ -#define MADDR_BITS_X86 ((dinfo->guest_width == 8) ? 52 : 44) -#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1) -#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86) - -int pin_table(xc_interface *xch, unsigned int type, unsigned long mfn, - uint32_t dom); - -#endif /* XG_PRIVATE_H */ diff --git a/tools/libxc/xg_save_restore.h b/tools/libxc/xg_save_restore.h deleted file mode 100644 index 88120eb54b..0000000000 --- a/tools/libxc/xg_save_restore.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Definitions and utilities for save / restore. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . 
- */ - -#include "xc_private.h" - -#include -#include - -/* -** We process save/restore/migrate in batches of pages; the below -** determines how many pages we (at maximum) deal with in each batch. -*/ -#define MAX_BATCH_SIZE 1024 /* up to 1024 pages (4MB) at a time */ - -/* When pinning page tables at the end of restore, we also use batching. */ -#define MAX_PIN_BATCH 1024 - -/* -** Determine various platform information required for save/restore, in -** particular: -** -** - the maximum MFN on this machine, used to compute the size of -** the M2P table; -** -** - the starting virtual address of the the hypervisor; we use this -** to determine which parts of guest address space(s) do and don't -** require canonicalization during save/restore; and -** -** - the number of page-table levels for save/ restore. This should -** be a property of the domain, but for the moment we just read it -** from the hypervisor. -** -** - The width of a guest word (unsigned long), in bytes. -** -** Returns 1 on success, 0 on failure. -*/ -static inline int get_platform_info(xc_interface *xch, uint32_t dom, - /* OUT */ unsigned long *max_mfn, - /* OUT */ unsigned long *hvirt_start, - /* OUT */ unsigned int *pt_levels, - /* OUT */ unsigned int *guest_width) -{ - xen_capabilities_info_t xen_caps = ""; - xen_platform_parameters_t xen_params; - - if (xc_version(xch, XENVER_platform_parameters, &xen_params) != 0) - return 0; - - if (xc_version(xch, XENVER_capabilities, &xen_caps) != 0) - return 0; - - if (xc_maximum_ram_page(xch, max_mfn)) - return 0; - - *hvirt_start = xen_params.virt_start; - - if ( xc_domain_get_guest_width(xch, dom, guest_width) != 0) - return 0; - - /* 64-bit tools will see the 64-bit hvirt_start, but 32-bit guests - * will be using the compat one. */ - if ( *guest_width < sizeof (unsigned long) ) - /* XXX need to fix up a way of extracting this value from Xen if - * XXX it becomes variable for domU */ - *hvirt_start = 0xf5800000; - - if (strstr(xen_caps, "xen-3.0-x86_64")) - /* Depends on whether it's a compat 32-on-64 guest */ - *pt_levels = ( (*guest_width == 8) ? 4 : 3 ); - else if (strstr(xen_caps, "xen-3.0-x86_32p")) - *pt_levels = 3; - else - return 0; - - return 1; -} - - -/* -** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables. -** The M2P simply holds the corresponding PFN, while the top bit of a P2M -** entry tell us whether or not the the PFN is currently mapped. -*/ - -#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10)) - - -/* -** The M2P is made up of some number of 'chunks' of at least 2MB in size. -** The below definitions and utility function(s) deal with mapping the M2P -** regarldess of the underlying machine memory size or architecture. -*/ -#define M2P_SHIFT L2_PAGETABLE_SHIFT_PAE -#define M2P_CHUNK_SIZE (1 << M2P_SHIFT) -#define M2P_SIZE(_m) ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT) -#define M2P_CHUNKS(_m) (M2P_SIZE((_m)) >> M2P_SHIFT) - -#define UNFOLD_CR3(_c) \ - ((uint64_t)((dinfo->guest_width == 8) \ - ? ((_c) >> 12) \ - : (((uint32_t)(_c) >> 12) | ((uint32_t)(_c) << 20)))) - -#define FOLD_CR3(_c) \ - ((uint64_t)((dinfo->guest_width == 8) \ - ? 
((uint64_t)(_c)) << 12 \ - : (((uint32_t)(_c) << 12) | ((uint32_t)(_c) >> 20)))) - -#define MEMCPY_FIELD(_d, _s, _f, _w) do { \ - if ((_w) == 8) \ - memcpy(&(_d)->x64._f, &(_s)->x64._f,sizeof((_d)->x64._f)); \ - else \ - memcpy(&(_d)->x32._f, &(_s)->x32._f,sizeof((_d)->x32._f)); \ -} while (0) - -#define MEMSET_ARRAY_FIELD(_p, _f, _v, _w) do { \ - if ((_w) == 8) \ - memset(&(_p)->x64._f[0], (_v), sizeof((_p)->x64._f)); \ - else \ - memset(&(_p)->x32._f[0], (_v), sizeof((_p)->x32._f)); \ -} while (0) diff --git a/tools/libxc/xg_sr_common.c b/tools/libxc/xg_sr_common.c deleted file mode 100644 index 17567ab133..0000000000 --- a/tools/libxc/xg_sr_common.c +++ /dev/null @@ -1,167 +0,0 @@ -#include - -#include "xg_sr_common.h" - -#include - -static const char *const dhdr_types[] = -{ - [DHDR_TYPE_X86_PV] = "x86 PV", - [DHDR_TYPE_X86_HVM] = "x86 HVM", -}; - -const char *dhdr_type_to_str(uint32_t type) -{ - if ( type < ARRAY_SIZE(dhdr_types) && dhdr_types[type] ) - return dhdr_types[type]; - - return "Reserved"; -} - -static const char *const mandatory_rec_types[] = -{ - [REC_TYPE_END] = "End", - [REC_TYPE_PAGE_DATA] = "Page data", - [REC_TYPE_X86_PV_INFO] = "x86 PV info", - [REC_TYPE_X86_PV_P2M_FRAMES] = "x86 PV P2M frames", - [REC_TYPE_X86_PV_VCPU_BASIC] = "x86 PV vcpu basic", - [REC_TYPE_X86_PV_VCPU_EXTENDED] = "x86 PV vcpu extended", - [REC_TYPE_X86_PV_VCPU_XSAVE] = "x86 PV vcpu xsave", - [REC_TYPE_SHARED_INFO] = "Shared info", - [REC_TYPE_X86_TSC_INFO] = "x86 TSC info", - [REC_TYPE_HVM_CONTEXT] = "HVM context", - [REC_TYPE_HVM_PARAMS] = "HVM params", - [REC_TYPE_TOOLSTACK] = "Toolstack", - [REC_TYPE_X86_PV_VCPU_MSRS] = "x86 PV vcpu msrs", - [REC_TYPE_VERIFY] = "Verify", - [REC_TYPE_CHECKPOINT] = "Checkpoint", - [REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST] = "Checkpoint dirty pfn list", - [REC_TYPE_STATIC_DATA_END] = "Static data end", - [REC_TYPE_X86_CPUID_POLICY] = "x86 CPUID policy", - [REC_TYPE_X86_MSR_POLICY] = "x86 MSR policy", -}; - -const char *rec_type_to_str(uint32_t type) -{ - if ( !(type & REC_TYPE_OPTIONAL) ) - { - if ( (type < ARRAY_SIZE(mandatory_rec_types)) && - (mandatory_rec_types[type]) ) - return mandatory_rec_types[type]; - } - - return "Reserved"; -} - -int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec, - void *buf, size_t sz) -{ - static const char zeroes[(1u << REC_ALIGN_ORDER) - 1] = { 0 }; - - xc_interface *xch = ctx->xch; - typeof(rec->length) combined_length = rec->length + sz; - size_t record_length = ROUNDUP(combined_length, REC_ALIGN_ORDER); - struct iovec parts[] = { - { &rec->type, sizeof(rec->type) }, - { &combined_length, sizeof(combined_length) }, - { rec->data, rec->length }, - { buf, sz }, - { (void *)zeroes, record_length - combined_length }, - }; - - if ( record_length > REC_LENGTH_MAX ) - { - ERROR("Record (0x%08x, %s) length %#zx exceeds max (%#x)", rec->type, - rec_type_to_str(rec->type), record_length, REC_LENGTH_MAX); - return -1; - } - - if ( rec->length ) - assert(rec->data); - if ( sz ) - assert(buf); - - if ( writev_exact(ctx->fd, parts, ARRAY_SIZE(parts)) ) - goto err; - - return 0; - - err: - PERROR("Unable to write record to stream"); - return -1; -} - -int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rhdr rhdr; - size_t datasz; - - if ( read_exact(fd, &rhdr, sizeof(rhdr)) ) - { - PERROR("Failed to read Record Header from stream"); - return -1; - } - - if ( rhdr.length > REC_LENGTH_MAX ) - { - ERROR("Record (0x%08x, %s) length %#x exceeds max 
(%#x)", rhdr.type, - rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX); - return -1; - } - - datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER); - - if ( datasz ) - { - rec->data = malloc(datasz); - - if ( !rec->data ) - { - ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)", - datasz, rhdr.type, rec_type_to_str(rhdr.type)); - return -1; - } - - if ( read_exact(fd, rec->data, datasz) ) - { - free(rec->data); - rec->data = NULL; - PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)", - datasz, rhdr.type, rec_type_to_str(rhdr.type)); - return -1; - } - } - else - rec->data = NULL; - - rec->type = rhdr.type; - rec->length = rhdr.length; - - return 0; -}; - -static void __attribute__((unused)) build_assertions(void) -{ - BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24); - BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16); - BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8); - - BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header) != 8); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info) != 8); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr) != 8); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_tsc_info) != 24); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry) != 16); - BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_common.h b/tools/libxc/xg_sr_common.h deleted file mode 100644 index 13fcc47420..0000000000 --- a/tools/libxc/xg_sr_common.h +++ /dev/null @@ -1,468 +0,0 @@ -#ifndef __COMMON__H -#define __COMMON__H - -#include - -#include "xg_private.h" -#include "xg_save_restore.h" -#include "xenctrl_dom.h" -#include "xc_bitops.h" - -#include "xg_sr_stream_format.h" - -/* String representation of Domain Header types. */ -const char *dhdr_type_to_str(uint32_t type); - -/* String representation of Record types. */ -const char *rec_type_to_str(uint32_t type); - -struct xc_sr_context; -struct xc_sr_record; - -/** - * Save operations. To be implemented for each type of guest, for use by the - * common save algorithm. - * - * Every function must be implemented, even if only with a no-op stub. - */ -struct xc_sr_save_ops -{ - /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ - xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn); - - /** - * Optionally transform the contents of a page from being specific to the - * sending environment, to being generic for the stream. - * - * The page of data at the end of 'page' may be a read-only mapping of a - * running guest; it must not be modified. If no transformation is - * required, the callee should leave '*pages' untouched. - * - * If a transformation is required, the callee should allocate themselves - * a local page using malloc() and return it via '*page'. - * - * The caller shall free() '*page' in all cases. In the case that the - * callee encounters an error, it should *NOT* free() the memory it - * allocated for '*page'. - * - * It is valid to fail with EAGAIN if the transformation is not able to be - * completed at this point. The page shall be retried later. - * - * @returns 0 for success, -1 for failure, with errno appropriately set. - */ - int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type, - void **page); - - /** - * Set up local environment to save a domain. 
(Typically querying - * running domain state, setting up mappings etc.) - * - * This is called once before any common setup has occurred, allowing for - * guest-specific adjustments to be made to common state. - */ - int (*setup)(struct xc_sr_context *ctx); - - /** - * Send static records at the head of the stream. This is called once, - * after the Image and Domain headers are written. - */ - int (*static_data)(struct xc_sr_context *ctx); - - /** - * Send dynamic records which need to be at the start of the stream. This - * is called after the STATIC_DATA_END record is written. - */ - int (*start_of_stream)(struct xc_sr_context *ctx); - - /** - * Send records which need to be at the start of a checkpoint. This is - * called once, or once per checkpoint in a checkpointed stream, and is - * ahead of memory data. - */ - int (*start_of_checkpoint)(struct xc_sr_context *ctx); - - /** - * Send records which need to be at the end of the checkpoint. This is - * called once, or once per checkpoint in a checkpointed stream, and is - * after the memory data. - */ - int (*end_of_checkpoint)(struct xc_sr_context *ctx); - - /** - * Check state of guest to decide whether it makes sense to continue - * migration. This is called in each iteration or checkpoint to check - * whether all criteria for the migration are still met. If that's not - * the case either migration is cancelled via a bad rc or the situation - * is handled, e.g. by sending appropriate records. - */ - int (*check_vm_state)(struct xc_sr_context *ctx); - - /** - * Clean up the local environment. Will be called exactly once, either - * after a successful save, or upon encountering an error. - */ - int (*cleanup)(struct xc_sr_context *ctx); -}; - - -/** - * Restore operations. To be implemented for each type of guest, for use by - * the common restore algorithm. - * - * Every function must be implemented, even if only with a no-op stub. - */ -struct xc_sr_restore_ops -{ - /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ - xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn); - - /* Check to see whether a PFN is valid. */ - bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn); - - /* Set the GFN of a PFN. */ - void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn); - - /* Set the type of a PFN. */ - void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn, - xen_pfn_t type); - - /** - * Optionally transform the contents of a page from being generic in the - * stream, to being specific to the restoring environment. - * - * 'page' is expected to be modified in-place if a transformation is - * required. - * - * @returns 0 for success, -1 for failure, with errno appropriately set. - */ - int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page); - - /** - * Set up local environment to restore a domain. - * - * This is called once before any common setup has occurred, allowing for - * guest-specific adjustments to be made to common state. - */ - int (*setup)(struct xc_sr_context *ctx); - - /** - * Process an individual record from the stream. The caller shall take - * care of processing common records (e.g. END, PAGE_DATA). - * - * @return 0 for success, -1 for failure, or the following sentinels: - * - RECORD_NOT_PROCESSED - * - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and - * a failover is needed. 
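
The two sentinel values defined just below are how a guest-specific process_record implementation hands a record back to the common code. As an illustrative sketch (not part of the deleted file, modelled on x86_hvm_process_record() further down): handle the types this guest flavour understands, and return RECORD_NOT_PROCESSED for everything else, leaving the common restore loop to decide whether the unhandled record was optional:

    static int example_process_record(struct xc_sr_context *ctx,
                                      struct xc_sr_record *rec)
    {
        switch ( rec->type )
        {
        case REC_TYPE_X86_TSC_INFO:
            return handle_x86_tsc_info(ctx, rec);

        default:
            /* Common code checks REC_TYPE_OPTIONAL before failing. */
            return RECORD_NOT_PROCESSED;
        }
    }
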
- */
-#define RECORD_NOT_PROCESSED 1
-#define BROKEN_CHANNEL 2
-    int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
-
-    /**
-     * Perform any actions required after the static data has arrived. Called
-     * when the STATIC_DATA_END record has been received/inferred.
-     * 'missing' should be filled in for any data item the higher level
-     * toolstack needs to provide compatibility for.
-     */
-    int (*static_data_complete)(struct xc_sr_context *ctx,
-                                unsigned int *missing);
-
-    /**
-     * Perform any actions required after the stream has been finished. Called
-     * after the END record has been received.
-     */
-    int (*stream_complete)(struct xc_sr_context *ctx);
-
-    /**
-     * Clean up the local environment. Will be called exactly once, either
-     * after a successful restore, or upon encountering an error.
-     */
-    int (*cleanup)(struct xc_sr_context *ctx);
-};
-
-/* Wrapper for blobs of data heading Xen-wards. */
-struct xc_sr_blob
-{
-    void *ptr;
-    size_t size;
-};
-
-/*
- * Update a blob. Duplicate src/size, freeing the old blob if necessary. May
- * fail due to memory allocation.
- */
-static inline int update_blob(struct xc_sr_blob *blob,
-                              const void *src, size_t size)
-{
-    void *ptr;
-
-    if ( !src || !size )
-    {
-        errno = EINVAL;
-        return -1;
-    }
-
-    if ( (ptr = malloc(size)) == NULL )
-        return -1;
-
-    free(blob->ptr);
-    blob->ptr = memcpy(ptr, src, size);
-    blob->size = size;
-
-    return 0;
-}
-
-struct xc_sr_context
-{
-    xc_interface *xch;
-    uint32_t domid;
-    int fd;
-
-    /* Plain VM, or checkpoints over time. */
-    xc_stream_type_t stream_type;
-
-    xc_dominfo_t dominfo;
-
-    union /* Common save or restore data. */
-    {
-        struct /* Save data. */
-        {
-            int recv_fd;
-
-            struct xc_sr_save_ops ops;
-            struct save_callbacks *callbacks;
-
-            /* Live migrate vs non live suspend. */
-            bool live;
-
-            /* Further debugging information in the stream. */
-            bool debug;
-
-            unsigned long p2m_size;
-
-            struct precopy_stats stats;
-
-            xen_pfn_t *batch_pfns;
-            unsigned int nr_batch_pfns;
-            unsigned long *deferred_pages;
-            unsigned long nr_deferred_pages;
-            xc_hypercall_buffer_t dirty_bitmap_hbuf;
-        } save;
-
-        struct /* Restore data. */
-        {
-            struct xc_sr_restore_ops ops;
-            struct restore_callbacks *callbacks;
-
-            int send_back_fd;
-            unsigned long p2m_size;
-            xc_hypercall_buffer_t dirty_bitmap_hbuf;
-
-            /* From Image Header. */
-            uint32_t format_version;
-
-            /* From Domain Header. */
-            uint32_t guest_type;
-            uint32_t guest_page_size;
-
-            /* Currently buffering records between checkpoints. */
-            bool buffer_all_records;
-
-            /* Whether a STATIC_DATA_END record has been seen/inferred. */
-            bool seen_static_data_end;
-
-/*
- * With Remus/COLO, we buffer the records sent by the primary at each
- * checkpoint, so that if the primary fails we can recover from the last
- * checkpoint state.
- * This should be enough for most cases because the primary only sends
- * dirty pages at each checkpoint.
- */
-#define DEFAULT_BUF_RECORDS 1024
-            struct xc_sr_record *buffered_records;
-            unsigned int allocated_rec_num;
-            unsigned int buffered_rec_num;
-
-            /*
-             * Xenstore and Console parameters.
-             * INPUT: evtchn & domid
-             * OUTPUT: gfn
-             */
-            xen_pfn_t xenstore_gfn, console_gfn;
-            unsigned int xenstore_evtchn, console_evtchn;
-            uint32_t xenstore_domid, console_domid;
-
-            /* Bitmap of currently populated PFNs during restore. */
-            unsigned long *populated_pfns;
-            xen_pfn_t max_populated_pfn;
-
-            /* Sender has invoked verify mode on the stream. */
-            bool verify;
-        } restore;
-    };
-
-    union /* Guest-arch specific data. */
-    {
-        struct /* x86 */
-        {
-            /* Common save/restore data. */
-            union
-            {
-                struct
-                {
-                    /* X86_{CPUID,MSR}_POLICY blobs for CPU Policy. */
-                    struct xc_sr_blob cpuid, msr;
-                } restore;
-            };
-
-            struct /* x86 PV guest. */
-            {
-                /* 4 or 8; 32 or 64 bit domain */
-                unsigned int width;
-                /* 3 or 4 pagetable levels */
-                unsigned int levels;
-
-                /* Maximum Xen frame */
-                xen_pfn_t max_mfn;
-                /* Read-only machine to phys map */
-                xen_pfn_t *m2p;
-                /* first mfn of the compat m2p (only needed for 32bit PV guests) */
-                xen_pfn_t compat_m2p_mfn0;
-                /* Number of m2p frames mapped */
-                unsigned long nr_m2p_frames;
-
-                /* Maximum guest frame */
-                xen_pfn_t max_pfn;
-
-                /* Number of frames making up the p2m */
-                unsigned int p2m_frames;
-                /* Guest's phys to machine map. Mapped read-only (save) or
-                 * allocated locally (restore). Uses guest unsigned longs. */
-                void *p2m;
-                /* The guest pfns containing the p2m leaves */
-                xen_pfn_t *p2m_pfns;
-
-                /* Read-only mapping of the guest's shared info page */
-                shared_info_any_t *shinfo;
-
-                /* p2m generation count for verifying validity of local p2m. */
-                uint64_t p2m_generation;
-
-                union
-                {
-                    struct
-                    {
-                        /* State machine for the order of received records. */
-                        bool seen_pv_info;
-
-                        /* Types for each page (bounded by max_pfn). */
-                        uint32_t *pfn_types;
-
-                        /* x86 PV per-vcpu storage structure for blobs. */
-                        struct xc_sr_x86_pv_restore_vcpu
-                        {
-                            struct xc_sr_blob basic, extd, xsave, msr;
-                        } *vcpus;
-                        unsigned int nr_vcpus;
-                    } restore;
-                };
-            } pv;
-
-            struct /* x86 HVM guest. */
-            {
-                union
-                {
-                    struct
-                    {
-                        /* Whether qemu enabled logdirty mode, and we should
-                         * disable on cleanup. */
-                        bool qemu_enabled_logdirty;
-                    } save;
-
-                    struct
-                    {
-                        /* HVM context blob. */
-                        struct xc_sr_blob context;
-                    } restore;
-                };
-            } hvm;
-
-        } x86;
-    };
-};
-
-extern struct xc_sr_save_ops save_ops_x86_pv;
-extern struct xc_sr_save_ops save_ops_x86_hvm;
-
-extern struct xc_sr_restore_ops restore_ops_x86_pv;
-extern struct xc_sr_restore_ops restore_ops_x86_hvm;
-
-struct xc_sr_record
-{
-    uint32_t type;
-    uint32_t length;
-    void *data;
-};
-
-/*
- * Writes a split record to the stream, applying correct padding where
- * appropriate. It is common when sending records containing blobs from Xen
- * that the header and blob data are separate. This function accepts a second
- * buffer and length, and will merge it with the main record when sending.
- *
- * Records with a non-zero length must provide a valid data field; records
- * with a 0 length shall have their data field ignored.
- *
- * Returns 0 on success and non-0 on failure.
- */
-int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
-                       void *buf, size_t sz);
-
-/*
- * Writes a record to the stream, applying correct padding where appropriate.
- * Records with a non-zero length must provide a valid data field; records
- * with a 0 length shall have their data field ignored.
- *
- * Returns 0 on success and non-0 on failure.
- */
-static inline int write_record(struct xc_sr_context *ctx,
-                               struct xc_sr_record *rec)
-{
-    return write_split_record(ctx, rec, NULL, 0);
-}
-
-/*
- * Reads a record from the stream, and fills in the record structure.
- *
- * Returns 0 on success and non-0 on failure.
- *
- * On success, the record's type and size shall be valid.
- * - If size is 0, data shall be NULL.
- * - If size is non-0, data shall be a buffer allocated by malloc() which must
- *   be passed to free() by the caller.
- *
- * On failure, the contents of the record structure are undefined.
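
For orientation (an illustrative recap, not part of the deleted file): every record on the wire is an 8-byte header, a 32-bit type and a 32-bit length, followed by the payload zero-padded to the record alignment (8 bytes in this format), which is exactly what the zeroes iovec in write_split_record() and the ROUNDUP() in read_record() implement. ROUNDUP() here is the xc_private.h helper that rounds up to the power-of-two boundary given as a bit order:

    /* On-the-wire record framing consumed by read_record() below. */
    struct example_rhdr {
        uint32_t type;     /* REC_TYPE_*; REC_TYPE_OPTIONAL flags skippable. */
        uint32_t length;   /* Payload bytes, excluding padding. */
        /* 'length' bytes of data follow, zero-padded to 8-byte alignment. */
    };

    size_t example_padded_length(uint32_t length)
    {
        return ROUNDUP(length, REC_ALIGN_ORDER);   /* e.g. 13 -> 16 */
    }
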
- */ -int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec); - -/* - * This would ideally be private in restore.c, but is needed by - * x86_pv_localise_page() if we receive pagetables frames ahead of the - * contents of the frames they point at. - */ -int populate_pfns(struct xc_sr_context *ctx, unsigned int count, - const xen_pfn_t *original_pfns, const uint32_t *types); - -/* Handle a STATIC_DATA_END record. */ -int handle_static_data_end(struct xc_sr_context *ctx); - -#endif -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_common_x86.c b/tools/libxc/xg_sr_common_x86.c deleted file mode 100644 index 6f12483907..0000000000 --- a/tools/libxc/xg_sr_common_x86.c +++ /dev/null @@ -1,173 +0,0 @@ -#include "xg_sr_common_x86.h" - -int write_x86_tsc_info(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_tsc_info tsc = {}; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_TSC_INFO, - .length = sizeof(tsc), - .data = &tsc, - }; - - if ( xc_domain_get_tsc_info(xch, ctx->domid, &tsc.mode, - &tsc.nsec, &tsc.khz, &tsc.incarnation) < 0 ) - { - PERROR("Unable to obtain TSC information"); - return -1; - } - - return write_record(ctx, &rec); -} - -int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_tsc_info *tsc = rec->data; - - if ( rec->length != sizeof(*tsc) ) - { - ERROR("X86_TSC_INFO record wrong size: length %u, expected %zu", - rec->length, sizeof(*tsc)); - return -1; - } - - if ( xc_domain_set_tsc_info(xch, ctx->domid, tsc->mode, - tsc->nsec, tsc->khz, tsc->incarnation) ) - { - PERROR("Unable to set TSC information"); - return -1; - } - - return 0; -} - -int write_x86_cpu_policy_records(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_record cpuid = { .type = REC_TYPE_X86_CPUID_POLICY, }; - struct xc_sr_record msrs = { .type = REC_TYPE_X86_MSR_POLICY, }; - uint32_t nr_leaves = 0, nr_msrs = 0; - int rc; - - if ( xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs) < 0 ) - { - PERROR("Unable to get CPU Policy size"); - return -1; - } - - cpuid.data = malloc(nr_leaves * sizeof(xen_cpuid_leaf_t)); - msrs.data = malloc(nr_msrs * sizeof(xen_msr_entry_t)); - if ( !cpuid.data || !msrs.data ) - { - ERROR("Cannot allocate memory for CPU Policy"); - rc = -1; - goto out; - } - - if ( xc_get_domain_cpu_policy(xch, ctx->domid, &nr_leaves, cpuid.data, - &nr_msrs, msrs.data) ) - { - PERROR("Unable to get d%d CPU Policy", ctx->domid); - rc = -1; - goto out; - } - - cpuid.length = nr_leaves * sizeof(xen_cpuid_leaf_t); - if ( cpuid.length ) - { - rc = write_record(ctx, &cpuid); - if ( rc ) - goto out; - } - - msrs.length = nr_msrs * sizeof(xen_msr_entry_t); - if ( msrs.length ) - rc = write_record(ctx, &msrs); - - out: - free(cpuid.data); - free(msrs.data); - - return rc; -} - -int handle_x86_cpuid_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - int rc; - - if ( rec->length == 0 || - rec->length % sizeof(xen_cpuid_leaf_t) != 0 ) - { - ERROR("X86_CPUID_POLICY size %u should be multiple of %zu", - rec->length, sizeof(xen_cpuid_leaf_t)); - return -1; - } - - rc = update_blob(&ctx->x86.restore.cpuid, rec->data, rec->length); - if ( rc ) - ERROR("Unable to allocate %u bytes for X86_CPUID_POLICY", rec->length); - - return rc; -} - -int handle_x86_msr_policy(struct xc_sr_context *ctx, 
                          struct xc_sr_record *rec)
-{
-    xc_interface *xch = ctx->xch;
-    int rc;
-
-    if ( rec->length == 0 ||
-         rec->length % sizeof(xen_msr_entry_t) != 0 )
-    {
-        ERROR("X86_MSR_POLICY size %u should be multiple of %zu",
-              rec->length, sizeof(xen_msr_entry_t));
-        return -1;
-    }
-
-    rc = update_blob(&ctx->x86.restore.msr, rec->data, rec->length);
-    if ( rc )
-        ERROR("Unable to allocate %u bytes for X86_MSR_POLICY", rec->length);
-
-    return rc;
-}
-
-int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing)
-{
-    xc_interface *xch = ctx->xch;
-    uint32_t nr_leaves = 0, nr_msrs = 0;
-    uint32_t err_l = ~0, err_s = ~0, err_m = ~0;
-
-    if ( ctx->x86.restore.cpuid.ptr )
-        nr_leaves = ctx->x86.restore.cpuid.size / sizeof(xen_cpuid_leaf_t);
-    else
-        *missing |= XGR_SDD_MISSING_CPUID;
-
-    if ( ctx->x86.restore.msr.ptr )
-        nr_msrs = ctx->x86.restore.msr.size / sizeof(xen_msr_entry_t);
-    else
-        *missing |= XGR_SDD_MISSING_MSR;
-
-    if ( (nr_leaves || nr_msrs) &&
-         xc_set_domain_cpu_policy(xch, ctx->domid,
-                                  nr_leaves, ctx->x86.restore.cpuid.ptr,
-                                  nr_msrs, ctx->x86.restore.msr.ptr,
-                                  &err_l, &err_s, &err_m) )
-    {
-        PERROR("Failed to set CPUID policy: leaf %08x, subleaf %08x, msr %08x",
-               err_l, err_s, err_m);
-        return -1;
-    }
-
-    return 0;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/tools/libxc/xg_sr_common_x86.h b/tools/libxc/xg_sr_common_x86.h
deleted file mode 100644
index b55758c96d..0000000000
--- a/tools/libxc/xg_sr_common_x86.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __COMMON_X86__H
-#define __COMMON_X86__H
-
-#include "xg_sr_common.h"
-
-/*
- * Obtains a domain's TSC information from Xen and writes an X86_TSC_INFO
- * record into the stream.
- */
-int write_x86_tsc_info(struct xc_sr_context *ctx);
-
-/*
- * Parses an X86_TSC_INFO record and applies the result to the domain.
- */
-int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec);
-
-/*
- * Obtains a domain's CPU Policy from Xen, and writes X86_{CPUID,MSR}_POLICY
- * records into the stream.
- */
-int write_x86_cpu_policy_records(struct xc_sr_context *ctx);
-
-/*
- * Parses an X86_CPUID_POLICY record and stashes the content for application
- * when a STATIC_DATA_END record is encountered.
- */
-int handle_x86_cpuid_policy(struct xc_sr_context *ctx,
-                            struct xc_sr_record *rec);
-
-/*
- * Parses an X86_MSR_POLICY record and stashes the content for application
- * when a STATIC_DATA_END record is encountered.
- */
-int handle_x86_msr_policy(struct xc_sr_context *ctx,
-                          struct xc_sr_record *rec);
-
-/*
- * Perform common x86 actions required after the static data has arrived.
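
To make the 'missing' contract concrete, a hedged sketch of a static_data_done callback as a higher-level toolstack might supply it; the signature follows the callback invocation visible in handle_static_data_end() in xg_sr_restore.c, and XGR_SDD_MISSING_* are the bits set by x86_static_data_complete() above:

    #include <stdio.h>

    static int example_static_data_done(unsigned int missing, void *data)
    {
        /* Compensate for data items an older sender did not provide. */
        if ( missing & XGR_SDD_MISSING_CPUID )
            fprintf(stderr, "stream lacks CPUID policy; using defaults\n");
        if ( missing & XGR_SDD_MISSING_MSR )
            fprintf(stderr, "stream lacks MSR policy; using defaults\n");

        return 0;   /* Nonzero aborts the restore. */
    }
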
- */ -int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing); - -#endif -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_common_x86_pv.c b/tools/libxc/xg_sr_common_x86_pv.c deleted file mode 100644 index cd33406aab..0000000000 --- a/tools/libxc/xg_sr_common_x86_pv.c +++ /dev/null @@ -1,193 +0,0 @@ -#include - -#include "xg_sr_common_x86_pv.h" - -xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn) -{ - assert(mfn <= ctx->x86.pv.max_mfn); - return ctx->x86.pv.m2p[mfn]; -} - -bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn) -{ - return ((mfn <= ctx->x86.pv.max_mfn) && - (mfn_to_pfn(ctx, mfn) <= ctx->x86.pv.max_pfn) && - (xc_pfn_to_mfn(mfn_to_pfn(ctx, mfn), ctx->x86.pv.p2m, - ctx->x86.pv.width) == mfn)); -} - -void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t pfn = ~0UL; - - ERROR("mfn %#lx, max %#lx", mfn, ctx->x86.pv.max_mfn); - - if ( (mfn != ~0UL) && (mfn <= ctx->x86.pv.max_mfn) ) - { - pfn = ctx->x86.pv.m2p[mfn]; - ERROR(" m2p[%#lx] = %#lx, max_pfn %#lx", - mfn, pfn, ctx->x86.pv.max_pfn); - } - - if ( (pfn != ~0UL) && (pfn <= ctx->x86.pv.max_pfn) ) - ERROR(" p2m[%#lx] = %#lx", - pfn, xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width)); -} - -xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3) -{ - if ( ctx->x86.pv.width == 8 ) - return cr3 >> 12; - else - { - /* 32bit guests can't represent mfns wider than 32 bits */ - if ( cr3 & 0xffffffff00000000UL ) - return ~0UL; - else - return (uint32_t)((cr3 >> 12) | (cr3 << 20)); - } -} - -uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t _mfn) -{ - uint64_t mfn = _mfn; - - if ( ctx->x86.pv.width == 8 ) - return mfn << 12; - else - { - /* 32bit guests can't represent mfns wider than 32 bits */ - if ( mfn & 0xffffffff00000000UL ) - return ~0UL; - else - return (uint32_t)((mfn << 12) | (mfn >> 20)); - } -} - -int x86_pv_domain_info(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - unsigned int guest_width, guest_levels; - - /* Get the domain width */ - if ( xc_domain_get_guest_width(xch, ctx->domid, &guest_width) ) - { - PERROR("Unable to determine dom%d's width", ctx->domid); - return -1; - } - - if ( guest_width == 4 ) - guest_levels = 3; - else if ( guest_width == 8 ) - guest_levels = 4; - else - { - ERROR("Invalid guest width %d. 
Expected 32 or 64", guest_width * 8); - return -1; - } - ctx->x86.pv.width = guest_width; - ctx->x86.pv.levels = guest_levels; - - DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels); - - return 0; -} - -int x86_pv_map_m2p(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t m2p_chunks, m2p_size, max_page; - privcmd_mmap_entry_t *entries = NULL; - xen_pfn_t *extents_start = NULL; - int rc = -1, i; - - if ( xc_maximum_ram_page(xch, &max_page) < 0 ) - { - PERROR("Failed to get maximum ram page"); - goto err; - } - - ctx->x86.pv.max_mfn = max_page; - m2p_size = M2P_SIZE(ctx->x86.pv.max_mfn); - m2p_chunks = M2P_CHUNKS(ctx->x86.pv.max_mfn); - - extents_start = malloc(m2p_chunks * sizeof(xen_pfn_t)); - if ( !extents_start ) - { - ERROR("Unable to allocate %lu bytes for m2p mfns", - m2p_chunks * sizeof(xen_pfn_t)); - goto err; - } - - if ( xc_machphys_mfn_list(xch, m2p_chunks, extents_start) ) - { - PERROR("Failed to get m2p mfn list"); - goto err; - } - - entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)); - if ( !entries ) - { - ERROR("Unable to allocate %lu bytes for m2p mapping mfns", - m2p_chunks * sizeof(privcmd_mmap_entry_t)); - goto err; - } - - for ( i = 0; i < m2p_chunks; ++i ) - entries[i].mfn = extents_start[i]; - - ctx->x86.pv.m2p = xc_map_foreign_ranges( - xch, DOMID_XEN, m2p_size, PROT_READ, - M2P_CHUNK_SIZE, entries, m2p_chunks); - - if ( !ctx->x86.pv.m2p ) - { - PERROR("Failed to mmap() m2p ranges"); - goto err; - } - - ctx->x86.pv.nr_m2p_frames = (M2P_CHUNK_SIZE >> PAGE_SHIFT) * m2p_chunks; - -#ifdef __i386__ - /* 32 bit toolstacks automatically get the compat m2p */ - ctx->x86.pv.compat_m2p_mfn0 = entries[0].mfn; -#else - /* 64 bit toolstacks need to ask Xen specially for it */ - { - struct xen_machphys_mfn_list xmml = { - .max_extents = 1, - .extent_start = { &ctx->x86.pv.compat_m2p_mfn0 }, - }; - - rc = do_memory_op(xch, XENMEM_machphys_compat_mfn_list, - &xmml, sizeof(xmml)); - if ( rc || xmml.nr_extents != 1 ) - { - PERROR("Failed to get compat mfn list from Xen"); - rc = -1; - goto err; - } - } -#endif - - /* All Done */ - rc = 0; - DPRINTF("max_mfn %#lx", ctx->x86.pv.max_mfn); - - err: - free(entries); - free(extents_start); - - return rc; -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_common_x86_pv.h b/tools/libxc/xg_sr_common_x86_pv.h deleted file mode 100644 index 953b5bfb8d..0000000000 --- a/tools/libxc/xg_sr_common_x86_pv.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef __COMMON_X86_PV_H -#define __COMMON_X86_PV_H - -#include "xg_sr_common_x86.h" - -/* Virtual address ranges reserved for hypervisor. */ -#define HYPERVISOR_VIRT_START_X86_64 0xFFFF800000000000ULL -#define HYPERVISOR_VIRT_END_X86_64 0xFFFF87FFFFFFFFFFULL - -#define HYPERVISOR_VIRT_START_X86_32 0x00000000F5800000ULL -#define HYPERVISOR_VIRT_END_X86_32 0x00000000FFFFFFFFULL - -/* - * Convert an mfn to a pfn, given Xen's m2p table. - * - * Caller must ensure that the requested mfn is in range. - */ -xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn); - -/* - * Query whether a particular mfn is valid in the physmap of a guest. - */ -bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn); - -/* - * Debug a particular mfn by walking the p2m and m2p. - */ -void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn); - -/* - * Convert a PV cr3 field to an mfn. 
- * - * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into - * a 32bit architectural cr3. - */ -xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3); - -/* - * Convert an mfn to a PV cr3 field. - * - * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into - * a 32bit architectural cr3. - */ -uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t mfn); - -/* Bits 12 through 51 of a PTE point at the frame */ -#define PTE_FRAME_MASK 0x000ffffffffff000ULL - -/* - * Extract an mfn from a Pagetable Entry. May return INVALID_MFN if the pte - * would overflow a 32bit xen_pfn_t. - */ -static inline xen_pfn_t pte_to_frame(uint64_t pte) -{ - uint64_t frame = (pte & PTE_FRAME_MASK) >> PAGE_SHIFT; - -#ifdef __i386__ - if ( frame >= INVALID_MFN ) - return INVALID_MFN; -#endif - - return frame; -} - -/* - * Change the frame in a Pagetable Entry while leaving the flags alone. - */ -static inline uint64_t merge_pte(uint64_t pte, xen_pfn_t mfn) -{ - return (pte & ~PTE_FRAME_MASK) | ((uint64_t)mfn << PAGE_SHIFT); -} - -/* - * Get current domain information. - * - * Fills ctx->x86.pv - * - .width - * - .levels - * - .fpp - * - .p2m_frames - * - * Used by the save side to create the X86_PV_INFO record, and by the restore - * side to verify the incoming stream. - * - * Returns 0 on success and non-zero on error. - */ -int x86_pv_domain_info(struct xc_sr_context *ctx); - -/* - * Maps the Xen M2P. - * - * Fills ctx->x86.pv. - * - .max_mfn - * - .m2p - * - * Returns 0 on success and non-zero on error. - */ -int x86_pv_map_m2p(struct xc_sr_context *ctx); - -#endif -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_restore.c b/tools/libxc/xg_sr_restore.c deleted file mode 100644 index b57a787519..0000000000 --- a/tools/libxc/xg_sr_restore.c +++ /dev/null @@ -1,986 +0,0 @@ -#include - -#include - -#include "xg_sr_common.h" - -/* - * Read and validate the Image and Domain headers. 
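
Before the function itself, a sketch of the 24-byte Image Header that read_headers() consumes; the authoritative definition is struct xc_sr_ihdr in xg_sr_stream_format.h (the layout here is illustrative), and the total size matches the BUILD_BUG_ON() in xg_sr_common.c. Multi-byte fields are big-endian on the wire, hence the ntohl()/ntohs() fixups below, and the marker is validated before any byte-swapping:

    struct example_ihdr {
        uint64_t marker;    /* IHDR_MARKER: rejects non-stream input. */
        uint32_t id;        /* IHDR_ID. */
        uint32_t version;   /* 2 (converted legacy stream) or 3 (current). */
        uint16_t options;   /* e.g. IHDR_OPT_BIG_ENDIAN. */
        uint16_t reserved1;
        uint32_t reserved2;
    };  /* 24 bytes in total, per the BUILD_BUG_ON above. */
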
- */
-static int read_headers(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-    struct xc_sr_ihdr ihdr;
-    struct xc_sr_dhdr dhdr;
-
-    if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
-    {
-        PERROR("Failed to read Image Header from stream");
-        return -1;
-    }
-
-    ihdr.id      = ntohl(ihdr.id);
-    ihdr.version = ntohl(ihdr.version);
-    ihdr.options = ntohs(ihdr.options);
-
-    if ( ihdr.marker != IHDR_MARKER )
-    {
-        ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
-        return -1;
-    }
-
-    if ( ihdr.id != IHDR_ID )
-    {
-        ERROR("Invalid ID: Expected 0x%08x, Got 0x%08x", IHDR_ID, ihdr.id);
-        return -1;
-    }
-
-    if ( ihdr.version < 2 || ihdr.version > 3 )
-    {
-        ERROR("Invalid Version: Expected 2 <= ver <= 3, Got %d",
-              ihdr.version);
-        return -1;
-    }
-
-    if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
-    {
-        ERROR("Unable to handle big endian streams");
-        return -1;
-    }
-
-    ctx->restore.format_version = ihdr.version;
-
-    if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
-    {
-        PERROR("Failed to read Domain Header from stream");
-        return -1;
-    }
-
-    ctx->restore.guest_type = dhdr.type;
-    ctx->restore.guest_page_size = (1U << dhdr.page_shift);
-
-    if ( dhdr.xen_major == 0 )
-    {
-        IPRINTF("Found %s domain, converted from legacy stream format",
-                dhdr_type_to_str(dhdr.type));
-        DPRINTF(" Legacy conversion script version %u", dhdr.xen_minor);
-    }
-    else
-        IPRINTF("Found %s domain from Xen %u.%u",
-                dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
-    return 0;
-}
-
-/*
- * Is a pfn populated?
- */
-static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
-    if ( pfn > ctx->restore.max_populated_pfn )
-        return false;
-    return test_bit(pfn, ctx->restore.populated_pfns);
-}
-
-/*
- * Set a pfn as populated, expanding the tracking structures if needed. To
- * avoid realloc()ing excessively, the size is increased to the nearest power
- * of two large enough to contain the required pfn.
- */
-static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
-    xc_interface *xch = ctx->xch;
-
-    if ( pfn > ctx->restore.max_populated_pfn )
-    {
-        xen_pfn_t new_max;
-        size_t old_sz, new_sz;
-        unsigned long *p;
-
-        /* Round up to the nearest power of two larger than pfn, less 1. */
-        new_max = pfn;
-        new_max |= new_max >> 1;
-        new_max |= new_max >> 2;
-        new_max |= new_max >> 4;
-        new_max |= new_max >> 8;
-        new_max |= new_max >> 16;
-#ifdef __x86_64__
-        new_max |= new_max >> 32;
-#endif
-
-        old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
-        new_sz = bitmap_size(new_max + 1);
-        p = realloc(ctx->restore.populated_pfns, new_sz);
-        if ( !p )
-        {
-            ERROR("Failed to realloc populated bitmap");
-            errno = ENOMEM;
-            return -1;
-        }
-
-        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
-
-        ctx->restore.populated_pfns    = p;
-        ctx->restore.max_populated_pfn = new_max;
-    }
-
-    assert(!test_bit(pfn, ctx->restore.populated_pfns));
-    set_bit(pfn, ctx->restore.populated_pfns);
-
-    return 0;
-}
-
-/*
- * Given a set of pfns, obtain memory from Xen to fill the physmap for the
- * unpopulated subset. If types is NULL, no page type checking is performed
- * and all unpopulated pfns are populated.
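
The bit-smearing in pfn_set_populated() above is a standard round-up trick worth spelling out: OR-ing a value with progressively larger right shifts of itself propagates the top set bit into every lower position, yielding the next 2^n - 1 at or above the input, so the tracking bitmap roughly doubles rather than growing once per pfn. A standalone sketch:

    /* Smear the top set bit downwards, e.g. 0x12345 -> 0x1ffff. */
    static xen_pfn_t example_round_up(xen_pfn_t v)
    {
        v |= v >> 1;
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        v |= v >> 16;
    #ifdef __x86_64__
        v |= v >> 32;   /* Only when xen_pfn_t is 64 bits wide. */
    #endif
        return v;
    }
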
- */ -int populate_pfns(struct xc_sr_context *ctx, unsigned int count, - const xen_pfn_t *original_pfns, const uint32_t *types) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t *mfns = malloc(count * sizeof(*mfns)), - *pfns = malloc(count * sizeof(*pfns)); - unsigned int i, nr_pfns = 0; - int rc = -1; - - if ( !mfns || !pfns ) - { - ERROR("Failed to allocate %zu bytes for populating the physmap", - 2 * count * sizeof(*mfns)); - goto err; - } - - for ( i = 0; i < count; ++i ) - { - if ( (!types || (types && - (types[i] != XEN_DOMCTL_PFINFO_XTAB && - types[i] != XEN_DOMCTL_PFINFO_BROKEN))) && - !pfn_is_populated(ctx, original_pfns[i]) ) - { - rc = pfn_set_populated(ctx, original_pfns[i]); - if ( rc ) - goto err; - pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i]; - ++nr_pfns; - } - } - - if ( nr_pfns ) - { - rc = xc_domain_populate_physmap_exact( - xch, ctx->domid, nr_pfns, 0, 0, mfns); - if ( rc ) - { - PERROR("Failed to populate physmap"); - goto err; - } - - for ( i = 0; i < nr_pfns; ++i ) - { - if ( mfns[i] == INVALID_MFN ) - { - ERROR("Populate physmap failed for pfn %u", i); - rc = -1; - goto err; - } - - ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]); - } - } - - rc = 0; - - err: - free(pfns); - free(mfns); - - return rc; -} - -/* - * Given a list of pfns, their types, and a block of page data from the - * stream, populate and record their types, map the relevant subset and copy - * the data into the guest. - */ -static int process_page_data(struct xc_sr_context *ctx, unsigned int count, - xen_pfn_t *pfns, uint32_t *types, void *page_data) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t *mfns = malloc(count * sizeof(*mfns)); - int *map_errs = malloc(count * sizeof(*map_errs)); - int rc; - void *mapping = NULL, *guest_page = NULL; - unsigned int i, /* i indexes the pfns from the record. */ - j, /* j indexes the subset of pfns we decide to map. */ - nr_pages = 0; - - if ( !mfns || !map_errs ) - { - rc = -1; - ERROR("Failed to allocate %zu bytes to process page data", - count * (sizeof(*mfns) + sizeof(*map_errs))); - goto err; - } - - rc = populate_pfns(ctx, count, pfns, types); - if ( rc ) - { - ERROR("Failed to populate pfns for batch of %u pages", count); - goto err; - } - - for ( i = 0; i < count; ++i ) - { - ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]); - - switch ( types[i] ) - { - case XEN_DOMCTL_PFINFO_NOTAB: - - case XEN_DOMCTL_PFINFO_L1TAB: - case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB: - - case XEN_DOMCTL_PFINFO_L2TAB: - case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB: - - case XEN_DOMCTL_PFINFO_L3TAB: - case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB: - - case XEN_DOMCTL_PFINFO_L4TAB: - case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB: - - mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]); - break; - } - } - - /* Nothing to do? */ - if ( nr_pages == 0 ) - goto done; - - mapping = guest_page = xenforeignmemory_map( - xch->fmem, ctx->domid, PROT_READ | PROT_WRITE, - nr_pages, mfns, map_errs); - if ( !mapping ) - { - rc = -1; - PERROR("Unable to map %u mfns for %u pages of data", - nr_pages, count); - goto err; - } - - for ( i = 0, j = 0; i < count; ++i ) - { - switch ( types[i] ) - { - case XEN_DOMCTL_PFINFO_XTAB: - case XEN_DOMCTL_PFINFO_BROKEN: - case XEN_DOMCTL_PFINFO_XALLOC: - /* No page data to deal with. 
*/ - continue; - } - - if ( map_errs[j] ) - { - rc = -1; - ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d", - pfns[i], mfns[j], types[i], map_errs[j]); - goto err; - } - - /* Undo page normalisation done by the saver. */ - rc = ctx->restore.ops.localise_page(ctx, types[i], page_data); - if ( rc ) - { - ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")", - pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT); - goto err; - } - - if ( ctx->restore.verify ) - { - /* Verify mode - compare incoming data to what we already have. */ - if ( memcmp(guest_page, page_data, PAGE_SIZE) ) - ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")", - pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT); - } - else - { - /* Regular mode - copy incoming data into place. */ - memcpy(guest_page, page_data, PAGE_SIZE); - } - - ++j; - guest_page += PAGE_SIZE; - page_data += PAGE_SIZE; - } - - done: - rc = 0; - - err: - if ( mapping ) - xenforeignmemory_unmap(xch->fmem, mapping, nr_pages); - - free(map_errs); - free(mfns); - - return rc; -} - -/* - * Validate a PAGE_DATA record from the stream, and pass the results to - * process_page_data() to actually perform the legwork. - */ -static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_page_data_header *pages = rec->data; - unsigned int i, pages_of_data = 0; - int rc = -1; - - xen_pfn_t *pfns = NULL, pfn; - uint32_t *types = NULL, type; - - /* - * v2 compatibility only exists for x86 streams. This is a bit of a - * bodge, but it is less bad than duplicating handle_page_data() between - * different architectures. - */ -#if defined(__i386__) || defined(__x86_64__) - /* v2 compat. Infer the position of STATIC_DATA_END. */ - if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end ) - { - rc = handle_static_data_end(ctx); - if ( rc ) - { - ERROR("Inferred STATIC_DATA_END record failed"); - goto err; - } - rc = -1; - } - - if ( !ctx->restore.seen_static_data_end ) - { - ERROR("No STATIC_DATA_END seen"); - goto err; - } -#endif - - if ( rec->length < sizeof(*pages) ) - { - ERROR("PAGE_DATA record truncated: length %u, min %zu", - rec->length, sizeof(*pages)); - goto err; - } - - if ( pages->count < 1 ) - { - ERROR("Expected at least 1 pfn in PAGE_DATA record"); - goto err; - } - - if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) ) - { - ERROR("PAGE_DATA record (length %u) too short to contain %u" - " pfns worth of information", rec->length, pages->count); - goto err; - } - - pfns = malloc(pages->count * sizeof(*pfns)); - types = malloc(pages->count * sizeof(*types)); - if ( !pfns || !types ) - { - ERROR("Unable to allocate enough memory for %u pfns", - pages->count); - goto err; - } - - for ( i = 0; i < pages->count; ++i ) - { - pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK; - if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) ) - { - ERROR("pfn %#"PRIpfn" (index %u) outside domain maximum", pfn, i); - goto err; - } - - type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32; - if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) && - ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) ) - { - ERROR("Invalid type %#"PRIx32" for pfn %#"PRIpfn" (index %u)", - type, pfn, i); - goto err; - } - - if ( type < XEN_DOMCTL_PFINFO_BROKEN ) - /* NOTAB and all L1 through L4 tables (including pinned) should - * have a page worth of data in the record. 
*/ - pages_of_data++; - - pfns[i] = pfn; - types[i] = type; - } - - if ( rec->length != (sizeof(*pages) + - (sizeof(uint64_t) * pages->count) + - (PAGE_SIZE * pages_of_data)) ) - { - ERROR("PAGE_DATA record wrong size: length %u, expected " - "%zu + %zu + %lu", rec->length, sizeof(*pages), - (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data)); - goto err; - } - - rc = process_page_data(ctx, pages->count, pfns, types, - &pages->pfn[pages->count]); - err: - free(types); - free(pfns); - - return rc; -} - -/* - * Send checkpoint dirty pfn list to primary. - */ -static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc = -1; - unsigned int count, written; - uint64_t i, *pfns = NULL; - struct iovec *iov = NULL; - xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size }; - struct xc_sr_record rec = { - .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST, - }; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->restore.dirty_bitmap_hbuf); - - if ( xc_shadow_control( - xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, - HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size, - NULL, 0, &stats) != ctx->restore.p2m_size ) - { - PERROR("Failed to retrieve logdirty bitmap"); - goto err; - } - - for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ ) - { - if ( test_bit(i, dirty_bitmap) ) - count++; - } - - - pfns = malloc(count * sizeof(*pfns)); - if ( !pfns ) - { - ERROR("Unable to allocate %zu bytes of memory for dirty pfn list", - count * sizeof(*pfns)); - goto err; - } - - for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i ) - { - if ( !test_bit(i, dirty_bitmap) ) - continue; - - if ( written > count ) - { - ERROR("Dirty pfn list exceed"); - goto err; - } - - pfns[written++] = i; - } - - /* iovec[] for writev(). 
*/ - iov = malloc(3 * sizeof(*iov)); - if ( !iov ) - { - ERROR("Unable to allocate memory for sending dirty bitmap"); - goto err; - } - - rec.length = count * sizeof(*pfns); - - iov[0].iov_base = &rec.type; - iov[0].iov_len = sizeof(rec.type); - - iov[1].iov_base = &rec.length; - iov[1].iov_len = sizeof(rec.length); - - iov[2].iov_base = pfns; - iov[2].iov_len = count * sizeof(*pfns); - - if ( writev_exact(ctx->restore.send_back_fd, iov, 3) ) - { - PERROR("Failed to write dirty bitmap to stream"); - goto err; - } - - rc = 0; - err: - free(pfns); - free(iov); - return rc; -} - -static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec); -static int handle_checkpoint(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc = 0, ret; - unsigned int i; - - if ( ctx->stream_type == XC_STREAM_PLAIN ) - { - ERROR("Found checkpoint in non-checkpointed stream"); - rc = -1; - goto err; - } - - ret = ctx->restore.callbacks->checkpoint(ctx->restore.callbacks->data); - switch ( ret ) - { - case XGR_CHECKPOINT_SUCCESS: - break; - - case XGR_CHECKPOINT_FAILOVER: - if ( ctx->restore.buffer_all_records ) - rc = BROKEN_CHANNEL; - else - /* We don't have a consistent state */ - rc = -1; - goto err; - - default: /* Other fatal error */ - rc = -1; - goto err; - } - - if ( ctx->restore.buffer_all_records ) - { - IPRINTF("All records buffered"); - - for ( i = 0; i < ctx->restore.buffered_rec_num; i++ ) - { - rc = process_record(ctx, &ctx->restore.buffered_records[i]); - if ( rc ) - goto err; - } - ctx->restore.buffered_rec_num = 0; - IPRINTF("All records processed"); - } - else - ctx->restore.buffer_all_records = true; - - if ( ctx->stream_type == XC_STREAM_COLO ) - { -#define HANDLE_CALLBACK_RETURN_VALUE(ret) \ - do { \ - if ( ret == 1 ) \ - rc = 0; /* Success */ \ - else \ - { \ - if ( ret == 2 ) \ - rc = BROKEN_CHANNEL; \ - else \ - rc = -1; /* Some unspecified error */ \ - goto err; \ - } \ - } while (0) - - /* COLO */ - - /* We need to resume guest */ - rc = ctx->restore.ops.stream_complete(ctx); - if ( rc ) - goto err; - - ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn, - ctx->restore.console_gfn, - ctx->restore.callbacks->data); - - /* Resume secondary vm */ - ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data); - HANDLE_CALLBACK_RETURN_VALUE(ret); - - /* Wait for a new checkpoint */ - ret = ctx->restore.callbacks->wait_checkpoint( - ctx->restore.callbacks->data); - HANDLE_CALLBACK_RETURN_VALUE(ret); - - /* suspend secondary vm */ - ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data); - HANDLE_CALLBACK_RETURN_VALUE(ret); - -#undef HANDLE_CALLBACK_RETURN_VALUE - - rc = send_checkpoint_dirty_pfn_list(ctx); - if ( rc ) - goto err; - } - - err: - return rc; -} - -static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - unsigned int new_alloc_num; - struct xc_sr_record *p; - - if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num ) - { - new_alloc_num = ctx->restore.allocated_rec_num + DEFAULT_BUF_RECORDS; - p = realloc(ctx->restore.buffered_records, - new_alloc_num * sizeof(struct xc_sr_record)); - if ( !p ) - { - ERROR("Failed to realloc memory for buffered records"); - return -1; - } - - ctx->restore.buffered_records = p; - ctx->restore.allocated_rec_num = new_alloc_num; - } - - memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++], - rec, sizeof(*rec)); - - return 0; -} - -int handle_static_data_end(struct xc_sr_context *ctx) -{ - 
    xc_interface *xch = ctx->xch;
-    unsigned int missing = 0;
-    int rc = 0;
-
-    if ( ctx->restore.seen_static_data_end )
-    {
-        ERROR("Multiple STATIC_DATA_END records found");
-        return -1;
-    }
-
-    ctx->restore.seen_static_data_end = true;
-
-    rc = ctx->restore.ops.static_data_complete(ctx, &missing);
-    if ( rc )
-        return rc;
-
-    if ( ctx->restore.callbacks->static_data_done &&
-         (rc = ctx->restore.callbacks->static_data_done(
-             missing, ctx->restore.callbacks->data)) != 0 )
-        ERROR("static_data_done() callback failed: %d\n", rc);
-
-    return rc;
-}
-
-static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
-    xc_interface *xch = ctx->xch;
-    int rc = 0;
-
-    switch ( rec->type )
-    {
-    case REC_TYPE_END:
-        break;
-
-    case REC_TYPE_PAGE_DATA:
-        rc = handle_page_data(ctx, rec);
-        break;
-
-    case REC_TYPE_VERIFY:
-        DPRINTF("Verify mode enabled");
-        ctx->restore.verify = true;
-        break;
-
-    case REC_TYPE_CHECKPOINT:
-        rc = handle_checkpoint(ctx);
-        break;
-
-    case REC_TYPE_STATIC_DATA_END:
-        rc = handle_static_data_end(ctx);
-        break;
-
-    default:
-        rc = ctx->restore.ops.process_record(ctx, rec);
-        break;
-    }
-
-    free(rec->data);
-    rec->data = NULL;
-
-    return rc;
-}
-
-static int setup(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-    int rc;
-    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
-                                    &ctx->restore.dirty_bitmap_hbuf);
-
-    if ( ctx->stream_type == XC_STREAM_COLO )
-    {
-        dirty_bitmap = xc_hypercall_buffer_alloc_pages(
-            xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
-
-        if ( !dirty_bitmap )
-        {
-            ERROR("Unable to allocate memory for dirty bitmap");
-            rc = -1;
-            goto err;
-        }
-    }
-
-    rc = ctx->restore.ops.setup(ctx);
-    if ( rc )
-        goto err;
-
-    ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
-    ctx->restore.populated_pfns = bitmap_alloc(
-        ctx->restore.max_populated_pfn + 1);
-    if ( !ctx->restore.populated_pfns )
-    {
-        ERROR("Unable to allocate memory for populated_pfns bitmap");
-        rc = -1;
-        goto err;
-    }
-
-    ctx->restore.buffered_records = malloc(
-        DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record));
-    if ( !ctx->restore.buffered_records )
-    {
-        ERROR("Unable to allocate memory for buffered records");
-        rc = -1;
-        goto err;
-    }
-    ctx->restore.allocated_rec_num = DEFAULT_BUF_RECORDS;
-
- err:
-    return rc;
-}
-
-static void cleanup(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-    unsigned int i;
-    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
-                                    &ctx->restore.dirty_bitmap_hbuf);
-
-    for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
-        free(ctx->restore.buffered_records[i].data);
-
-    if ( ctx->stream_type == XC_STREAM_COLO )
-        xc_hypercall_buffer_free_pages(
-            xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
-
-    free(ctx->restore.buffered_records);
-    free(ctx->restore.populated_pfns);
-
-    if ( ctx->restore.ops.cleanup(ctx) )
-        PERROR("Failed to clean up");
-}
-
-/*
- * Restore a domain.
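
A condensed sketch (error paths and the Remus/COLO failover handling elided) of the loop at the heart of restore() below: records are read until END, and in checkpointed streams everything between CHECKPOINT records is buffered so that a torn checkpoint can be discarded wholesale:

    do {
        rc = read_record(ctx, ctx->fd, &rec);

        if ( !rc && ctx->restore.buffer_all_records &&
             rec.type != REC_TYPE_END && rec.type != REC_TYPE_CHECKPOINT )
            rc = buffer_record(ctx, &rec);   /* Replayed at next CHECKPOINT. */
        else if ( !rc )
            rc = process_record(ctx, &rec);  /* May return the sentinels. */
    } while ( !rc && rec.type != REC_TYPE_END );
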
- */
-static int restore(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-    struct xc_sr_record rec;
-    int rc, saved_rc = 0, saved_errno = 0;
-
-    IPRINTF("Restoring domain");
-
-    rc = setup(ctx);
-    if ( rc )
-        goto err;
-
-    do
-    {
-        rc = read_record(ctx, ctx->fd, &rec);
-        if ( rc )
-        {
-            if ( ctx->restore.buffer_all_records )
-                goto remus_failover;
-            else
-                goto err;
-        }
-
-        if ( ctx->restore.buffer_all_records &&
-             rec.type != REC_TYPE_END &&
-             rec.type != REC_TYPE_CHECKPOINT )
-        {
-            rc = buffer_record(ctx, &rec);
-            if ( rc )
-                goto err;
-        }
-        else
-        {
-            rc = process_record(ctx, &rec);
-            if ( rc == RECORD_NOT_PROCESSED )
-            {
-                if ( rec.type & REC_TYPE_OPTIONAL )
-                    DPRINTF("Ignoring optional record %#x (%s)",
-                            rec.type, rec_type_to_str(rec.type));
-                else
-                {
-                    ERROR("Mandatory record %#x (%s) not handled",
-                          rec.type, rec_type_to_str(rec.type));
-                    rc = -1;
-                    goto err;
-                }
-            }
-            else if ( rc == BROKEN_CHANNEL )
-                goto remus_failover;
-            else if ( rc )
-                goto err;
-        }
-
-    } while ( rec.type != REC_TYPE_END );
-
- remus_failover:
-    if ( ctx->stream_type == XC_STREAM_COLO )
-    {
-        /* With COLO, we have already called stream_complete */
-        rc = 0;
-        IPRINTF("COLO Failover");
-        goto done;
-    }
-
-    /*
-     * With Remus, if we reach here, there must be some error on the primary;
-     * fail over from the last checkpoint state.
-     */
-    rc = ctx->restore.ops.stream_complete(ctx);
-    if ( rc )
-        goto err;
-
-    IPRINTF("Restore successful");
-    goto done;
-
- err:
-    saved_errno = errno;
-    saved_rc = rc;
-    PERROR("Restore failed");
-
- done:
-    cleanup(ctx);
-
-    if ( saved_rc )
-    {
-        rc = saved_rc;
-        errno = saved_errno;
-    }
-
-    return rc;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
-                      unsigned int store_evtchn, unsigned long *store_mfn,
-                      uint32_t store_domid, unsigned int console_evtchn,
-                      unsigned long *console_gfn, uint32_t console_domid,
-                      xc_stream_type_t stream_type,
-                      struct restore_callbacks *callbacks, int send_back_fd)
-{
-    xen_pfn_t nr_pfns;
-    struct xc_sr_context ctx = {
-        .xch = xch,
-        .fd = io_fd,
-        .stream_type = stream_type,
-    };
-
-    /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
-    ctx.restore.console_evtchn = console_evtchn;
-    ctx.restore.console_domid = console_domid;
-    ctx.restore.xenstore_evtchn = store_evtchn;
-    ctx.restore.xenstore_domid = store_domid;
-    ctx.restore.callbacks = callbacks;
-    ctx.restore.send_back_fd = send_back_fd;
-
-    /* Sanity check stream_type-related parameters */
-    switch ( stream_type )
-    {
-    case XC_STREAM_COLO:
-        assert(callbacks->suspend &&
-               callbacks->postcopy &&
-               callbacks->wait_checkpoint &&
-               callbacks->restore_results);
-        /* Fallthrough */
-    case XC_STREAM_REMUS:
-        assert(callbacks->checkpoint);
-        /* Fallthrough */
-    case XC_STREAM_PLAIN:
-        break;
-
-    default:
-        assert(!"Bad stream_type");
-        break;
-    }
-
-    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
-    {
-        PERROR("Failed to get domain info");
-        return -1;
-    }
-
-    if ( ctx.dominfo.domid != dom )
-    {
-        ERROR("Domain %u does not exist", dom);
-        return -1;
-    }
-
-    DPRINTF("fd %d, dom %u, hvm %u, stream_type %d",
-            io_fd, dom, ctx.dominfo.hvm, stream_type);
-
-    ctx.domid = dom;
-
-    if ( read_headers(&ctx) )
-        return -1;
-
-    if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
-    {
-        PERROR("Unable to obtain the guest p2m size");
-        return -1;
-    }
-
-    ctx.restore.p2m_size = nr_pfns;
-    ctx.restore.ops = ctx.dominfo.hvm
-        ?
restore_ops_x86_hvm : restore_ops_x86_pv; - - if ( restore(&ctx) ) - return -1; - - IPRINTF("XenStore: mfn %#"PRIpfn", dom %d, evt %u", - ctx.restore.xenstore_gfn, - ctx.restore.xenstore_domid, - ctx.restore.xenstore_evtchn); - - IPRINTF("Console: mfn %#"PRIpfn", dom %d, evt %u", - ctx.restore.console_gfn, - ctx.restore.console_domid, - ctx.restore.console_evtchn); - - *console_gfn = ctx.restore.console_gfn; - *store_mfn = ctx.restore.xenstore_gfn; - - return 0; -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_restore_x86_hvm.c b/tools/libxc/xg_sr_restore_x86_hvm.c deleted file mode 100644 index d6ea6f3012..0000000000 --- a/tools/libxc/xg_sr_restore_x86_hvm.c +++ /dev/null @@ -1,274 +0,0 @@ -#include -#include - -#include "xg_sr_common_x86.h" - -/* - * Process an HVM_CONTEXT record from the stream. - */ -static int handle_hvm_context(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - int rc = update_blob(&ctx->x86.hvm.restore.context, rec->data, rec->length); - - if ( rc ) - ERROR("Unable to allocate %u bytes for hvm context", rec->length); - - return rc; -} - -/* - * Process an HVM_PARAMS record from the stream. - */ -static int handle_hvm_params(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_hvm_params *hdr = rec->data; - struct xc_sr_rec_hvm_params_entry *entry = hdr->param; - unsigned int i; - int rc; - - if ( rec->length < sizeof(*hdr) ) - { - ERROR("HVM_PARAMS record truncated: length %u, header size %zu", - rec->length, sizeof(*hdr)); - return -1; - } - - if ( rec->length != (sizeof(*hdr) + hdr->count * sizeof(*entry)) ) - { - ERROR("HVM_PARAMS record truncated: header %zu, count %u, " - "expected len %zu, got %u", - sizeof(*hdr), hdr->count, hdr->count * sizeof(*entry), - rec->length); - return -1; - } - - /* - * Tolerate empty records. Older sending sides used to accidentally - * generate them. - */ - if ( hdr->count == 0 ) - { - DBGPRINTF("Skipping empty HVM_PARAMS record\n"); - return 0; - } - - for ( i = 0; i < hdr->count; i++, entry++ ) - { - switch ( entry->index ) - { - case HVM_PARAM_CONSOLE_PFN: - ctx->restore.console_gfn = entry->value; - xc_clear_domain_page(xch, ctx->domid, entry->value); - break; - case HVM_PARAM_STORE_PFN: - ctx->restore.xenstore_gfn = entry->value; - xc_clear_domain_page(xch, ctx->domid, entry->value); - break; - case HVM_PARAM_IOREQ_PFN: - case HVM_PARAM_BUFIOREQ_PFN: - xc_clear_domain_page(xch, ctx->domid, entry->value); - break; - - case HVM_PARAM_PAE_ENABLED: - /* - * This HVM_PARAM only ever existed to pass data into - * xc_cpuid_apply_policy(). The function has now been updated to - * use a normal calling convention, making the param obsolete. - * - * Discard if we find it in an old migration stream. - */ - continue; - } - - rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value); - if ( rc < 0 ) - { - PERROR("set HVM param %"PRId64" = 0x%016"PRIx64, - entry->index, entry->value); - return rc; - } - } - return 0; -} - -/* restore_ops function. */ -static bool x86_hvm_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn) -{ - return true; -} - -/* restore_ops function. */ -static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx, - xen_pfn_t pfn) -{ - return pfn; -} - -/* restore_ops function. 
*/ -static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn, - xen_pfn_t gfn) -{ - /* no op */ -} - -/* restore_ops function. */ -static void x86_hvm_set_page_type(struct xc_sr_context *ctx, - xen_pfn_t pfn, xen_pfn_t type) -{ - /* no-op */ -} - -/* restore_ops function. */ -static int x86_hvm_localise_page(struct xc_sr_context *ctx, - uint32_t type, void *page) -{ - /* no-op */ - return 0; -} - -/* - * restore_ops function. Confirms the stream matches the domain. - */ -static int x86_hvm_setup(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - - if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM ) - { - ERROR("Unable to restore %s domain into an x86 HVM domain", - dhdr_type_to_str(ctx->restore.guest_type)); - return -1; - } - - if ( ctx->restore.guest_page_size != PAGE_SIZE ) - { - ERROR("Invalid page size %u for x86 HVM domains", - ctx->restore.guest_page_size); - return -1; - } - -#ifdef __i386__ - /* Very large domains (> 1TB) will exhaust virtual address space. */ - if ( ctx->restore.p2m_size > 0x0fffffff ) - { - errno = E2BIG; - PERROR("Cannot restore this big a guest"); - return -1; - } -#endif - - return 0; -} - -/* - * restore_ops function. - */ -static int x86_hvm_process_record(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - switch ( rec->type ) - { - case REC_TYPE_X86_TSC_INFO: - return handle_x86_tsc_info(ctx, rec); - - case REC_TYPE_HVM_CONTEXT: - return handle_hvm_context(ctx, rec); - - case REC_TYPE_HVM_PARAMS: - return handle_hvm_params(ctx, rec); - - case REC_TYPE_X86_CPUID_POLICY: - return handle_x86_cpuid_policy(ctx, rec); - - case REC_TYPE_X86_MSR_POLICY: - return handle_x86_msr_policy(ctx, rec); - - default: - return RECORD_NOT_PROCESSED; - } -} - -/* - * restore_ops function. Sets extra hvm parameters and seeds the grant table. 
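The #ifdef __i386__ guard in x86_hvm_setup() above is pure address-space arithmetic: 0x0fffffff pfns of 4 KiB each is 1 TiB, more than a 32-bit toolstack can map. A quick standalone check of that bound:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t max_pfns = 0x0fffffff;          /* the guard's threshold */
    uint64_t bytes = (max_pfns + 1) << 12;   /* 4 KiB per pfn */

    assert(bytes == (1ULL << 40));           /* exactly 1 TiB */
    return 0;
}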
- */ -static int x86_hvm_stream_complete(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - - rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_STORE_EVTCHN, - ctx->restore.xenstore_evtchn); - if ( rc ) - { - PERROR("Failed to set HVM_PARAM_STORE_EVTCHN"); - return rc; - } - - rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_CONSOLE_EVTCHN, - ctx->restore.console_evtchn); - if ( rc ) - { - PERROR("Failed to set HVM_PARAM_CONSOLE_EVTCHN"); - return rc; - } - - rc = xc_domain_hvm_setcontext(xch, ctx->domid, - ctx->x86.hvm.restore.context.ptr, - ctx->x86.hvm.restore.context.size); - if ( rc < 0 ) - { - PERROR("Unable to restore HVM context"); - return rc; - } - - rc = xc_dom_gnttab_seed(xch, ctx->domid, true, - ctx->restore.console_gfn, - ctx->restore.xenstore_gfn, - ctx->restore.console_domid, - ctx->restore.xenstore_domid); - if ( rc ) - { - PERROR("Failed to seed grant table"); - return rc; - } - - return rc; -} - -static int x86_hvm_cleanup(struct xc_sr_context *ctx) -{ - free(ctx->x86.hvm.restore.context.ptr); - - free(ctx->x86.restore.cpuid.ptr); - free(ctx->x86.restore.msr.ptr); - - return 0; -} - -struct xc_sr_restore_ops restore_ops_x86_hvm = -{ - .pfn_is_valid = x86_hvm_pfn_is_valid, - .pfn_to_gfn = x86_hvm_pfn_to_gfn, - .set_gfn = x86_hvm_set_gfn, - .set_page_type = x86_hvm_set_page_type, - .localise_page = x86_hvm_localise_page, - .setup = x86_hvm_setup, - .process_record = x86_hvm_process_record, - .static_data_complete = x86_static_data_complete, - .stream_complete = x86_hvm_stream_complete, - .cleanup = x86_hvm_cleanup, -}; - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_restore_x86_pv.c b/tools/libxc/xg_sr_restore_x86_pv.c deleted file mode 100644 index dc50b0f5a8..0000000000 --- a/tools/libxc/xg_sr_restore_x86_pv.c +++ /dev/null @@ -1,1210 +0,0 @@ -#include - -#include "xg_sr_common_x86_pv.h" - -static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn) -{ - assert(pfn <= ctx->x86.pv.max_pfn); - - return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width); -} - -/* - * Expand our local tracking information for the p2m table and domains maximum - * size. Normally this will be called once to expand from 0 to max_pfn, but - * is liable to expand multiple times if the domain grows on the sending side - * after migration has started. 
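restore_ops_x86_hvm above completes the vtable pattern this code relies on: the generic restore loop calls through a struct of function pointers picked once, from the domain header, and never branches on guest type again. A minimal sketch of that dispatch shape; all names here are illustrative, not the libxenguest API:

#include <stdio.h>

struct ops {
    int (*setup)(void);
    int (*process)(int rec_type);
};

static int hvm_setup(void)    { puts("hvm setup"); return 0; }
static int hvm_process(int t) { (void)t; return 0; }

static const struct ops hvm_ops = { hvm_setup, hvm_process };

/* The generic loop only ever sees 'ops', never the guest flavour. */
static int run(const struct ops *ops)
{
    if ( ops->setup() )
        return -1;
    return ops->process(0);
}

int main(void)
{
    return run(&hvm_ops);
}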
- */ -static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn) -{ - xc_interface *xch = ctx->xch; - unsigned long old_max = ctx->x86.pv.max_pfn, i; - unsigned int fpp = PAGE_SIZE / ctx->x86.pv.width; - unsigned long end_frame = (max_pfn / fpp) + 1; - unsigned long old_end_frame = (old_max / fpp) + 1; - xen_pfn_t *p2m = NULL, *p2m_pfns = NULL; - uint32_t *pfn_types = NULL; - size_t p2msz, p2m_pfnsz, pfn_typesz; - - assert(max_pfn > old_max); - - p2msz = (max_pfn + 1) * ctx->x86.pv.width; - p2m = realloc(ctx->x86.pv.p2m, p2msz); - if ( !p2m ) - { - ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz); - return -1; - } - ctx->x86.pv.p2m = p2m; - - pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types); - pfn_types = realloc(ctx->x86.pv.restore.pfn_types, pfn_typesz); - if ( !pfn_types ) - { - ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz); - return -1; - } - ctx->x86.pv.restore.pfn_types = pfn_types; - - p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns); - p2m_pfns = realloc(ctx->x86.pv.p2m_pfns, p2m_pfnsz); - if ( !p2m_pfns ) - { - ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz); - return -1; - } - ctx->x86.pv.p2m_frames = end_frame; - ctx->x86.pv.p2m_pfns = p2m_pfns; - - ctx->x86.pv.max_pfn = max_pfn; - for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i ) - { - ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN); - ctx->restore.ops.set_page_type(ctx, i, 0); - } - - for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i ) - ctx->x86.pv.p2m_pfns[i] = INVALID_MFN; - - DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn); - return 0; -} - -/* - * Pin all of the pagetables. - */ -static int pin_pagetables(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - unsigned long i, nr_pins; - struct mmuext_op pin[MAX_PIN_BATCH]; - - for ( i = nr_pins = 0; i <= ctx->x86.pv.max_pfn; ++i ) - { - if ( (ctx->x86.pv.restore.pfn_types[i] & - XEN_DOMCTL_PFINFO_LPINTAB) == 0 ) - continue; - - switch ( (ctx->x86.pv.restore.pfn_types[i] & - XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ) - { - case XEN_DOMCTL_PFINFO_L1TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE; - break; - case XEN_DOMCTL_PFINFO_L2TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE; - break; - case XEN_DOMCTL_PFINFO_L3TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE; - break; - case XEN_DOMCTL_PFINFO_L4TAB: - pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE; - break; - default: - continue; - } - - pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i); - nr_pins++; - - if ( nr_pins == MAX_PIN_BATCH ) - { - if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 ) - { - PERROR("Failed to pin batch of pagetables"); - return -1; - } - nr_pins = 0; - } - } - - if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) ) - { - PERROR("Failed to pin batch of pagetables"); - return -1; - } - - return 0; -} - -/* - * Update details in a guests start_info structure. 
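The sizing arithmetic in expand_p2m() above is easy to verify by hand. A sketch assuming 4 KiB pages; width is the guest's p2m entry size in bytes:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

/* How many p2m frames cover pfns 0..max_pfn for a given entry width. */
static unsigned long p2m_frames(unsigned long max_pfn, unsigned int width)
{
    unsigned int fpp = PAGE_SIZE / width;   /* entries per frame */

    return max_pfn / fpp + 1;
}

int main(void)
{
    /* 64-bit guest: 512 entries per frame, so pfns 0..0x3ffff need
     * exactly 512 p2m frames. */
    assert(p2m_frames(0x3ffff, 8) == 512);
    /* A 32-bit guest packs twice as many entries per frame. */
    assert(p2m_frames(0x3ffff, 4) == 256);
    printf("p2m sizing checks pass\n");
    return 0;
}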
- */ -static int process_start_info(struct xc_sr_context *ctx, - vcpu_guest_context_any_t *vcpu) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t pfn, mfn; - start_info_any_t *guest_start_info = NULL; - int rc = -1; - - pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86.pv.width); - - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("Start Info pfn %#lx out of range", pfn); - goto err; - } - - if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB ) - { - ERROR("Start Info pfn %#lx has bad type %u", pfn, - (ctx->x86.pv.restore.pfn_types[pfn] >> - XEN_DOMCTL_PFINFO_LTAB_SHIFT)); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Start Info has bad mfn"); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86.pv.width); - guest_start_info = xc_map_foreign_range( - xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); - if ( !guest_start_info ) - { - PERROR("Failed to map Start Info at mfn %#lx", mfn); - goto err; - } - - /* Deal with xenstore stuff */ - pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86.pv.width); - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("XenStore pfn %#lx out of range", pfn); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("XenStore pfn has bad mfn"); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - ctx->restore.xenstore_gfn = mfn; - SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86.pv.width); - SET_FIELD(guest_start_info, store_evtchn, - ctx->restore.xenstore_evtchn, ctx->x86.pv.width); - - /* Deal with console stuff */ - pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86.pv.width); - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("Console pfn %#lx out of range", pfn); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Console pfn has bad mfn"); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - ctx->restore.console_gfn = mfn; - SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86.pv.width); - SET_FIELD(guest_start_info, console.domU.evtchn, - ctx->restore.console_evtchn, ctx->x86.pv.width); - - /* Set other information */ - SET_FIELD(guest_start_info, nr_pages, - ctx->x86.pv.max_pfn + 1, ctx->x86.pv.width); - SET_FIELD(guest_start_info, shared_info, - ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86.pv.width); - SET_FIELD(guest_start_info, flags, 0, ctx->x86.pv.width); - - rc = 0; - - err: - if ( guest_start_info ) - munmap(guest_start_info, PAGE_SIZE); - - return rc; -} - -/* - * Process one stashed vcpu worth of basic state and send to Xen. - */ -static int process_vcpu_basic(struct xc_sr_context *ctx, - unsigned int vcpuid) -{ - xc_interface *xch = ctx->xch; - vcpu_guest_context_any_t *vcpu = ctx->x86.pv.restore.vcpus[vcpuid].basic.ptr; - xen_pfn_t pfn, mfn; - unsigned int i, gdt_count; - int rc = -1; - - /* Vcpu 0 is special: Convert the suspend record to an mfn. */ - if ( vcpuid == 0 ) - { - rc = process_start_info(ctx, vcpu); - if ( rc ) - return rc; - rc = -1; - } - - SET_FIELD(vcpu, flags, - GET_FIELD(vcpu, flags, ctx->x86.pv.width) | VGCF_online, - ctx->x86.pv.width); - - gdt_count = GET_FIELD(vcpu, gdt_ents, ctx->x86.pv.width); - if ( gdt_count > FIRST_RESERVED_GDT_ENTRY ) - { - ERROR("GDT entry count (%u) out of range (max %u)", - gdt_count, FIRST_RESERVED_GDT_ENTRY); - errno = ERANGE; - goto err; - } - gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. 
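The (gdt_count + 511) / 512 conversion above assumes 8-byte GDT entries, i.e. 512 per 4 KiB frame, with the usual round-up division. Checking the edges:

#include <assert.h>

/* Round a GDT entry count up to whole 4 KiB frames (8-byte entries). */
static unsigned int gdt_frames(unsigned int ents)
{
    return (ents + 511) / 512;
}

int main(void)
{
    assert(gdt_frames(1)   == 1);   /* any non-empty GDT needs a frame */
    assert(gdt_frames(512) == 1);   /* exactly one full frame */
    assert(gdt_frames(513) == 2);   /* spills into a second frame */
    return 0;
}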
*/ - - /* Convert GDT frames to mfns. */ - for ( i = 0; i < gdt_count; ++i ) - { - pfn = GET_FIELD(vcpu, gdt_frames[i], ctx->x86.pv.width); - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn); - goto err; - } - - if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) - { - ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn, - (ctx->x86.pv.restore.pfn_types[pfn] >> - XEN_DOMCTL_PFINFO_LTAB_SHIFT)); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("GDT frame %u has bad mfn", i); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - SET_FIELD(vcpu, gdt_frames[i], mfn, ctx->x86.pv.width); - } - - /* Convert CR3 to an mfn. */ - pfn = cr3_to_mfn(ctx, GET_FIELD(vcpu, ctrlreg[3], ctx->x86.pv.width)); - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("cr3 (pfn %#lx) out of range", pfn); - goto err; - } - - if ( (ctx->x86.pv.restore.pfn_types[pfn] & - XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != - (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) - { - ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn, - (ctx->x86.pv.restore.pfn_types[pfn] >> - XEN_DOMCTL_PFINFO_LTAB_SHIFT), - ctx->x86.pv.levels); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("cr3 has bad mfn"); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - SET_FIELD(vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86.pv.width); - - /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */ - if ( ctx->x86.pv.levels == 4 && (vcpu->x64.ctrlreg[1] & 1) ) - { - pfn = vcpu->x64.ctrlreg[1] >> PAGE_SHIFT; - - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("cr1 (pfn %#lx) out of range", pfn); - goto err; - } - - if ( (ctx->x86.pv.restore.pfn_types[pfn] & - XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != - (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) - { - ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn, - (ctx->x86.pv.restore.pfn_types[pfn] >> - XEN_DOMCTL_PFINFO_LTAB_SHIFT), - ctx->x86.pv.levels); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("cr1 has bad mfn"); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - vcpu->x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT; - } - - if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, vcpu) ) - { - PERROR("Failed to set vcpu%u's basic info", vcpuid); - goto err; - } - - rc = 0; - - err: - return rc; -} - -/* - * Process one stashed vcpu worth of extended state and send to Xen. - */ -static int process_vcpu_extended(struct xc_sr_context *ctx, - unsigned int vcpuid) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_x86_pv_restore_vcpu *vcpu = - &ctx->x86.pv.restore.vcpus[vcpuid]; - DECLARE_DOMCTL; - - domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext; - domctl.domain = ctx->domid; - memcpy(&domctl.u.ext_vcpucontext, vcpu->extd.ptr, vcpu->extd.size); - - if ( xc_domctl(xch, &domctl) != 0 ) - { - PERROR("Failed to set vcpu%u's extended info", vcpuid); - return -1; - } - - return 0; -} - -/* - * Process one stashed vcpu worth of xsave state and send to Xen. 
- */ -static int process_vcpu_xsave(struct xc_sr_context *ctx, - unsigned int vcpuid) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_x86_pv_restore_vcpu *vcpu = - &ctx->x86.pv.restore.vcpus[vcpuid]; - int rc; - DECLARE_DOMCTL; - DECLARE_HYPERCALL_BUFFER(void, buffer); - - buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsave.size); - if ( !buffer ) - { - ERROR("Unable to allocate %zu bytes for xsave hypercall buffer", - vcpu->xsave.size); - return -1; - } - - domctl.cmd = XEN_DOMCTL_setvcpuextstate; - domctl.domain = ctx->domid; - domctl.u.vcpuextstate.vcpu = vcpuid; - domctl.u.vcpuextstate.size = vcpu->xsave.size; - set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); - - memcpy(buffer, vcpu->xsave.ptr, vcpu->xsave.size); - - rc = xc_domctl(xch, &domctl); - if ( rc ) - PERROR("Failed to set vcpu%u's xsave info", vcpuid); - - xc_hypercall_buffer_free(xch, buffer); - - return rc; -} - -/* - * Process one stashed vcpu worth of msr state and send to Xen. - */ -static int process_vcpu_msrs(struct xc_sr_context *ctx, - unsigned int vcpuid) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_x86_pv_restore_vcpu *vcpu = - &ctx->x86.pv.restore.vcpus[vcpuid]; - int rc; - DECLARE_DOMCTL; - DECLARE_HYPERCALL_BUFFER(void, buffer); - - buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msr.size); - if ( !buffer ) - { - ERROR("Unable to allocate %zu bytes for msr hypercall buffer", - vcpu->msr.size); - return -1; - } - - domctl.cmd = XEN_DOMCTL_set_vcpu_msrs; - domctl.domain = ctx->domid; - domctl.u.vcpu_msrs.vcpu = vcpuid; - domctl.u.vcpu_msrs.msr_count = vcpu->msr.size / sizeof(xen_domctl_vcpu_msr_t); - set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer); - - memcpy(buffer, vcpu->msr.ptr, vcpu->msr.size); - - rc = xc_domctl(xch, &domctl); - if ( rc ) - PERROR("Failed to set vcpu%u's msrs", vcpuid); - - xc_hypercall_buffer_free(xch, buffer); - - return rc; -} - -/* - * Process all stashed vcpu context and send to Xen. - */ -static int update_vcpu_context(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_x86_pv_restore_vcpu *vcpu; - unsigned int i; - int rc = 0; - - for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i ) - { - vcpu = &ctx->x86.pv.restore.vcpus[i]; - - if ( vcpu->basic.ptr ) - { - rc = process_vcpu_basic(ctx, i); - if ( rc ) - return rc; - } - else if ( i == 0 ) - { - ERROR("Sender didn't send vcpu0's basic state"); - return -1; - } - - if ( vcpu->extd.ptr ) - { - rc = process_vcpu_extended(ctx, i); - if ( rc ) - return rc; - } - - if ( vcpu->xsave.ptr ) - { - rc = process_vcpu_xsave(ctx, i); - if ( rc ) - return rc; - } - - if ( vcpu->msr.ptr ) - { - rc = process_vcpu_msrs(ctx, i); - if ( rc ) - return rc; - } - } - - return rc; -} - -/* - * Copy the p2m which has been constructed locally as memory has been - * allocated, over the p2m in guest, so the guest can find its memory again on - * resume. 
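process_vcpu_msrs above derives msr_count from the blob size alone, which only works because each wire entry has a fixed size. A sketch of that layout; the field names follow the public domctl interface but should be treated as illustrative:

#include <assert.h>
#include <stdint.h>

struct msr_entry {
    uint32_t index;
    uint32_t reserved;
    uint64_t value;
};

int main(void)
{
    assert(sizeof(struct msr_entry) == 16);
    /* A 64-byte X86_PV_VCPU_MSRS payload therefore carries 4 MSRs,
     * which is why the record length must be a multiple of 16. */
    assert(64 / sizeof(struct msr_entry) == 4);
    return 0;
}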
- */ -static int update_guest_p2m(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t mfn, pfn, *guest_p2m = NULL; - unsigned int i; - int rc = -1; - - for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i ) - { - pfn = ctx->x86.pv.p2m_pfns[i]; - - if ( pfn > ctx->x86.pv.max_pfn ) - { - ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range", - pfn, i); - goto err; - } - - if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) - { - ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i, - (ctx->x86.pv.restore.pfn_types[pfn] >> - XEN_DOMCTL_PFINFO_LTAB_SHIFT)); - goto err; - } - - mfn = pfn_to_mfn(ctx, pfn); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("p2m_frame_list[%u] has bad mfn", i); - dump_bad_pseudophysmap_entry(ctx, mfn); - goto err; - } - - ctx->x86.pv.p2m_pfns[i] = mfn; - } - - guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE, - ctx->x86.pv.p2m_pfns, - ctx->x86.pv.p2m_frames); - if ( !guest_p2m ) - { - PERROR("Failed to map p2m frames"); - goto err; - } - - memcpy(guest_p2m, ctx->x86.pv.p2m, - (ctx->x86.pv.max_pfn + 1) * ctx->x86.pv.width); - rc = 0; - - err: - if ( guest_p2m ) - munmap(guest_p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE); - - return rc; -} - -/* - * The valid width/pt_levels values in X86_PV_INFO are inextricably linked. - * Cross-check the legitimate combinations. - */ -static bool valid_x86_pv_info_combination( - const struct xc_sr_rec_x86_pv_info *info) -{ - switch ( info->guest_width ) - { - case 4: return info->pt_levels == 3; - case 8: return info->pt_levels == 4; - default: return false; - } -} - -/* - * Process an X86_PV_INFO record. - */ -static int handle_x86_pv_info(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_pv_info *info = rec->data; - - if ( ctx->x86.pv.restore.seen_pv_info ) - { - ERROR("Already received X86_PV_INFO record"); - return -1; - } - - if ( rec->length < sizeof(*info) ) - { - ERROR("X86_PV_INFO record truncated: length %u, expected %zu", - rec->length, sizeof(*info)); - return -1; - } - - if ( !valid_x86_pv_info_combination(info) ) - { - ERROR("Invalid X86_PV_INFO combination: width %u, pt_levels %u", - info->guest_width, info->pt_levels); - return -1; - } - - /* - * PV domains default to native width. For an incomming compat domain, we - * will typically be the first entity to inform Xen. - */ - if ( info->guest_width != ctx->x86.pv.width ) - { - struct xen_domctl domctl = { - .domain = ctx->domid, - .cmd = XEN_DOMCTL_set_address_size, - .u.address_size.size = info->guest_width * 8, - }; - int rc = do_domctl(xch, &domctl); - - if ( rc != 0 ) - { - ERROR("Failed to update d%d address size to %u", - ctx->domid, info->guest_width * 8); - return -1; - } - - /* Domain's information changed, better to refresh. */ - rc = x86_pv_domain_info(ctx); - if ( rc != 0 ) - { - ERROR("Unable to refresh guest information"); - return -1; - } - } - - /* Sanity check (possibly new) domain settings. */ - if ( (info->guest_width != ctx->x86.pv.width) || - (info->pt_levels != ctx->x86.pv.levels) ) - { - ERROR("X86_PV_INFO width/pt_levels settings %u/%u mismatch with d%d %u/%u", - info->guest_width, info->pt_levels, ctx->domid, - ctx->x86.pv.width, ctx->x86.pv.levels); - return -1; - } - - ctx->x86.pv.restore.seen_pv_info = true; - return 0; -} - -/* - * Process an X86_PV_P2M_FRAMES record. Takes care of expanding the local p2m - * state if needed. 
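valid_x86_pv_info_combination() above admits exactly two ABIs: 32-bit PAE guests (4-byte width, 3 paging levels) and 64-bit guests (8-byte width, 4 levels). Restated standalone, with the rejects spelled out:

#include <assert.h>
#include <stdbool.h>

static bool valid_combo(unsigned int width, unsigned int pt_levels)
{
    switch ( width )
    {
    case 4: return pt_levels == 3;   /* 32-bit PAE guest */
    case 8: return pt_levels == 4;   /* 64-bit guest */
    default: return false;
    }
}

int main(void)
{
    assert(valid_combo(4, 3) && valid_combo(8, 4));
    assert(!valid_combo(4, 4) && !valid_combo(8, 3) && !valid_combo(2, 2));
    return 0;
}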
- */ -static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data; - unsigned int start, end, x, fpp = PAGE_SIZE / ctx->x86.pv.width; - int rc; - - /* v2 compat. Infer the position of STATIC_DATA_END. */ - if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end ) - { - rc = handle_static_data_end(ctx); - if ( rc ) - { - ERROR("Inferred STATIC_DATA_END record failed"); - return rc; - } - } - - if ( !ctx->restore.seen_static_data_end ) - { - ERROR("No STATIC_DATA_END seen"); - return -1; - } - - if ( !ctx->x86.pv.restore.seen_pv_info ) - { - ERROR("Not yet received X86_PV_INFO record"); - return -1; - } - - if ( rec->length < sizeof(*data) ) - { - ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu", - rec->length, sizeof(*data) + sizeof(uint64_t)); - return -1; - } - - if ( data->start_pfn > data->end_pfn ) - { - ERROR("End pfn in stream (%#x) exceeds Start (%#x)", - data->end_pfn, data->start_pfn); - return -1; - } - - start = data->start_pfn / fpp; - end = data->end_pfn / fpp + 1; - - if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) ) - { - ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x" - ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu", - data->start_pfn, data->end_pfn, rec->length, - sizeof(*data), end, start, sizeof(uint64_t)); - return -1; - } - - if ( data->end_pfn > ctx->x86.pv.max_pfn ) - { - rc = expand_p2m(ctx, data->end_pfn); - if ( rc ) - return rc; - } - - for ( x = 0; x < (end - start); ++x ) - ctx->x86.pv.p2m_pfns[start + x] = data->p2m_pfns[x]; - - return 0; -} - -/* - * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream. - * The blobs are all stashed to one side as they need to be deferred until the - * very end of the stream, rather than being send to Xen at the point they - * arrive in the stream. It performs all pre-hypercall size validation. - */ -static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data; - struct xc_sr_x86_pv_restore_vcpu *vcpu; - const char *rec_name; - size_t blobsz; - struct xc_sr_blob *blob = NULL; - int rc = -1; - - switch ( rec->type ) - { - case REC_TYPE_X86_PV_VCPU_BASIC: - rec_name = "X86_PV_VCPU_BASIC"; - break; - - case REC_TYPE_X86_PV_VCPU_EXTENDED: - rec_name = "X86_PV_VCPU_EXTENDED"; - break; - - case REC_TYPE_X86_PV_VCPU_XSAVE: - rec_name = "X86_PV_VCPU_XSAVE"; - break; - - case REC_TYPE_X86_PV_VCPU_MSRS: - rec_name = "X86_PV_VCPU_MSRS"; - break; - - default: - ERROR("Unrecognised vcpu blob record %s (%u)", - rec_type_to_str(rec->type), rec->type); - goto out; - } - - /* Confirm that there is a complete header. */ - if ( rec->length < sizeof(*vhdr) ) - { - ERROR("%s record truncated: length %u, header size %zu", - rec_name, rec->length, sizeof(*vhdr)); - goto out; - } - - blobsz = rec->length - sizeof(*vhdr); - - /* - * Tolerate empty records. Older sending sides used to accidentally - * generate them. - */ - if ( blobsz == 0 ) - { - DBGPRINTF("Skipping empty %s record for vcpu %u\n", - rec_type_to_str(rec->type), vhdr->vcpu_id); - rc = 0; - goto out; - } - - /* Check that the vcpu id is within range. 
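The length validation in handle_x86_pv_p2m_frames() above is worth checking by hand: assuming the two uint32 pfn bounds form an 8-byte header, a record declaring pfns 0..1000 of a 64-bit guest spans two p2m frames and must be exactly 24 bytes:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

static unsigned int p2m_frames_rec_len(uint32_t start_pfn, uint32_t end_pfn,
                                       unsigned int width)
{
    unsigned int fpp = PAGE_SIZE / width;
    unsigned int start = start_pfn / fpp, end = end_pfn / fpp + 1;

    return 8 + (end - start) * sizeof(uint64_t);
}

int main(void)
{
    /* 64-bit guest, pfns 0..1000: frames 0 and 1 are needed, so the
     * record must be 8 + 2 * 8 = 24 bytes. */
    assert(p2m_frames_rec_len(0, 1000, 8) == 24);
    return 0;
}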
*/ - if ( vhdr->vcpu_id >= ctx->x86.pv.restore.nr_vcpus ) - { - ERROR("%s record vcpu_id (%u) exceeds domain max (%u)", - rec_name, vhdr->vcpu_id, ctx->x86.pv.restore.nr_vcpus - 1); - goto out; - } - - vcpu = &ctx->x86.pv.restore.vcpus[vhdr->vcpu_id]; - - /* Further per-record checks, where possible. */ - switch ( rec->type ) - { - case REC_TYPE_X86_PV_VCPU_BASIC: - { - size_t vcpusz = ctx->x86.pv.width == 8 ? - sizeof(vcpu_guest_context_x86_64_t) : - sizeof(vcpu_guest_context_x86_32_t); - - if ( blobsz != vcpusz ) - { - ERROR("%s record wrong size: expected %zu, got %u", - rec_name, sizeof(*vhdr) + vcpusz, rec->length); - goto out; - } - blob = &vcpu->basic; - break; - } - - case REC_TYPE_X86_PV_VCPU_EXTENDED: - if ( blobsz > 128 ) - { - ERROR("%s record too long: max %zu, got %u", - rec_name, sizeof(*vhdr) + 128, rec->length); - goto out; - } - blob = &vcpu->extd; - break; - - case REC_TYPE_X86_PV_VCPU_XSAVE: - if ( blobsz < 16 ) - { - ERROR("%s record too short: min %zu, got %u", - rec_name, sizeof(*vhdr) + 16, rec->length); - goto out; - } - blob = &vcpu->xsave; - break; - - case REC_TYPE_X86_PV_VCPU_MSRS: - if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 ) - { - ERROR("%s record payload size %zu expected to be a multiple of %zu", - rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t)); - goto out; - } - blob = &vcpu->msr; - break; - } - - rc = update_blob(blob, vhdr->context, blobsz); - if ( rc ) - ERROR("Unable to allocate %zu bytes for vcpu%u %s blob", - blobsz, vhdr->vcpu_id, rec_name); - - out: - return rc; -} - -/* - * Process a SHARED_INFO record from the stream. - */ -static int handle_shared_info(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - xc_interface *xch = ctx->xch; - unsigned int i; - int rc = -1; - shared_info_any_t *guest_shinfo = NULL; - const shared_info_any_t *old_shinfo = rec->data; - - if ( !ctx->x86.pv.restore.seen_pv_info ) - { - ERROR("Not yet received X86_PV_INFO record"); - return -1; - } - - if ( rec->length != PAGE_SIZE ) - { - ERROR("X86_PV_SHARED_INFO record wrong size: length %u" - ", expected 4096", rec->length); - goto err; - } - - guest_shinfo = xc_map_foreign_range( - xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, - ctx->dominfo.shared_info_frame); - if ( !guest_shinfo ) - { - PERROR("Failed to map Shared Info at mfn %#lx", - ctx->dominfo.shared_info_frame); - goto err; - } - - MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86.pv.width); - MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86.pv.width); - - SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list, - 0, ctx->x86.pv.width); - - MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86.pv.width); - for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) - SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel, - 0, ctx->x86.pv.width); - - MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86.pv.width); - - rc = 0; - - err: - if ( guest_shinfo ) - munmap(guest_shinfo, PAGE_SIZE); - - return rc; -} - -/* restore_ops function. */ -static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn) -{ - return pfn <= ctx->x86.pv.max_pfn; -} - -/* restore_ops function. */ -static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn, - unsigned long type) -{ - assert(pfn <= ctx->x86.pv.max_pfn); - - ctx->x86.pv.restore.pfn_types[pfn] = type; -} - -/* restore_ops function. 
*/ -static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn, - xen_pfn_t mfn) -{ - assert(pfn <= ctx->x86.pv.max_pfn); - - if ( ctx->x86.pv.width == sizeof(uint64_t) ) - /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */ - ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn; - else - /* 32 bit guest. Can truncate INVALID_MFN for 64 bit toolstacks. */ - ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn; -} - -/* - * restore_ops function. Convert pfns back to mfns in pagetables. Possibly - * needs to populate new frames if a PTE is found referring to a frame which - * hasn't yet been seen from PAGE_DATA records. - */ -static int x86_pv_localise_page(struct xc_sr_context *ctx, - uint32_t type, void *page) -{ - xc_interface *xch = ctx->xch; - uint64_t *table = page; - uint64_t pte; - unsigned int i, to_populate; - xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))]; - - type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; - - /* Only page tables need localisation. */ - if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB ) - return 0; - - /* Check to see whether we need to populate any new frames. */ - for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) - { - pte = table[i]; - - if ( pte & _PAGE_PRESENT ) - { - xen_pfn_t pfn = pte_to_frame(pte); - -#ifdef __i386__ - if ( pfn == INVALID_MFN ) - { - ERROR("PTE truncation detected. L%u[%u] = %016"PRIx64, - type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); - errno = E2BIG; - return -1; - } -#endif - - if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN ) - pfns[to_populate++] = pfn; - } - } - - if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) ) - return -1; - - for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) - { - pte = table[i]; - - if ( pte & _PAGE_PRESENT ) - { - xen_pfn_t mfn, pfn; - - pfn = pte_to_frame(pte); - mfn = pfn_to_mfn(ctx, pfn); - - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Bad mfn for L%u[%u] - pte %"PRIx64, - type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - return -1; - } - - table[i] = merge_pte(pte, mfn); - } - } - - return 0; -} - -/* - * restore_ops function. Confirm that the incoming stream matches the type of - * domain we are attempting to restore into. - */ -static int x86_pv_setup(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - - if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV ) - { - ERROR("Unable to restore %s domain into an x86_pv domain", - dhdr_type_to_str(ctx->restore.guest_type)); - return -1; - } - - if ( ctx->restore.guest_page_size != PAGE_SIZE ) - { - ERROR("Invalid page size %d for x86_pv domains", - ctx->restore.guest_page_size); - return -1; - } - - rc = x86_pv_domain_info(ctx); - if ( rc ) - return rc; - - ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1; - ctx->x86.pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu), - ctx->x86.pv.restore.nr_vcpus); - if ( !ctx->x86.pv.restore.vcpus ) - { - errno = ENOMEM; - return -1; - } - - rc = x86_pv_map_m2p(ctx); - if ( rc ) - return rc; - - return rc; -} - -/* - * restore_ops function. 
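x86_pv_localise_page() above is deliberately two-pass: populating frames mutates the p2m, so every missing frame is collected and populated before any PTE is rewritten. A self-contained sketch of that shape; lookup() and populate() are assumed stand-ins for the libxc helpers, not real API:

#include <stdint.h>
#include <stddef.h>

#define ENTRIES   (4096 / sizeof(uint64_t))
#define PRESENT   0x1ULL
#define ADDR_MASK (0xffffffffffULL << 12)   /* bits 12-51 of a PTE */

static uint64_t frame_of(uint64_t pte)
{
    return (pte & ADDR_MASK) >> 12;
}

static uint64_t with_frame(uint64_t pte, uint64_t mfn)
{
    return (pte & ~ADDR_MASK) | ((mfn << 12) & ADDR_MASK);
}

int localise_table(uint64_t table[ENTRIES],
                   uint64_t (*lookup)(uint64_t pfn),
                   int (*populate)(const uint64_t *pfns, size_t n))
{
    uint64_t missing[ENTRIES];
    size_t i, n = 0;

    for ( i = 0; i < ENTRIES; ++i )      /* pass 1: collect gaps */
        if ( (table[i] & PRESENT) && lookup(frame_of(table[i])) == ~0ULL )
            missing[n++] = frame_of(table[i]);

    if ( n && populate(missing, n) )     /* allocate them all at once */
        return -1;

    for ( i = 0; i < ENTRIES; ++i )      /* pass 2: pfn -> mfn rewrite */
        if ( table[i] & PRESENT )
            table[i] = with_frame(table[i], lookup(frame_of(table[i])));

    return 0;
}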
- */ -static int x86_pv_process_record(struct xc_sr_context *ctx, - struct xc_sr_record *rec) -{ - switch ( rec->type ) - { - case REC_TYPE_X86_PV_INFO: - return handle_x86_pv_info(ctx, rec); - - case REC_TYPE_X86_PV_P2M_FRAMES: - return handle_x86_pv_p2m_frames(ctx, rec); - - case REC_TYPE_X86_PV_VCPU_BASIC: - case REC_TYPE_X86_PV_VCPU_EXTENDED: - case REC_TYPE_X86_PV_VCPU_XSAVE: - case REC_TYPE_X86_PV_VCPU_MSRS: - return handle_x86_pv_vcpu_blob(ctx, rec); - - case REC_TYPE_SHARED_INFO: - return handle_shared_info(ctx, rec); - - case REC_TYPE_X86_TSC_INFO: - return handle_x86_tsc_info(ctx, rec); - - case REC_TYPE_X86_CPUID_POLICY: - return handle_x86_cpuid_policy(ctx, rec); - - case REC_TYPE_X86_MSR_POLICY: - return handle_x86_msr_policy(ctx, rec); - - default: - return RECORD_NOT_PROCESSED; - } -} - -/* - * restore_ops function. Update the vcpu context in Xen, pin the pagetables, - * rewrite the p2m and seed the grant table. - */ -static int x86_pv_stream_complete(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - - rc = update_vcpu_context(ctx); - if ( rc ) - return rc; - - rc = pin_pagetables(ctx); - if ( rc ) - return rc; - - rc = update_guest_p2m(ctx); - if ( rc ) - return rc; - - rc = xc_dom_gnttab_seed(xch, ctx->domid, false, - ctx->restore.console_gfn, - ctx->restore.xenstore_gfn, - ctx->restore.console_domid, - ctx->restore.xenstore_domid); - if ( rc ) - { - PERROR("Failed to seed grant table"); - return rc; - } - - return rc; -} - -/* - * restore_ops function. - */ -static int x86_pv_cleanup(struct xc_sr_context *ctx) -{ - free(ctx->x86.pv.p2m); - free(ctx->x86.pv.p2m_pfns); - - if ( ctx->x86.pv.restore.vcpus ) - { - unsigned int i; - - for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i ) - { - struct xc_sr_x86_pv_restore_vcpu *vcpu = - &ctx->x86.pv.restore.vcpus[i]; - - free(vcpu->basic.ptr); - free(vcpu->extd.ptr); - free(vcpu->xsave.ptr); - free(vcpu->msr.ptr); - } - - free(ctx->x86.pv.restore.vcpus); - } - - free(ctx->x86.pv.restore.pfn_types); - - if ( ctx->x86.pv.m2p ) - munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE); - - free(ctx->x86.restore.cpuid.ptr); - free(ctx->x86.restore.msr.ptr); - - return 0; -} - -struct xc_sr_restore_ops restore_ops_x86_pv = -{ - .pfn_is_valid = x86_pv_pfn_is_valid, - .pfn_to_gfn = pfn_to_mfn, - .set_page_type = x86_pv_set_page_type, - .set_gfn = x86_pv_set_gfn, - .localise_page = x86_pv_localise_page, - .setup = x86_pv_setup, - .process_record = x86_pv_process_record, - .static_data_complete = x86_static_data_complete, - .stream_complete = x86_pv_stream_complete, - .cleanup = x86_pv_cleanup, -}; - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_save.c b/tools/libxc/xg_sr_save.c deleted file mode 100644 index d74c72cba6..0000000000 --- a/tools/libxc/xg_sr_save.c +++ /dev/null @@ -1,1059 +0,0 @@ -#include -#include - -#include "xg_sr_common.h" - -/* - * Writes an Image header and Domain header into the stream. 
- */ -static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type) -{ - xc_interface *xch = ctx->xch; - int32_t xen_version = xc_version(xch, XENVER_version, NULL); - struct xc_sr_ihdr ihdr = { - .marker = IHDR_MARKER, - .id = htonl(IHDR_ID), - .version = htonl(3), - .options = htons(IHDR_OPT_LITTLE_ENDIAN), - }; - struct xc_sr_dhdr dhdr = { - .type = guest_type, - .page_shift = XC_PAGE_SHIFT, - .xen_major = (xen_version >> 16) & 0xffff, - .xen_minor = (xen_version) & 0xffff, - }; - - if ( xen_version < 0 ) - { - PERROR("Unable to obtain Xen Version"); - return -1; - } - - if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) ) - { - PERROR("Unable to write Image Header to stream"); - return -1; - } - - if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) ) - { - PERROR("Unable to write Domain Header to stream"); - return -1; - } - - return 0; -} - -/* - * Writes an END record into the stream. - */ -static int write_end_record(struct xc_sr_context *ctx) -{ - struct xc_sr_record end = { .type = REC_TYPE_END }; - - return write_record(ctx, &end); -} - -/* - * Writes a STATIC_DATA_END record into the stream. - */ -static int write_static_data_end_record(struct xc_sr_context *ctx) -{ - struct xc_sr_record end = { .type = REC_TYPE_STATIC_DATA_END }; - - return write_record(ctx, &end); -} - -/* - * Writes a CHECKPOINT record into the stream. - */ -static int write_checkpoint_record(struct xc_sr_context *ctx) -{ - struct xc_sr_record checkpoint = { .type = REC_TYPE_CHECKPOINT }; - - return write_record(ctx, &checkpoint); -} - -/* - * Writes a batch of memory as a PAGE_DATA record into the stream. The batch - * is constructed in ctx->save.batch_pfns. - * - * This function: - * - gets the types for each pfn in the batch. - * - for each pfn with real data: - * - maps and attempts to localise the pages. - * - construct and writes a PAGE_DATA record into the stream. - */ -static int write_batch(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t *mfns = NULL, *types = NULL; - void *guest_mapping = NULL; - void **guest_data = NULL; - void **local_pages = NULL; - int *errors = NULL, rc = -1; - unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0; - unsigned int nr_pfns = ctx->save.nr_batch_pfns; - void *page, *orig_page; - uint64_t *rec_pfns = NULL; - struct iovec *iov = NULL; int iovcnt = 0; - struct xc_sr_rec_page_data_header hdr = { 0 }; - struct xc_sr_record rec = { - .type = REC_TYPE_PAGE_DATA, - }; - - assert(nr_pfns != 0); - - /* Mfns of the batch pfns. */ - mfns = malloc(nr_pfns * sizeof(*mfns)); - /* Types of the batch pfns. */ - types = malloc(nr_pfns * sizeof(*types)); - /* Errors from attempting to map the gfns. */ - errors = malloc(nr_pfns * sizeof(*errors)); - /* Pointers to page data to send. Mapped gfns or local allocations. */ - guest_data = calloc(nr_pfns, sizeof(*guest_data)); - /* Pointers to locally allocated pages. Need freeing. */ - local_pages = calloc(nr_pfns, sizeof(*local_pages)); - /* iovec[] for writev(). */ - iov = malloc((nr_pfns + 4) * sizeof(*iov)); - - if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov ) - { - ERROR("Unable to allocate arrays for a batch of %u pages", - nr_pfns); - goto err; - } - - for ( i = 0; i < nr_pfns; ++i ) - { - types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx, - ctx->save.batch_pfns[i]); - - /* Likely a ballooned page. 
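The image header written above pins its multi-byte fields to network byte order with htonl()/htons(), while an options bit declares the rest of the stream little-endian; only the all-ones marker is endianness-agnostic. A sketch of that framing, with illustrative constants (the real IHDR_* values and layout live in the stream format header):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

struct ihdr {
    uint64_t marker;     /* all-ones: unambiguous in any endianness */
    uint32_t id;         /* magic, network byte order */
    uint32_t version;    /* stream version, network byte order */
    uint16_t options;    /* endianness flag, network byte order */
    uint16_t _res1;
    uint32_t _res2;
};

static void fill_ihdr(struct ihdr *h)
{
    memset(h, 0, sizeof(*h));
    h->marker  = ~0ULL;
    h->id      = htonl(0x58454E46u);   /* illustrative: "XENF" */
    h->version = htonl(3);
    h->options = htons(1);             /* e.g. IHDR_OPT_LITTLE_ENDIAN */
}

int main(void)
{
    struct ihdr h;

    fill_ihdr(&h);
    return h.marker == ~0ULL ? 0 : 1;
}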
*/ - if ( mfns[i] == INVALID_MFN ) - { - set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); - ++ctx->save.nr_deferred_pages; - } - } - - rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types); - if ( rc ) - { - PERROR("Failed to get types for pfn batch"); - goto err; - } - rc = -1; - - for ( i = 0; i < nr_pfns; ++i ) - { - switch ( types[i] ) - { - case XEN_DOMCTL_PFINFO_BROKEN: - case XEN_DOMCTL_PFINFO_XALLOC: - case XEN_DOMCTL_PFINFO_XTAB: - continue; - } - - mfns[nr_pages++] = mfns[i]; - } - - if ( nr_pages > 0 ) - { - guest_mapping = xenforeignmemory_map( - xch->fmem, ctx->domid, PROT_READ, nr_pages, mfns, errors); - if ( !guest_mapping ) - { - PERROR("Failed to map guest pages"); - goto err; - } - nr_pages_mapped = nr_pages; - - for ( i = 0, p = 0; i < nr_pfns; ++i ) - { - switch ( types[i] ) - { - case XEN_DOMCTL_PFINFO_BROKEN: - case XEN_DOMCTL_PFINFO_XALLOC: - case XEN_DOMCTL_PFINFO_XTAB: - continue; - } - - if ( errors[p] ) - { - ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d", - ctx->save.batch_pfns[i], mfns[p], errors[p]); - goto err; - } - - orig_page = page = guest_mapping + (p * PAGE_SIZE); - rc = ctx->save.ops.normalise_page(ctx, types[i], &page); - - if ( orig_page != page ) - local_pages[i] = page; - - if ( rc ) - { - if ( rc == -1 && errno == EAGAIN ) - { - set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages); - ++ctx->save.nr_deferred_pages; - types[i] = XEN_DOMCTL_PFINFO_XTAB; - --nr_pages; - } - else - goto err; - } - else - guest_data[i] = page; - - rc = -1; - ++p; - } - } - - rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns)); - if ( !rec_pfns ) - { - ERROR("Unable to allocate %zu bytes of memory for page data pfn list", - nr_pfns * sizeof(*rec_pfns)); - goto err; - } - - hdr.count = nr_pfns; - - rec.length = sizeof(hdr); - rec.length += nr_pfns * sizeof(*rec_pfns); - rec.length += nr_pages * PAGE_SIZE; - - for ( i = 0; i < nr_pfns; ++i ) - rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i]; - - iov[0].iov_base = &rec.type; - iov[0].iov_len = sizeof(rec.type); - - iov[1].iov_base = &rec.length; - iov[1].iov_len = sizeof(rec.length); - - iov[2].iov_base = &hdr; - iov[2].iov_len = sizeof(hdr); - - iov[3].iov_base = rec_pfns; - iov[3].iov_len = nr_pfns * sizeof(*rec_pfns); - - iovcnt = 4; - - if ( nr_pages ) - { - for ( i = 0; i < nr_pfns; ++i ) - { - if ( guest_data[i] ) - { - iov[iovcnt].iov_base = guest_data[i]; - iov[iovcnt].iov_len = PAGE_SIZE; - iovcnt++; - --nr_pages; - } - } - } - - if ( writev_exact(ctx->fd, iov, iovcnt) ) - { - PERROR("Failed to write page data to stream"); - goto err; - } - - /* Sanity check we have sent all the pages we expected to. */ - assert(nr_pages == 0); - rc = ctx->save.nr_batch_pfns = 0; - - err: - free(rec_pfns); - if ( guest_mapping ) - xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped); - for ( i = 0; local_pages && i < nr_pfns; ++i ) - free(local_pages[i]); - free(iov); - free(local_pages); - free(guest_data); - free(errors); - free(types); - free(mfns); - - return rc; -} - -/* - * Flush a batch of pfns into the stream. - */ -static int flush_batch(struct xc_sr_context *ctx) -{ - int rc = 0; - - if ( ctx->save.nr_batch_pfns == 0 ) - return rc; - - rc = write_batch(ctx); - - if ( !rc ) - { - VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns, - MAX_BATCH_SIZE * - sizeof(*ctx->save.batch_pfns)); - } - - return rc; -} - -/* - * Add a single pfn to the batch, flushing the batch if full. 
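write_batch() above packs each pfn and its XEN_DOMCTL_PFINFO type into a single 64-bit stream word, type in the high half. Verifying the packing round-trips:

#include <assert.h>
#include <stdint.h>

static uint64_t pack(uint32_t type, uint32_t pfn)
{
    return ((uint64_t)type << 32) | pfn;
}

int main(void)
{
    uint64_t w = pack(0x8 /* some PFINFO type */, 0x12345);

    assert((uint32_t)w == 0x12345);          /* pfn: low 32 bits */
    assert((uint32_t)(w >> 32) == 0x8);      /* type: high 32 bits */
    return 0;
}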
- */ -static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn) -{ - int rc = 0; - - if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE ) - rc = flush_batch(ctx); - - if ( rc == 0 ) - ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn; - - return rc; -} - -/* - * Pause/suspend the domain, and refresh ctx->dominfo if required. - */ -static int suspend_domain(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - - /* TODO: Properly specify the return value from this callback. All - * implementations currently appear to return 1 for success, whereas - * the legacy code checks for != 0. */ - int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data); - - if ( cb_rc == 0 ) - { - ERROR("save callback suspend() failed: %d", cb_rc); - return -1; - } - - /* Refresh domain information. */ - if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) || - (ctx->dominfo.domid != ctx->domid) ) - { - PERROR("Unable to refresh domain information"); - return -1; - } - - /* Confirm the domain has actually been paused. */ - if ( !ctx->dominfo.shutdown || - (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) ) - { - ERROR("Domain has not been suspended: shutdown %d, reason %d", - ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason); - return -1; - } - - xc_report_progress_single(xch, "Domain now suspended"); - - return 0; -} - -/* - * Send a subset of pages in the guests p2m, according to the dirty bitmap. - * Used for each subsequent iteration of the live migration loop. - * - * Bitmap is bounded by p2m_size. - */ -static int send_dirty_pages(struct xc_sr_context *ctx, - unsigned long entries) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t p; - unsigned long written; - int rc; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p ) - { - if ( !test_bit(p, dirty_bitmap) ) - continue; - - rc = add_to_batch(ctx, p); - if ( rc ) - return rc; - - /* Update progress every 4MB worth of memory sent. */ - if ( (written & ((1U << (22 - 12)) - 1)) == 0 ) - xc_report_progress_step(xch, written, entries); - - ++written; - } - - rc = flush_batch(ctx); - if ( rc ) - return rc; - - if ( written > entries ) - DPRINTF("Bitmap contained more entries than expected..."); - - xc_report_progress_step(xch, entries, entries); - - return ctx->save.ops.check_vm_state(ctx); -} - -/* - * Send all pages in the guests p2m. Used as the first iteration of the live - * migration loop, and for a non-live save. - */ -static int send_all_pages(struct xc_sr_context *ctx) -{ - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - bitmap_set(dirty_bitmap, ctx->save.p2m_size); - - return send_dirty_pages(ctx, ctx->save.p2m_size); -} - -static int enable_logdirty(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int on1 = 0, off = 0, on2 = 0; - int rc; - - /* This juggling is required if logdirty is enabled for VRAM tracking. 
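add_to_batch()/flush_batch() above are the accumulate-then-flush idiom used throughout the save side: fill a fixed array, flush when full, and let the caller flush the final partial batch. A runnable miniature of the same shape, where flush() stands in for write_batch():

#include <stddef.h>
#include <stdio.h>

#define MAX_BATCH 1024   /* stand-in for MAX_BATCH_SIZE */

static size_t nr_batched;
static unsigned long batch[MAX_BATCH];

static int flush(void)
{
    if ( nr_batched )
        printf("flushing %zu pfns\n", nr_batched);
    nr_batched = 0;
    return 0;
}

static int add(unsigned long pfn)
{
    if ( nr_batched == MAX_BATCH && flush() )
        return -1;
    batch[nr_batched++] = pfn;
    return 0;
}

int main(void)
{
    unsigned long pfn;

    for ( pfn = 0; pfn < 2500; ++pfn )
        if ( add(pfn) )
            return 1;
    return flush();   /* callers must flush the final partial batch */
}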
*/ - rc = xc_shadow_control(xch, ctx->domid, - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL, 0, NULL); - if ( rc < 0 ) - { - on1 = errno; - rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF, - NULL, 0, NULL, 0, NULL); - if ( rc < 0 ) - off = errno; - else { - rc = xc_shadow_control(xch, ctx->domid, - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL, 0, NULL); - if ( rc < 0 ) - on2 = errno; - } - if ( rc < 0 ) - { - PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2); - return rc; - } - } - - return 0; -} - -static int update_progress_string(struct xc_sr_context *ctx, char **str) -{ - xc_interface *xch = ctx->xch; - char *new_str = NULL; - unsigned int iter = ctx->save.stats.iteration; - - if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 ) - { - PERROR("Unable to allocate new progress string"); - return -1; - } - - free(*str); - *str = new_str; - - xc_set_progress_prefix(xch, *str); - return 0; -} - -/* - * This is the live migration precopy policy - it's called periodically during - * the precopy phase of live migrations, and is responsible for deciding when - * the precopy phase should terminate and what should be done next. - * - * The policy implemented here behaves identically to the policy previously - * hard-coded into xc_domain_save() - it proceeds to the stop-and-copy phase of - * the live migration when there are either fewer than 50 dirty pages, or more - * than 5 precopy rounds have completed. - */ -#define SPP_MAX_ITERATIONS 5 -#define SPP_TARGET_DIRTY_COUNT 50 - -static int simple_precopy_policy(struct precopy_stats stats, void *user) -{ - return ((stats.dirty_count >= 0 && - stats.dirty_count < SPP_TARGET_DIRTY_COUNT) || - stats.iteration >= SPP_MAX_ITERATIONS) - ? XGS_POLICY_STOP_AND_COPY - : XGS_POLICY_CONTINUE_PRECOPY; -} - -/* - * Send memory while guest is running. 
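simple_precopy_policy() above stops precopy once a round dirties fewer than 50 pages or after 5 rounds, and a dirty_count of -1 (not yet known this round) can never satisfy the threshold. The same decision table, exercised standalone:

#include <assert.h>

#define STOP 1
#define CONTINUE 0

struct stats { unsigned int iteration; long dirty_count; };

static int policy(struct stats s)
{
    return ((s.dirty_count >= 0 && s.dirty_count < 50) ||
            s.iteration >= 5) ? STOP : CONTINUE;
}

int main(void)
{
    assert(policy((struct stats){ .iteration = 1, .dirty_count = 10000 }) == CONTINUE);
    assert(policy((struct stats){ .iteration = 2, .dirty_count = 49 })    == STOP);
    assert(policy((struct stats){ .iteration = 2, .dirty_count = -1 })    == CONTINUE);
    assert(policy((struct stats){ .iteration = 5, .dirty_count = 10000 }) == STOP);
    return 0;
}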
- */ -static int send_memory_live(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; - char *progress_str = NULL; - unsigned int x = 0; - int rc; - int policy_decision; - - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - precopy_policy_t precopy_policy = ctx->save.callbacks->precopy_policy; - void *data = ctx->save.callbacks->data; - - struct precopy_stats *policy_stats; - - rc = update_progress_string(ctx, &progress_str); - if ( rc ) - goto out; - - ctx->save.stats = (struct precopy_stats){ - .dirty_count = ctx->save.p2m_size, - }; - policy_stats = &ctx->save.stats; - - if ( precopy_policy == NULL ) - precopy_policy = simple_precopy_policy; - - bitmap_set(dirty_bitmap, ctx->save.p2m_size); - - for ( ; ; ) - { - policy_decision = precopy_policy(*policy_stats, data); - x++; - - if ( stats.dirty_count > 0 && policy_decision != XGS_POLICY_ABORT ) - { - rc = update_progress_string(ctx, &progress_str); - if ( rc ) - goto out; - - rc = send_dirty_pages(ctx, stats.dirty_count); - if ( rc ) - goto out; - } - - if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY ) - break; - - policy_stats->iteration = x; - policy_stats->total_written += policy_stats->dirty_count; - policy_stats->dirty_count = -1; - - policy_decision = precopy_policy(*policy_stats, data); - - if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY ) - break; - - if ( xc_shadow_control( - xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, - &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size, - NULL, 0, &stats) != ctx->save.p2m_size ) - { - PERROR("Failed to retrieve logdirty bitmap"); - rc = -1; - goto out; - } - - policy_stats->dirty_count = stats.dirty_count; - - } - - if ( policy_decision == XGS_POLICY_ABORT ) - { - PERROR("Abort precopy loop"); - rc = -1; - goto out; - } - - out: - xc_set_progress_prefix(xch, NULL); - free(progress_str); - return rc; -} - -static int colo_merge_secondary_dirty_bitmap(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_record rec; - uint64_t *pfns = NULL; - uint64_t pfn; - unsigned int count, i; - int rc; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - rc = read_record(ctx, ctx->save.recv_fd, &rec); - if ( rc ) - goto err; - - if ( rec.type != REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST ) - { - PERROR("Expect dirty bitmap record, but received %u", rec.type); - rc = -1; - goto err; - } - - if ( rec.length % sizeof(*pfns) ) - { - PERROR("Invalid dirty pfn list record length %u", rec.length); - rc = -1; - goto err; - } - - count = rec.length / sizeof(*pfns); - pfns = rec.data; - - for ( i = 0; i < count; i++ ) - { - pfn = pfns[i]; - if ( pfn > ctx->save.p2m_size ) - { - PERROR("Invalid pfn 0x%" PRIx64, pfn); - rc = -1; - goto err; - } - - set_bit(pfn, dirty_bitmap); - } - - rc = 0; - - err: - free(rec.data); - return rc; -} - -/* - * Suspend the domain and send dirty memory. - * This is the last iteration of the live migration and the - * heart of the checkpointed stream. 
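colo_merge_secondary_dirty_bitmap() above ORs the secondary's dirty pfns into the local bitmap with set_bit(). The sketch below uses the same word-indexed scheme; it is not xc_bitops.h verbatim:

#include <assert.h>
#include <limits.h>

#define BITS_PER_WORD (sizeof(unsigned long) * CHAR_BIT)

static void set_bit_ul(unsigned long bit, unsigned long *map)
{
    map[bit / BITS_PER_WORD] |= 1UL << (bit % BITS_PER_WORD);
}

static int test_bit_ul(unsigned long bit, const unsigned long *map)
{
    return !!(map[bit / BITS_PER_WORD] & (1UL << (bit % BITS_PER_WORD)));
}

int main(void)
{
    unsigned long map[4] = { 0 };   /* at least 128 bits on any ABI */

    set_bit_ul(3, map);
    set_bit_ul(70, map);
    assert(test_bit_ul(3, map) && test_bit_ul(70, map));
    assert(!test_bit_ul(4, map));
    return 0;
}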
- */ -static int suspend_and_send_dirty(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; - char *progress_str = NULL; - int rc; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - rc = suspend_domain(ctx); - if ( rc ) - goto out; - - if ( xc_shadow_control( - xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN, - HYPERCALL_BUFFER(dirty_bitmap), ctx->save.p2m_size, - NULL, XEN_DOMCTL_SHADOW_LOGDIRTY_FINAL, &stats) != - ctx->save.p2m_size ) - { - PERROR("Failed to retrieve logdirty bitmap"); - rc = -1; - goto out; - } - - if ( ctx->save.live ) - { - rc = update_progress_string(ctx, &progress_str); - if ( rc ) - goto out; - } - else - xc_set_progress_prefix(xch, "Checkpointed save"); - - bitmap_or(dirty_bitmap, ctx->save.deferred_pages, ctx->save.p2m_size); - - if ( !ctx->save.live && ctx->stream_type == XC_STREAM_COLO ) - { - rc = colo_merge_secondary_dirty_bitmap(ctx); - if ( rc ) - { - PERROR("Failed to get secondary vm's dirty pages"); - goto out; - } - } - - rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages); - if ( rc ) - goto out; - - bitmap_clear(ctx->save.deferred_pages, ctx->save.p2m_size); - ctx->save.nr_deferred_pages = 0; - - out: - xc_set_progress_prefix(xch, NULL); - free(progress_str); - return rc; -} - -static int verify_frames(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size }; - int rc; - struct xc_sr_record rec = { .type = REC_TYPE_VERIFY }; - - DPRINTF("Enabling verify mode"); - - rc = write_record(ctx, &rec); - if ( rc ) - goto out; - - xc_set_progress_prefix(xch, "Frames verify"); - rc = send_all_pages(ctx); - if ( rc ) - goto out; - - if ( xc_shadow_control( - xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK, - &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size, - NULL, 0, &stats) != ctx->save.p2m_size ) - { - PERROR("Failed to retrieve logdirty bitmap"); - rc = -1; - goto out; - } - - DPRINTF(" Further stats: faults %u, dirty %u", - stats.fault_count, stats.dirty_count); - - out: - return rc; -} - -/* - * Send all domain memory. This is the heart of the live migration loop. - */ -static int send_domain_memory_live(struct xc_sr_context *ctx) -{ - int rc; - - rc = enable_logdirty(ctx); - if ( rc ) - goto out; - - rc = send_memory_live(ctx); - if ( rc ) - goto out; - - rc = suspend_and_send_dirty(ctx); - if ( rc ) - goto out; - - if ( ctx->save.debug && ctx->stream_type != XC_STREAM_PLAIN ) - { - rc = verify_frames(ctx); - if ( rc ) - goto out; - } - - out: - return rc; -} - -/* - * Checkpointed save. - */ -static int send_domain_memory_checkpointed(struct xc_sr_context *ctx) -{ - return suspend_and_send_dirty(ctx); -} - -/* - * Send all domain memory, pausing the domain first. Generally used for - * suspend-to-file. 
- */ -static int send_domain_memory_nonlive(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - - rc = suspend_domain(ctx); - if ( rc ) - goto err; - - xc_set_progress_prefix(xch, "Frames"); - - rc = send_all_pages(ctx); - if ( rc ) - goto err; - - err: - return rc; -} - -static int setup(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - rc = ctx->save.ops.setup(ctx); - if ( rc ) - goto err; - - dirty_bitmap = xc_hypercall_buffer_alloc_pages( - xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size))); - ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE * - sizeof(*ctx->save.batch_pfns)); - ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size)); - - if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages ) - { - ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and" - " deferred pages"); - rc = -1; - errno = ENOMEM; - goto err; - } - - rc = 0; - - err: - return rc; -} - -static void cleanup(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, - &ctx->save.dirty_bitmap_hbuf); - - - xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF, - NULL, 0, NULL, 0, NULL); - - if ( ctx->save.ops.cleanup(ctx) ) - PERROR("Failed to clean up"); - - xc_hypercall_buffer_free_pages(xch, dirty_bitmap, - NRPAGES(bitmap_size(ctx->save.p2m_size))); - free(ctx->save.deferred_pages); - free(ctx->save.batch_pfns); -} - -/* - * Save a domain. - */ -static int save(struct xc_sr_context *ctx, uint16_t guest_type) -{ - xc_interface *xch = ctx->xch; - int rc, saved_rc = 0, saved_errno = 0; - - IPRINTF("Saving domain %d, type %s", - ctx->domid, dhdr_type_to_str(guest_type)); - - rc = setup(ctx); - if ( rc ) - goto err; - - xc_report_progress_single(xch, "Start of stream"); - - rc = write_headers(ctx, guest_type); - if ( rc ) - goto err; - - rc = ctx->save.ops.static_data(ctx); - if ( rc ) - goto err; - - rc = write_static_data_end_record(ctx); - if ( rc ) - goto err; - - rc = ctx->save.ops.start_of_stream(ctx); - if ( rc ) - goto err; - - do { - rc = ctx->save.ops.start_of_checkpoint(ctx); - if ( rc ) - goto err; - - rc = ctx->save.ops.check_vm_state(ctx); - if ( rc ) - goto err; - - if ( ctx->save.live ) - rc = send_domain_memory_live(ctx); - else if ( ctx->stream_type != XC_STREAM_PLAIN ) - rc = send_domain_memory_checkpointed(ctx); - else - rc = send_domain_memory_nonlive(ctx); - - if ( rc ) - goto err; - - if ( !ctx->dominfo.shutdown || - (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) ) - { - ERROR("Domain has not been suspended"); - rc = -1; - goto err; - } - - rc = ctx->save.ops.end_of_checkpoint(ctx); - if ( rc ) - goto err; - - if ( ctx->stream_type != XC_STREAM_PLAIN ) - { - /* - * We have now completed the initial live portion of the checkpoint - * process. Therefore switch into periodically sending synchronous - * batches of pages. 
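setup() above sizes the dirty bitmap at one bit per pfn, rounded up to bytes and then to whole 4 KiB pages for the hypercall buffer. Assuming bitmap_size() rounds bits to bytes (the real helper may round to whole longs; the page count comes out the same here), the numbers check out:

#include <assert.h>
#include <stddef.h>

#define PAGE_SIZE 4096u

static size_t bitmap_bytes(size_t bits) { return (bits + 7) / 8; }
static size_t nr_pages(size_t bytes)    { return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; }

int main(void)
{
    /* A 4 GiB guest has 1M pfns -> 128 KiB of bitmap -> 32 pages. */
    size_t p2m_size = 1u << 20;

    assert(bitmap_bytes(p2m_size) == 131072);
    assert(nr_pages(bitmap_bytes(p2m_size)) == 32);
    return 0;
}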
- */ - ctx->save.live = false; - - rc = write_checkpoint_record(ctx); - if ( rc ) - goto err; - - if ( ctx->stream_type == XC_STREAM_COLO ) - { - rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data); - if ( !rc ) - { - rc = -1; - goto err; - } - } - - rc = ctx->save.callbacks->postcopy(ctx->save.callbacks->data); - if ( rc <= 0 ) - goto err; - - if ( ctx->stream_type == XC_STREAM_COLO ) - { - rc = ctx->save.callbacks->wait_checkpoint( - ctx->save.callbacks->data); - if ( rc <= 0 ) - goto err; - } - else if ( ctx->stream_type == XC_STREAM_REMUS ) - { - rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data); - if ( rc <= 0 ) - goto err; - } - else - { - ERROR("Unknown checkpointed stream"); - rc = -1; - goto err; - } - } - } while ( ctx->stream_type != XC_STREAM_PLAIN ); - - xc_report_progress_single(xch, "End of stream"); - - rc = write_end_record(ctx); - if ( rc ) - goto err; - - xc_report_progress_single(xch, "Complete"); - goto done; - - err: - saved_errno = errno; - saved_rc = rc; - PERROR("Save failed"); - - done: - cleanup(ctx); - - if ( saved_rc ) - { - rc = saved_rc; - errno = saved_errno; - } - - return rc; -}; - -int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, - uint32_t flags, struct save_callbacks *callbacks, - xc_stream_type_t stream_type, int recv_fd) -{ - struct xc_sr_context ctx = { - .xch = xch, - .fd = io_fd, - .stream_type = stream_type, - }; - - /* GCC 4.4 (of CentOS 6.x vintage) can' t initialise anonymous unions. */ - ctx.save.callbacks = callbacks; - ctx.save.live = !!(flags & XCFLAGS_LIVE); - ctx.save.debug = !!(flags & XCFLAGS_DEBUG); - ctx.save.recv_fd = recv_fd; - - if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 ) - { - PERROR("Failed to get domain info"); - return -1; - } - - if ( ctx.dominfo.domid != dom ) - { - ERROR("Domain %u does not exist", dom); - return -1; - } - - /* Sanity check stream_type-related parameters */ - switch ( stream_type ) - { - case XC_STREAM_COLO: - assert(callbacks->wait_checkpoint); - /* Fallthrough */ - case XC_STREAM_REMUS: - assert(callbacks->checkpoint && callbacks->postcopy); - /* Fallthrough */ - case XC_STREAM_PLAIN: - if ( ctx.dominfo.hvm ) - assert(callbacks->switch_qemu_logdirty); - break; - - default: - assert(!"Bad stream_type"); - break; - } - - DPRINTF("fd %d, dom %u, flags %u, hvm %d", - io_fd, dom, flags, ctx.dominfo.hvm); - - ctx.domid = dom; - - if ( ctx.dominfo.hvm ) - { - ctx.save.ops = save_ops_x86_hvm; - return save(&ctx, DHDR_TYPE_X86_HVM); - } - else - { - ctx.save.ops = save_ops_x86_pv; - return save(&ctx, DHDR_TYPE_X86_PV); - } -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_save_x86_hvm.c b/tools/libxc/xg_sr_save_x86_hvm.c deleted file mode 100644 index 1634a7bc43..0000000000 --- a/tools/libxc/xg_sr_save_x86_hvm.c +++ /dev/null @@ -1,251 +0,0 @@ -#include - -#include "xg_sr_common_x86.h" - -#include - -/* - * Query for the HVM context and write an HVM_CONTEXT record into the stream. 
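The assert ladder in xc_domain_save() above uses deliberate fall-through so each stream type inherits the callback requirements of the weaker ones: COLO needs everything Remus needs, which in turn needs everything a plain stream needs. The cumulative rule, restated:

#include <assert.h>
#include <stdbool.h>

enum stream { PLAIN, REMUS, COLO };

/* Which callbacks each stream type must supply (cumulative). */
static bool needs_checkpoint(enum stream s)      { return s >= REMUS; }
static bool needs_wait_checkpoint(enum stream s) { return s >= COLO; }

int main(void)
{
    assert(!needs_checkpoint(PLAIN));
    assert(needs_checkpoint(REMUS) && needs_checkpoint(COLO));
    assert(!needs_wait_checkpoint(REMUS) && needs_wait_checkpoint(COLO));
    return 0;
}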
- */ -static int write_hvm_context(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc, hvm_buf_size; - struct xc_sr_record hvm_rec = { - .type = REC_TYPE_HVM_CONTEXT, - }; - - hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, 0, 0); - if ( hvm_buf_size < 0 ) - { - PERROR("Couldn't get HVM context size from Xen"); - rc = -1; - goto out; - } - - hvm_rec.data = malloc(hvm_buf_size); - if ( !hvm_rec.data ) - { - PERROR("Couldn't allocate memory"); - rc = -1; - goto out; - } - - hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, - hvm_rec.data, hvm_buf_size); - if ( hvm_buf_size < 0 ) - { - PERROR("Couldn't get HVM context from Xen"); - rc = -1; - goto out; - } - - hvm_rec.length = hvm_buf_size; - rc = write_record(ctx, &hvm_rec); - if ( rc < 0 ) - { - PERROR("error write HVM_CONTEXT record"); - goto out; - } - - out: - free(hvm_rec.data); - return rc; -} - -/* - * Query for a range of HVM parameters and write an HVM_PARAMS record into the - * stream. - */ -static int write_hvm_params(struct xc_sr_context *ctx) -{ - static const unsigned int params[] = { - HVM_PARAM_STORE_PFN, - HVM_PARAM_IOREQ_PFN, - HVM_PARAM_BUFIOREQ_PFN, - HVM_PARAM_PAGING_RING_PFN, - HVM_PARAM_MONITOR_RING_PFN, - HVM_PARAM_SHARING_RING_PFN, - HVM_PARAM_VM86_TSS_SIZED, - HVM_PARAM_CONSOLE_PFN, - HVM_PARAM_ACPI_IOPORTS_LOCATION, - HVM_PARAM_VIRIDIAN, - HVM_PARAM_IDENT_PT, - HVM_PARAM_VM_GENERATION_ID_ADDR, - HVM_PARAM_IOREQ_SERVER_PFN, - HVM_PARAM_NR_IOREQ_SERVER_PAGES, - HVM_PARAM_X87_FIP_WIDTH, - HVM_PARAM_MCA_CAP, - }; - - xc_interface *xch = ctx->xch; - struct xc_sr_rec_hvm_params_entry entries[ARRAY_SIZE(params)]; - struct xc_sr_rec_hvm_params hdr = { - .count = 0, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_HVM_PARAMS, - .length = sizeof(hdr), - .data = &hdr, - }; - unsigned int i; - int rc; - - for ( i = 0; i < ARRAY_SIZE(params); i++ ) - { - uint32_t index = params[i]; - uint64_t value; - - rc = xc_hvm_param_get(xch, ctx->domid, index, &value); - if ( rc ) - { - PERROR("Failed to get HVMPARAM at index %u", index); - return rc; - } - - if ( value != 0 ) - { - entries[hdr.count].index = index; - entries[hdr.count].value = value; - hdr.count++; - } - } - - /* No params? Skip this record. */ - if ( hdr.count == 0 ) - return 0; - - rc = write_split_record(ctx, &rec, entries, hdr.count * sizeof(*entries)); - if ( rc ) - PERROR("Failed to write HVM_PARAMS record"); - - return rc; -} - -static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx, - xen_pfn_t pfn) -{ - /* identity map */ - return pfn; -} - -static int x86_hvm_normalise_page(struct xc_sr_context *ctx, - xen_pfn_t type, void **page) -{ - return 0; -} - -static int x86_hvm_setup(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t nr_pfns; - - if ( xc_domain_nr_gpfns(xch, ctx->domid, &nr_pfns) < 0 ) - { - PERROR("Unable to obtain the guest p2m size"); - return -1; - } -#ifdef __i386__ - /* Very large domains (> 1TB) will exhaust virtual address space. 
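-     * (Arithmetic: 0x0fffffff pfns x 4KiB per page is already 1TiB, more
-     * than a 32bit process can map.)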
 */
-    if ( nr_pfns > 0x0fffffff )
-    {
-        errno = E2BIG;
-        PERROR("Cannot save this big a guest");
-        return -1;
-    }
-#endif
-
-    ctx->save.p2m_size = nr_pfns;
-
-    if ( ctx->save.callbacks->switch_qemu_logdirty(
-             ctx->domid, 1, ctx->save.callbacks->data) )
-    {
-        PERROR("Couldn't enable qemu log-dirty mode");
-        return -1;
-    }
-
-    ctx->x86.hvm.save.qemu_enabled_logdirty = true;
-
-    return 0;
-}
-
-static int x86_hvm_static_data(struct xc_sr_context *ctx)
-{
-    return write_x86_cpu_policy_records(ctx);
-}
-
-static int x86_hvm_start_of_stream(struct xc_sr_context *ctx)
-{
-    return 0;
-}
-
-static int x86_hvm_start_of_checkpoint(struct xc_sr_context *ctx)
-{
-    return 0;
-}
-
-static int x86_hvm_check_vm_state(struct xc_sr_context *ctx)
-{
-    return 0;
-}
-
-static int x86_hvm_end_of_checkpoint(struct xc_sr_context *ctx)
-{
-    int rc;
-
-    /* Write the TSC record. */
-    rc = write_x86_tsc_info(ctx);
-    if ( rc )
-        return rc;
-
-    /* Write the HVM_CONTEXT record. */
-    rc = write_hvm_context(ctx);
-    if ( rc )
-        return rc;
-
-    /* Write an HVM_PARAMS record containing the applicable HVM params. */
-    rc = write_hvm_params(ctx);
-    if ( rc )
-        return rc;
-
-    return 0;
-}
-
-static int x86_hvm_cleanup(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-
-    /* If qemu successfully enabled logdirty mode, attempt to disable. */
-    if ( ctx->x86.hvm.save.qemu_enabled_logdirty &&
-         ctx->save.callbacks->switch_qemu_logdirty(
-             ctx->domid, 0, ctx->save.callbacks->data) )
-    {
-        PERROR("Couldn't disable qemu log-dirty mode");
-        return -1;
-    }
-
-    return 0;
-}
-
-struct xc_sr_save_ops save_ops_x86_hvm =
-{
-    .pfn_to_gfn          = x86_hvm_pfn_to_gfn,
-    .normalise_page      = x86_hvm_normalise_page,
-    .setup               = x86_hvm_setup,
-    .static_data         = x86_hvm_static_data,
-    .start_of_stream     = x86_hvm_start_of_stream,
-    .start_of_checkpoint = x86_hvm_start_of_checkpoint,
-    .end_of_checkpoint   = x86_hvm_end_of_checkpoint,
-    .check_vm_state      = x86_hvm_check_vm_state,
-    .cleanup             = x86_hvm_cleanup,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/tools/libxc/xg_sr_save_x86_pv.c b/tools/libxc/xg_sr_save_x86_pv.c
deleted file mode 100644
index 4964f1f7b8..0000000000
--- a/tools/libxc/xg_sr_save_x86_pv.c
+++ /dev/null
@@ -1,1156 +0,0 @@
-#include <assert.h>
-#include <limits.h>
-
-#include "xg_sr_common_x86_pv.h"
-
-/* Check a 64 bit virtual address for being canonical. */
-static inline bool is_canonical_address(xen_vaddr_t vaddr)
-{
-    return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
-}
-
-/*
- * Maps the guest's shared info page.
- */
-static int map_shinfo(struct xc_sr_context *ctx)
-{
-    xc_interface *xch = ctx->xch;
-
-    ctx->x86.pv.shinfo = xc_map_foreign_range(
-        xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
-    if ( !ctx->x86.pv.shinfo )
-    {
-        PERROR("Failed to map shared info frame at mfn %#lx",
-               ctx->dominfo.shared_info_frame);
-        return -1;
-    }
-
-    return 0;
-}
-
-/*
- * Copy a list of mfns from a guest, accounting for differences between guest
- * and toolstack width. Can fail if truncation would occur.
- */
-static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
-                                xen_pfn_t *dst, const void *src, size_t count)
-{
-    size_t x;
-
-    if ( ctx->x86.pv.width == sizeof(unsigned long) )
-        memcpy(dst, src, count * sizeof(*dst));
-    else
-    {
-        for ( x = 0; x < count; ++x )
-        {
-#ifdef __x86_64__
-            /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
-            uint32_t s = ((uint32_t *)src)[x];
-
-            dst[x] = s == ~0U ? INVALID_MFN : s;
-#else
-            /*
-             * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
-             * if any other truncation would occur.
-             *
-             * This will only occur on hosts where a PV guest has ram above
-             * the 16TB boundary. A 32bit dom0 is unlikely to have
-             * successfully booted on a system this large.
-             */
-            uint64_t s = ((uint64_t *)src)[x];
-
-            if ( (s != ~0ULL) && ((s >> 32) != 0) )
-            {
-                errno = E2BIG;
-                return -1;
-            }
-
-            dst[x] = s;
-#endif
-        }
-    }
-
-    return 0;
-}
-
-/*
- * Map the p2m leaf pages and build an array of their pfns.
- */
-static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
-                          size_t n_mfns)
-{
-    xc_interface *xch = ctx->xch;
-    unsigned int x;
-
-    ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
-                                           mfns, n_mfns);
-    if ( !ctx->x86.pv.p2m )
-    {
-        PERROR("Failed to map p2m frames");
-        return -1;
-    }
-
-    ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
-    ctx->x86.pv.p2m_frames = n_mfns;
-    ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
-    if ( !ctx->x86.pv.p2m_pfns )
-    {
-        ERROR("Cannot allocate %zu bytes for p2m pfns list",
-              n_mfns * sizeof(*mfns));
-        return -1;
-    }
-
-    /* Convert leaf frames from mfns to pfns. */
-    for ( x = 0; x < n_mfns; ++x )
-    {
-        if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
-        {
-            ERROR("Bad mfn in p2m_frame_list[%u]", x);
-            dump_bad_pseudophysmap_entry(ctx, mfns[x]);
-            errno = ERANGE;
-            return -1;
-        }
-
-        ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
-    }
-
-    return 0;
-}
-
-/*
- * Walk the guest's frame list list and frame list to identify and map the
- * frames making up the guest's p2m table. Construct a list of pfns making up
- * the table.
- */
-static int map_p2m_tree(struct xc_sr_context *ctx)
-{
-    /* Terminology:
-     *
-     * fll   - frame list list, top level p2m, list of fl mfns
-     * fl    - frame list, mid level p2m, list of leaf mfns
-     * local - own allocated buffers, adjusted for bitness
-     * guest - mappings into the domain
-     */
-    xc_interface *xch = ctx->xch;
-    int rc = -1;
-    unsigned int x, saved_x, fpp, fll_entries, fl_entries;
-    xen_pfn_t fll_mfn, saved_mfn, max_pfn;
-
-    xen_pfn_t *local_fll = NULL;
-    void *guest_fll = NULL;
-    size_t local_fll_size;
-
-    xen_pfn_t *local_fl = NULL;
-    void *guest_fl = NULL;
-    size_t local_fl_size;
-
-    fpp = PAGE_SIZE / ctx->x86.pv.width;
-    fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
-    if ( fll_entries > fpp )
-    {
-        ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
-        goto err;
-    }
-
-    fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
-                        ctx->x86.pv.width);
-    if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
-    {
-        ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
-        goto err;
-    }
-
-    /* Map the guest top p2m. */
-    guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
-                                     PROT_READ, fll_mfn);
-    if ( !guest_fll )
-    {
-        PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
-        goto err;
-    }
-
-    local_fll_size = fll_entries * sizeof(*local_fll);
-    local_fll = malloc(local_fll_size);
-    if ( !local_fll )
-    {
-        ERROR("Cannot allocate %zu bytes for local p2m frame list list",
-              local_fll_size);
-        goto err;
-    }
-
-    if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
-    {
-        ERROR("Truncation detected copying p2m frame list list");
-        goto err;
-    }
-
-    /* Check for bad mfns in frame list list.
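-     * (The loop below also records where the trailing run of identical
-     * entries starts; this is used to clamp max_pfn afterwards.  For scale:
-     * with 4KiB pages and 8 byte entries, fpp = 512, so each fll entry
-     * covers 512 * 512 pfns, i.e. 1GiB of guest memory.)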
*/ - saved_mfn = 0; - saved_x = 0; - for ( x = 0; x < fll_entries; ++x ) - { - if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn ) - { - ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list", - local_fll[x], x, fll_entries); - goto err; - } - if ( local_fll[x] != saved_mfn ) - { - saved_mfn = local_fll[x]; - saved_x = x; - } - } - - /* - * Check for actual lower max_pfn: - * If the trailing entries of the frame list list were all the same we can - * assume they all reference mid pages all referencing p2m pages with all - * invalid entries. Otherwise there would be multiple pfns referencing all - * the same mfn which can't work across migration, as this sharing would be - * broken by the migration process. - * Adjust max_pfn if possible to avoid allocating much larger areas as - * needed for p2m and logdirty map. - */ - max_pfn = (saved_x + 1) * fpp * fpp - 1; - if ( max_pfn < ctx->x86.pv.max_pfn ) - { - ctx->x86.pv.max_pfn = max_pfn; - fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1; - } - ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp; - DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn, - ctx->x86.pv.p2m_frames); - fl_entries = (ctx->x86.pv.max_pfn / fpp) + 1; - - /* Map the guest mid p2m frames. */ - guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, - local_fll, fll_entries); - if ( !guest_fl ) - { - PERROR("Failed to map p2m frame list"); - goto err; - } - - local_fl_size = fl_entries * sizeof(*local_fl); - local_fl = malloc(local_fl_size); - if ( !local_fl ) - { - ERROR("Cannot allocate %zu bytes for local p2m frame list", - local_fl_size); - goto err; - } - - if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) ) - { - ERROR("Truncation detected copying p2m frame list"); - goto err; - } - - for ( x = 0; x < fl_entries; ++x ) - { - if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn ) - { - ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list", - local_fl[x], x, fl_entries); - goto err; - } - } - - /* Map the p2m leaves themselves. */ - rc = map_p2m_leaves(ctx, local_fl, fl_entries); - - err: - free(local_fl); - if ( guest_fl ) - munmap(guest_fl, fll_entries * PAGE_SIZE); - - free(local_fll); - if ( guest_fll ) - munmap(guest_fll, PAGE_SIZE); - - return rc; -} - -/* - * Get p2m_generation count. - * Returns an error if the generation count has changed since the last call. - */ -static int get_p2m_generation(struct xc_sr_context *ctx) -{ - uint64_t p2m_generation; - int rc; - - p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation, - ctx->x86.pv.width); - - rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1; - ctx->x86.pv.p2m_generation = p2m_generation; - - return rc; -} - -static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; - - if ( !ctx->save.live ) - return 0; - - rc = get_p2m_generation(ctx); - if ( rc ) - ERROR("p2m generation count changed. Migration aborted."); - - return rc; -} - -/* - * Map the guest p2m frames specified via a cr3 value, a virtual address, and - * the maximum pfn. PTE entries are 64 bits for both, 32 and 64 bit guests as - * in 32 bit case we support PAE guests only. 
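- *
- * (Each level of the walk below consumes 9 bits of virtual address,
- * matching shift = level * 9 + 3, i.e. 512 eight-byte PTEs per page.)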
- */ -static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3) -{ - xc_interface *xch = ctx->xch; - xen_vaddr_t p2m_vaddr, p2m_end, mask, off; - xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn; - uint64_t *ptes = NULL; - xen_pfn_t *mfns = NULL; - unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx; - int rc = -1; - - p2m_mfn = cr3_to_mfn(ctx, p2m_cr3); - assert(p2m_mfn != 0); - if ( p2m_mfn > ctx->x86.pv.max_mfn ) - { - ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3); - errno = ERANGE; - goto err; - } - - get_p2m_generation(ctx); - - p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr, - ctx->x86.pv.width); - fpp = PAGE_SIZE / ctx->x86.pv.width; - ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1; - p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1; - - if ( ctx->x86.pv.width == 8 ) - { - mask = 0x0000ffffffffffffULL; - if ( !is_canonical_address(p2m_vaddr) || - !is_canonical_address(p2m_end) || - p2m_end < p2m_vaddr || - (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 && - p2m_end > HYPERVISOR_VIRT_START_X86_64) ) - { - ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64, - p2m_vaddr, p2m_end); - errno = ERANGE; - goto err; - } - } - else - { - mask = 0x00000000ffffffffULL; - if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr || - (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 && - p2m_end > HYPERVISOR_VIRT_START_X86_32) ) - { - ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64, - p2m_vaddr, p2m_end); - errno = ERANGE; - goto err; - } - } - - DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx", - p2m_vaddr, p2m_end, p2m_mfn); - DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn, - ctx->x86.pv.p2m_frames); - - mfns = malloc(sizeof(*mfns)); - if ( !mfns ) - { - ERROR("Cannot allocate memory for array of %u mfns", 1); - goto err; - } - mfns[0] = p2m_mfn; - off = 0; - saved_mfn = 0; - idx_start = idx_end = saved_idx = 0; - - for ( level = ctx->x86.pv.levels; level > 0; level-- ) - { - n_pages = idx_end - idx_start + 1; - ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages); - if ( !ptes ) - { - PERROR("Failed to map %u page table pages for p2m list", n_pages); - goto err; - } - free(mfns); - - shift = level * 9 + 3; - idx_start = ((p2m_vaddr - off) & mask) >> shift; - idx_end = ((p2m_end - off) & mask) >> shift; - idx = idx_end - idx_start + 1; - mfns = malloc(sizeof(*mfns) * idx); - if ( !mfns ) - { - ERROR("Cannot allocate memory for array of %u mfns", idx); - goto err; - } - - for ( idx = idx_start; idx <= idx_end; idx++ ) - { - mfn = pte_to_frame(ptes[idx]); - if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn ) - { - ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list", - mfn, off + ((xen_vaddr_t)idx << shift), level); - errno = ERANGE; - goto err; - } - mfns[idx - idx_start] = mfn; - - /* Maximum pfn check at level 2. Same reasoning as for p2m tree. 
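-             * (A run of identical trailing entries means the tail of the
-             * p2m holds no valid mappings, so max_pfn can be clamped below.)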
*/ - if ( level == 2 ) - { - if ( mfn != saved_mfn ) - { - saved_mfn = mfn; - saved_idx = idx - idx_start; - } - } - } - - if ( level == 2 ) - { - if ( saved_idx == idx_end ) - saved_idx++; - max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1; - if ( max_pfn < ctx->x86.pv.max_pfn ) - { - ctx->x86.pv.max_pfn = max_pfn; - ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp; - p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1; - idx_end = idx_start + saved_idx; - } - } - - munmap(ptes, n_pages * PAGE_SIZE); - ptes = NULL; - off = p2m_vaddr & ((mask >> shift) << shift); - } - - /* Map the p2m leaves themselves. */ - rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1); - - err: - free(mfns); - if ( ptes ) - munmap(ptes, n_pages * PAGE_SIZE); - - return rc; -} - -/* - * Map the guest p2m frames. - * Depending on guest support this might either be a virtual mapped linear - * list (preferred format) or a 3 level tree linked via mfns. - */ -static int map_p2m(struct xc_sr_context *ctx) -{ - uint64_t p2m_cr3; - - ctx->x86.pv.p2m_generation = ~0ULL; - ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn, - ctx->x86.pv.width) - 1; - p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width); - - return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx); -} - -/* - * Obtain a specific vcpus basic state and write an X86_PV_VCPU_BASIC record - * into the stream. Performs mfn->pfn conversion on architectural state. - */ -static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id) -{ - xc_interface *xch = ctx->xch; - xen_pfn_t mfn, pfn; - unsigned int i, gdt_count; - int rc = -1; - vcpu_guest_context_any_t vcpu; - struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { - .vcpu_id = id, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_VCPU_BASIC, - .length = sizeof(vhdr), - .data = &vhdr, - }; - - if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) ) - { - PERROR("Failed to get vcpu%u context", id); - goto err; - } - - /* Vcpu0 is special: Convert the suspend record to a pfn. */ - if ( id == 0 ) - { - mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Bad mfn for suspend record"); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - goto err; - } - SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn), - ctx->x86.pv.width); - } - - gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width); - if ( gdt_count > FIRST_RESERVED_GDT_ENTRY ) - { - ERROR("GDT entry count (%u) out of range (max %u)", - gdt_count, FIRST_RESERVED_GDT_ENTRY); - errno = ERANGE; - goto err; - } - gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */ - - /* Convert GDT frames to pfns. */ - for ( i = 0; i < gdt_count; ++i ) - { - mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - goto err; - } - SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn), - ctx->x86.pv.width); - } - - /* Convert CR3 to a pfn. */ - mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width)); - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Bad mfn for vcpu%u's cr3", id); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - goto err; - } - pfn = mfn_to_pfn(ctx, mfn); - SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width); - - /* 64bit guests: Convert CR1 (guest pagetables) to pfn. 
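-     * (ctrlreg[1] holds the user-mode pagetable base; the low bit is set
-     * again when the converted pfn is written back below.)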
*/ - if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] ) - { - mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT; - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - ERROR("Bad mfn for vcpu%u's cr1", id); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - goto err; - } - pfn = mfn_to_pfn(ctx, mfn); - vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT); - } - - if ( ctx->x86.pv.width == 8 ) - rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64)); - else - rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32)); - - err: - return rc; -} - -/* - * Obtain a specific vcpus extended state and write an X86_PV_VCPU_EXTENDED - * record into the stream. - */ -static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id) -{ - xc_interface *xch = ctx->xch; - struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { - .vcpu_id = id, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_VCPU_EXTENDED, - .length = sizeof(vhdr), - .data = &vhdr, - }; - struct xen_domctl domctl = { - .cmd = XEN_DOMCTL_get_ext_vcpucontext, - .domain = ctx->domid, - .u.ext_vcpucontext.vcpu = id, - }; - - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("Unable to get vcpu%u extended context", id); - return -1; - } - - /* No content? Skip the record. */ - if ( domctl.u.ext_vcpucontext.size == 0 ) - return 0; - - return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext, - domctl.u.ext_vcpucontext.size); -} - -/* - * Query to see whether a specific vcpu has xsave state and if so, write an - * X86_PV_VCPU_XSAVE record into the stream. - */ -static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id) -{ - xc_interface *xch = ctx->xch; - int rc = -1; - DECLARE_HYPERCALL_BUFFER(void, buffer); - struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { - .vcpu_id = id, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_VCPU_XSAVE, - .length = sizeof(vhdr), - .data = &vhdr, - }; - struct xen_domctl domctl = { - .cmd = XEN_DOMCTL_getvcpuextstate, - .domain = ctx->domid, - .u.vcpuextstate.vcpu = id, - }; - - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("Unable to get vcpu%u's xsave context", id); - goto err; - } - - /* No xsave state? skip this record. */ - if ( !domctl.u.vcpuextstate.xfeature_mask ) - goto out; - - buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size); - if ( !buffer ) - { - ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context", - domctl.u.vcpuextstate.size, id); - goto err; - } - - set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer); - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("Unable to get vcpu%u's xsave context", id); - goto err; - } - - /* No xsave state? Skip this record. */ - if ( domctl.u.vcpuextstate.size == 0 ) - goto out; - - rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size); - if ( rc ) - goto err; - - out: - rc = 0; - - err: - xc_hypercall_buffer_free(xch, buffer); - - return rc; -} - -/* - * Query to see whether a specific vcpu has msr state and if so, write an - * X86_PV_VCPU_MSRS record into the stream. 
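- *
- * (Same two-pass domctl pattern as the xsave path above: query the msr
- * count first, then allocate a hypercall buffer and fetch the data.)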
- */ -static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id) -{ - xc_interface *xch = ctx->xch; - int rc = -1; - size_t buffersz; - DECLARE_HYPERCALL_BUFFER(void, buffer); - struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = { - .vcpu_id = id, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_VCPU_MSRS, - .length = sizeof(vhdr), - .data = &vhdr, - }; - struct xen_domctl domctl = { - .cmd = XEN_DOMCTL_get_vcpu_msrs, - .domain = ctx->domid, - .u.vcpu_msrs.vcpu = id, - }; - - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("Unable to get vcpu%u's msrs", id); - goto err; - } - - /* No MSRs? skip this record. */ - if ( !domctl.u.vcpu_msrs.msr_count ) - goto out; - - buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t); - buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz); - if ( !buffer ) - { - ERROR("Unable to allocate %zu bytes for vcpu%u's msrs", - buffersz, id); - goto err; - } - - set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer); - if ( xc_domctl(xch, &domctl) < 0 ) - { - PERROR("Unable to get vcpu%u's msrs", id); - goto err; - } - - /* No MSRs? Skip this record. */ - if ( domctl.u.vcpu_msrs.msr_count == 0 ) - goto out; - - rc = write_split_record(ctx, &rec, buffer, - domctl.u.vcpu_msrs.msr_count * - sizeof(xen_domctl_vcpu_msr_t)); - if ( rc ) - goto err; - - out: - rc = 0; - - err: - xc_hypercall_buffer_free(xch, buffer); - - return rc; -} - -/* - * For each vcpu, if it is online, write its state into the stream. - */ -static int write_all_vcpu_information(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - xc_vcpuinfo_t vinfo; - unsigned int i; - int rc; - - for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i ) - { - rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo); - if ( rc ) - { - PERROR("Failed to get vcpu%u information", i); - return rc; - } - - /* Vcpu offline? skip all these records. */ - if ( !vinfo.online ) - continue; - - rc = write_one_vcpu_basic(ctx, i); - if ( rc ) - return rc; - - rc = write_one_vcpu_extended(ctx, i); - if ( rc ) - return rc; - - rc = write_one_vcpu_xsave(ctx, i); - if ( rc ) - return rc; - - rc = write_one_vcpu_msrs(ctx, i); - if ( rc ) - return rc; - } - - return 0; -} - -/* - * Writes an X86_PV_INFO record into the stream. - */ -static int write_x86_pv_info(struct xc_sr_context *ctx) -{ - struct xc_sr_rec_x86_pv_info info = { - .guest_width = ctx->x86.pv.width, - .pt_levels = ctx->x86.pv.levels, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_INFO, - .length = sizeof(info), - .data = &info, - }; - - return write_record(ctx, &rec); -} - -/* - * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list - * of pfns making up the p2m table. - */ -static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx) -{ - xc_interface *xch = ctx->xch; - int rc; unsigned int i; - size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t); - uint64_t *data = NULL; - struct xc_sr_rec_x86_pv_p2m_frames hdr = { - .end_pfn = ctx->x86.pv.max_pfn, - }; - struct xc_sr_record rec = { - .type = REC_TYPE_X86_PV_P2M_FRAMES, - .length = sizeof(hdr), - .data = &hdr, - }; - - /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). 
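-     * (In that case p2m_pfns can be written out directly; otherwise each
-     * entry is widened to 64 bits first.)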
*/ - if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) ) - { - if ( !(data = malloc(datasz)) ) - { - ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data", - datasz); - return -1; - } - - for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i ) - data[i] = ctx->x86.pv.p2m_pfns[i]; - } - else - data = (uint64_t *)ctx->x86.pv.p2m_pfns; - - rc = write_split_record(ctx, &rec, data, datasz); - - if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns ) - free(data); - - return rc; -} - -/* - * Writes an SHARED_INFO record into the stream. - */ -static int write_shared_info(struct xc_sr_context *ctx) -{ - struct xc_sr_record rec = { - .type = REC_TYPE_SHARED_INFO, - .length = PAGE_SIZE, - .data = ctx->x86.pv.shinfo, - }; - - return write_record(ctx, &rec); -} - -/* - * Normalise a pagetable for the migration stream. Performs mfn->pfn - * conversions on the ptes. - */ -static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src, - uint64_t *dst, unsigned long type) -{ - xc_interface *xch = ctx->xch; - uint64_t pte; - unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */ - - type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; - - if ( ctx->x86.pv.levels == 4 ) - { - /* 64bit guests only have Xen mappings in their L4 tables. */ - if ( type == XEN_DOMCTL_PFINFO_L4TAB ) - { - xen_first = (HYPERVISOR_VIRT_START_X86_64 >> - L4_PAGETABLE_SHIFT_X86_64) & 511; - xen_last = (HYPERVISOR_VIRT_END_X86_64 >> - L4_PAGETABLE_SHIFT_X86_64) & 511; - } - } - else - { - switch ( type ) - { - case XEN_DOMCTL_PFINFO_L4TAB: - ERROR("??? Found L4 table for 32bit guest"); - errno = EINVAL; - return -1; - - case XEN_DOMCTL_PFINFO_L3TAB: - /* 32bit guests can only use the first 4 entries of their L3 tables. - * All other are potentially used by Xen. */ - xen_first = 4; - xen_last = 511; - break; - - case XEN_DOMCTL_PFINFO_L2TAB: - /* It is hard to spot Xen mappings in a 32bit guest's L2. Most - * are normal but only a few will have Xen mappings. - */ - i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511; - if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 ) - { - xen_first = i; - xen_last = (HYPERVISOR_VIRT_END_X86_32 >> - L2_PAGETABLE_SHIFT_PAE) & 511; - } - break; - } - } - - for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i ) - { - xen_pfn_t mfn; - - pte = src[i]; - - /* Remove Xen mappings: Xen will reconstruct on the other side. */ - if ( i >= xen_first && i <= xen_last ) - pte = 0; - - /* - * Errors during the live part of migration are expected as a result - * of split pagetable updates, page type changes, active grant - * mappings etc. The pagetable will need to be resent after pausing. - * In such cases we fail with EAGAIN. - * - * For domains which are already paused, errors are fatal. - */ - if ( pte & _PAGE_PRESENT ) - { - mfn = pte_to_frame(pte); - -#ifdef __i386__ - if ( mfn == INVALID_MFN ) - { - if ( !ctx->dominfo.paused ) - errno = EAGAIN; - else - { - ERROR("PTE truncation detected. 
L%lu[%u] = %016"PRIx64, - type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); - errno = E2BIG; - } - return -1; - } -#endif - - if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) ) - { - ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")", - type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte); - errno = E2BIG; - return -1; - } - - if ( !mfn_in_pseudophysmap(ctx, mfn) ) - { - if ( !ctx->dominfo.paused ) - errno = EAGAIN; - else - { - ERROR("Bad mfn for L%lu[%u]", - type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i); - dump_bad_pseudophysmap_entry(ctx, mfn); - errno = ERANGE; - } - return -1; - } - - pte = merge_pte(pte, mfn_to_pfn(ctx, mfn)); - } - - dst[i] = pte; - } - - return 0; -} - -static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx, - xen_pfn_t pfn) -{ - assert(pfn <= ctx->x86.pv.max_pfn); - - return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width); -} - - -/* - * save_ops function. Performs pagetable normalisation on appropriate pages. - */ -static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type, - void **page) -{ - xc_interface *xch = ctx->xch; - void *local_page; - int rc; - - type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; - - if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB ) - return 0; - - local_page = malloc(PAGE_SIZE); - if ( !local_page ) - { - ERROR("Unable to allocate scratch page"); - rc = -1; - goto out; - } - - rc = normalise_pagetable(ctx, *page, local_page, type); - *page = local_page; - - out: - return rc; -} - -/* - * save_ops function. Queries domain information and maps the Xen m2p and the - * guests shinfo and p2m table. - */ -static int x86_pv_setup(struct xc_sr_context *ctx) -{ - int rc; - - rc = x86_pv_domain_info(ctx); - if ( rc ) - return rc; - - rc = x86_pv_map_m2p(ctx); - if ( rc ) - return rc; - - rc = map_shinfo(ctx); - if ( rc ) - return rc; - - rc = map_p2m(ctx); - if ( rc ) - return rc; - - return 0; -} - -static int x86_pv_static_data(struct xc_sr_context *ctx) -{ - int rc; - - rc = write_x86_pv_info(ctx); - if ( rc ) - return rc; - - rc = write_x86_cpu_policy_records(ctx); - if ( rc ) - return rc; - - return 0; -} - -static int x86_pv_start_of_stream(struct xc_sr_context *ctx) -{ - int rc; - - /* - * Ideally should be able to change during migration. Currently - * corruption will occur if the contents or location of the P2M changes - * during the live migration loop. If one is very lucky, the breakage - * will not be subtle. 
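- *
- * (x86_pv_check_vm_state below re-reads arch.p2m_generation on each
- * iteration, so at least a relocated p2m list aborts the migration
- * rather than corrupting it.)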
- */ - rc = write_x86_pv_p2m_frames(ctx); - if ( rc ) - return rc; - - return 0; -} - -static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx) -{ - return 0; -} - -static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx) -{ - int rc; - - rc = write_x86_tsc_info(ctx); - if ( rc ) - return rc; - - rc = write_shared_info(ctx); - if ( rc ) - return rc; - - rc = write_all_vcpu_information(ctx); - if ( rc ) - return rc; - - return 0; -} - -static int x86_pv_check_vm_state(struct xc_sr_context *ctx) -{ - if ( ctx->x86.pv.p2m_generation == ~0ULL ) - return 0; - - return x86_pv_check_vm_state_p2m_list(ctx); -} - -static int x86_pv_cleanup(struct xc_sr_context *ctx) -{ - free(ctx->x86.pv.p2m_pfns); - - if ( ctx->x86.pv.p2m ) - munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE); - - if ( ctx->x86.pv.shinfo ) - munmap(ctx->x86.pv.shinfo, PAGE_SIZE); - - if ( ctx->x86.pv.m2p ) - munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE); - - return 0; -} - -struct xc_sr_save_ops save_ops_x86_pv = -{ - .pfn_to_gfn = x86_pv_pfn_to_gfn, - .normalise_page = x86_pv_normalise_page, - .setup = x86_pv_setup, - .static_data = x86_pv_static_data, - .start_of_stream = x86_pv_start_of_stream, - .start_of_checkpoint = x86_pv_start_of_checkpoint, - .end_of_checkpoint = x86_pv_end_of_checkpoint, - .check_vm_state = x86_pv_check_vm_state, - .cleanup = x86_pv_cleanup, -}; - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_sr_stream_format.h b/tools/libxc/xg_sr_stream_format.h deleted file mode 100644 index 8a0da26f75..0000000000 --- a/tools/libxc/xg_sr_stream_format.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef __STREAM_FORMAT__H -#define __STREAM_FORMAT__H - -/* - * C structures for the Migration v2 stream format. 
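- *
- * (Layout sketch: one image header (xc_sr_ihdr), one domain header
- * (xc_sr_dhdr), then a sequence of records, each an xc_sr_rhdr plus
- * payload padded to an 8 octet boundary, ending with a REC_TYPE_END
- * record.)
- *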
- * See docs/specs/libxc-migration-stream.pandoc - */ - -#include - -/* - * Image Header - */ -struct xc_sr_ihdr -{ - uint64_t marker; - uint32_t id; - uint32_t version; - uint16_t options; - uint16_t _res1; - uint32_t _res2; -}; - -#define IHDR_MARKER 0xffffffffffffffffULL -#define IHDR_ID 0x58454E46U - -#define _IHDR_OPT_ENDIAN 0 -#define IHDR_OPT_LITTLE_ENDIAN (0 << _IHDR_OPT_ENDIAN) -#define IHDR_OPT_BIG_ENDIAN (1 << _IHDR_OPT_ENDIAN) - -/* - * Domain Header - */ -struct xc_sr_dhdr -{ - uint32_t type; - uint16_t page_shift; - uint16_t _res1; - uint32_t xen_major; - uint32_t xen_minor; -}; - -#define DHDR_TYPE_X86_PV 0x00000001U -#define DHDR_TYPE_X86_HVM 0x00000002U - -/* - * Record Header - */ -struct xc_sr_rhdr -{ - uint32_t type; - uint32_t length; -}; - -/* All records must be aligned up to an 8 octet boundary */ -#define REC_ALIGN_ORDER (3U) -/* Somewhat arbitrary - 128MB */ -#define REC_LENGTH_MAX (128U << 20) - -#define REC_TYPE_END 0x00000000U -#define REC_TYPE_PAGE_DATA 0x00000001U -#define REC_TYPE_X86_PV_INFO 0x00000002U -#define REC_TYPE_X86_PV_P2M_FRAMES 0x00000003U -#define REC_TYPE_X86_PV_VCPU_BASIC 0x00000004U -#define REC_TYPE_X86_PV_VCPU_EXTENDED 0x00000005U -#define REC_TYPE_X86_PV_VCPU_XSAVE 0x00000006U -#define REC_TYPE_SHARED_INFO 0x00000007U -#define REC_TYPE_X86_TSC_INFO 0x00000008U -#define REC_TYPE_HVM_CONTEXT 0x00000009U -#define REC_TYPE_HVM_PARAMS 0x0000000aU -#define REC_TYPE_TOOLSTACK 0x0000000bU -#define REC_TYPE_X86_PV_VCPU_MSRS 0x0000000cU -#define REC_TYPE_VERIFY 0x0000000dU -#define REC_TYPE_CHECKPOINT 0x0000000eU -#define REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST 0x0000000fU -#define REC_TYPE_STATIC_DATA_END 0x00000010U -#define REC_TYPE_X86_CPUID_POLICY 0x00000011U -#define REC_TYPE_X86_MSR_POLICY 0x00000012U - -#define REC_TYPE_OPTIONAL 0x80000000U - -/* PAGE_DATA */ -struct xc_sr_rec_page_data_header -{ - uint32_t count; - uint32_t _res1; - uint64_t pfn[0]; -}; - -#define PAGE_DATA_PFN_MASK 0x000fffffffffffffULL -#define PAGE_DATA_TYPE_MASK 0xf000000000000000ULL - -/* X86_PV_INFO */ -struct xc_sr_rec_x86_pv_info -{ - uint8_t guest_width; - uint8_t pt_levels; - uint8_t _res[6]; -}; - -/* X86_PV_P2M_FRAMES */ -struct xc_sr_rec_x86_pv_p2m_frames -{ - uint32_t start_pfn; - uint32_t end_pfn; - uint64_t p2m_pfns[0]; -}; - -/* X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} */ -struct xc_sr_rec_x86_pv_vcpu_hdr -{ - uint32_t vcpu_id; - uint32_t _res1; - uint8_t context[0]; -}; - -/* X86_TSC_INFO */ -struct xc_sr_rec_x86_tsc_info -{ - uint32_t mode; - uint32_t khz; - uint64_t nsec; - uint32_t incarnation; - uint32_t _res1; -}; - -/* HVM_PARAMS */ -struct xc_sr_rec_hvm_params_entry -{ - uint64_t index; - uint64_t value; -}; - -struct xc_sr_rec_hvm_params -{ - uint32_t count; - uint32_t _res1; - struct xc_sr_rec_hvm_params_entry param[0]; -}; - -#endif -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/tools/libxc/xg_suspend.c b/tools/libxc/xg_suspend.c deleted file mode 100644 index 0ce6364963..0000000000 --- a/tools/libxc/xg_suspend.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; - * version 2.1 of the License. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; If not, see . - */ - -#include -#include - -#include - -#include "xc_private.h" -#include "xenguest.h" - -#define SUSPEND_LOCK_FILE XEN_RUN_DIR "/suspend-evtchn-%d.lock" - -/* - * locking - */ - -#define ERR(x) do{ \ - ERROR("Can't " #x " lock file for suspend event channel %s: %s\n", \ - suspend_file, strerror(errno)); \ - goto err; \ -}while(0) - -#define SUSPEND_FILE_BUFLEN (sizeof(SUSPEND_LOCK_FILE) + 10) - -static void get_suspend_file(char buf[], uint32_t domid) -{ - snprintf(buf, SUSPEND_FILE_BUFLEN, SUSPEND_LOCK_FILE, domid); -} - -static int lock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd) -{ - int fd = -1, r; - char suspend_file[SUSPEND_FILE_BUFLEN]; - struct stat ours, theirs; - struct flock fl; - - get_suspend_file(suspend_file, domid); - - *lockfd = -1; - - for (;;) { - if (fd >= 0) - close (fd); - - fd = open(suspend_file, O_CREAT | O_RDWR, 0600); - if (fd < 0) - ERR("create"); - - r = fcntl(fd, F_SETFD, FD_CLOEXEC); - if (r) - ERR("fcntl F_SETFD FD_CLOEXEC"); - - memset(&fl, 0, sizeof(fl)); - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_len = 1; - r = fcntl(fd, F_SETLK, &fl); - if (r) - ERR("fcntl F_SETLK"); - - r = fstat(fd, &ours); - if (r) - ERR("fstat"); - - r = stat(suspend_file, &theirs); - if (r) { - if (errno == ENOENT) - /* try again */ - continue; - ERR("stat"); - } - - if (ours.st_ino != theirs.st_ino) - /* someone else must have removed it while we were locking it */ - continue; - - break; - } - - *lockfd = fd; - return 0; - - err: - if (fd >= 0) - close(fd); - - return -1; -} - -static int unlock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd) -{ - int r; - char suspend_file[SUSPEND_FILE_BUFLEN]; - - if (*lockfd < 0) - return 0; - - get_suspend_file(suspend_file, domid); - - r = unlink(suspend_file); - if (r) - ERR("unlink"); - - r = close(*lockfd); - *lockfd = -1; - if (r) - ERR("close"); - - err: - if (*lockfd >= 0) - close(*lockfd); - - return -1; -} - -int xc_await_suspend(xc_interface *xch, xenevtchn_handle *xce, int suspend_evtchn) -{ - int rc; - - do { - rc = xenevtchn_pending(xce); - if (rc < 0) { - ERROR("error polling suspend notification channel: %d", rc); - return -1; - } - } while (rc != suspend_evtchn); - - /* harmless for one-off suspend */ - if (xenevtchn_unmask(xce, suspend_evtchn) < 0) - ERROR("failed to unmask suspend notification channel: %d", rc); - - return 0; -} - -/* Internal callers are allowed to call this with suspend_evtchn<0 - * but *lockfd>0. 
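- * (in which case only the lock file is dropped; no event channel is
- * unbound).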
*/ -int xc_suspend_evtchn_release(xc_interface *xch, xenevtchn_handle *xce, - uint32_t domid, int suspend_evtchn, int *lockfd) -{ - if (suspend_evtchn >= 0) - xenevtchn_unbind(xce, suspend_evtchn); - - return unlock_suspend_event(xch, domid, lockfd); -} - -int xc_suspend_evtchn_init_sane(xc_interface *xch, xenevtchn_handle *xce, - uint32_t domid, int port, int *lockfd) -{ - int rc, suspend_evtchn = -1; - - if (lock_suspend_event(xch, domid, lockfd)) { - errno = EINVAL; - goto cleanup; - } - - suspend_evtchn = xenevtchn_bind_interdomain(xce, domid, port); - if (suspend_evtchn < 0) { - ERROR("failed to bind suspend event channel: %d", suspend_evtchn); - goto cleanup; - } - - rc = xc_domain_subscribe_for_suspend(xch, domid, port); - if (rc < 0) { - ERROR("failed to subscribe to domain: %d", rc); - goto cleanup; - } - - return suspend_evtchn; - -cleanup: - xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, lockfd); - - return -1; -} - -int xc_suspend_evtchn_init_exclusive(xc_interface *xch, xenevtchn_handle *xce, - uint32_t domid, int port, int *lockfd) -{ - int suspend_evtchn; - - suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, lockfd); - if (suspend_evtchn < 0) - return suspend_evtchn; - - /* event channel is pending immediately after binding */ - xc_await_suspend(xch, xce, suspend_evtchn); - - return suspend_evtchn; -}
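
For orientation, a minimal usage sketch of the suspend event-channel helpers
above (illustrative only, not part of the patch; the do_suspend wrapper is
hypothetical and assumes xenctrl.h/xenevtchn.h are available and that port is
the suspend port advertised by the guest):

    static int do_suspend(xc_interface *xch, uint32_t domid, int port)
    {
        xenevtchn_handle *xce = xenevtchn_open(NULL, 0);
        int lockfd = -1, rc = -1, suspend_evtchn;

        if ( !xce )
            return -1;

        /* Take the per-domain lock and bind the suspend event channel. */
        suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid,
                                                     port, &lockfd);
        if ( suspend_evtchn >= 0 )
        {
            /* Ask the guest to suspend (e.g. via xenstore), then: */
            rc = xc_await_suspend(xch, xce, suspend_evtchn);

            /* Unbind the channel and drop the lock file again. */
            xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn,
                                      &lockfd);
        }

        xenevtchn_close(xce);
        return rc;
    }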