tools/libxc now contains only libxenguest, so move it to tools/libs/guest.
When generating the pkg-config file for libxenguest, a filter is now
required to replace "xenctrl" with "xencontrol" in the
"Requires.private:" entry. Add this filter to tools/libs/libs.mk.
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Samuel Thibault <samuel.thibault@ens-lyon.org> (stubdom parts)
stubdom/ioemu
stubdom/ioemu/
stubdom/libs-*
-stubdom/libxc-*
stubdom/libxencall-*
stubdom/libxenevtchn-*
stubdom/libxenforeignmemory-*
tools/libs/foreignmemory/xenforeignmemory.pc
tools/libs/devicemodel/headers.chk
tools/libs/devicemodel/xendevicemodel.pc
+tools/libs/guest/_*.[ch]
+tools/libs/guest/libxenguest.map
+tools/libs/guest/xenguest.pc
+tools/libs/guest/xc_bitops.h
+tools/libs/guest/xc_core.h
+tools/libs/guest/xc_core_arm.h
+tools/libs/guest/xc_core_x86.h
+tools/libs/guest/xc_private.h
tools/console/xenconsole
tools/console/xenconsoled
tools/console/client/_paths.h
tools/include/xen-foreign/*.(c|h|size)
tools/include/xen-foreign/checker
tools/libvchan/xenvchan.pc
-tools/libxc/*.pc
-tools/libxc/xc_bitops.h
-tools/libxc/xc_core.h
-tools/libxc/xc_core_arm.h
-tools/libxc/xc_core_x86.h
-tools/libxc/xc_private.h
tools/libxl/_libxl.api-for-check
tools/libxl/*.api-ok
tools/libxl/*.pc
tools/misc/xen-hptool
tools/misc/xen-mfndump
tools/libs/toolcore/include/_*.h
-tools/libxc/_*.[ch]
tools/libxl/_*.[ch]
tools/libxl/testidl
tools/libxl/testidl.c
# libraries under tools/libs
#######
-STUB_LIBS := toolcore toollog evtchn gnttab call foreignmemory devicemodel ctrl
+STUB_LIBS := toolcore toollog evtchn gnttab call foreignmemory devicemodel ctrl guest
+
+LIBDEP_guest := cross-zlib
#######
# common handling
$(foreach lib,$(STUB_LIBS),$(eval $(call BUILD_lib,$(lib))))
-libxc-$(XEN_TARGET_ARCH)/stamp: $(XEN_ROOT)/tools/libxc/Makefile
- $(do_links)
-
xenstore/stamp: $(XEN_ROOT)/tools/xenstore/Makefile
$(do_links)
-LINK_DIRS := libxc-$(XEN_TARGET_ARCH) xenstore $(foreach dir,$(STUB_LIBS),libs-$(XEN_TARGET_ARCH)/$(dir))
+LINK_DIRS := xenstore $(foreach dir,$(STUB_LIBS),libs-$(XEN_TARGET_ARCH)/$(dir))
LINK_STAMPS := $(foreach dir,$(LINK_DIRS),$(dir)/stamp)
mk-headers-$(XEN_TARGET_ARCH): $(IOEMU_LINKFARM_TARGET) $(LINK_STAMPS)
mkdir -p $@/$$i ; \
done
-#######
-# libxc
-#######
-
-.PHONY: libxc
-libxc: libxc-$(XEN_TARGET_ARCH)/libxenguest.a
-libxc-$(XEN_TARGET_ARCH)/libxenguest.a: libxenevtchn libxenctrl cross-zlib
-libxc-$(XEN_TARGET_ARCH)/libxenguest.a: mk-headers-$(XEN_TARGET_ARCH) $(NEWLIB_STAMPFILE)
- CPPFLAGS="$(TARGET_CPPFLAGS)" CFLAGS="$(TARGET_CFLAGS)" $(MAKE) DESTDIR= CONFIG_LIBXC_MINIOS=y -C libxc-$(XEN_TARGET_ARCH)
-
#######
# ioemu
#######
MINIOS_CONFIG="$<" CONFIG_FILE="$(CURDIR)/$@" $(MAKE) DESTDIR= -C $(MINI_OS) config
.PHONY: ioemu
-ioemu: cross-zlib cross-libpci libxc ioemu-minios-config.mk
+ioemu: cross-zlib cross-libpci libxenguest ioemu-minios-config.mk
[ -f ioemu/config-host.mak ] || \
( $(buildmakevars2shellvars); \
cd ioemu ; \
.PHONY: ioemu-stubdom
ioemu-stubdom: APP_OBJS=$(CURDIR)/ioemu/i386-stubdom/qemu.a $(CURDIR)/ioemu/i386-stubdom/libqemu.a $(CURDIR)/ioemu/libqemu_common.a
-ioemu-stubdom: mini-os-$(XEN_TARGET_ARCH)-ioemu lwip-$(XEN_TARGET_ARCH) libxc ioemu
+ioemu-stubdom: mini-os-$(XEN_TARGET_ARCH)-ioemu lwip-$(XEN_TARGET_ARCH) libxenguest ioemu
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/ioemu-minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS="$(APP_OBJS)"
.PHONY: caml-stubdom
-caml-stubdom: mini-os-$(XEN_TARGET_ARCH)-caml lwip-$(XEN_TARGET_ARCH) libxc cross-ocaml caml
+caml-stubdom: mini-os-$(XEN_TARGET_ARCH)-caml lwip-$(XEN_TARGET_ARCH) libxenguest cross-ocaml caml
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/caml/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS="$(CURDIR)/caml/main-caml.o $(CURDIR)/caml/caml.o $(CAMLLIB)/libasmrun.a"
.PHONY: c-stubdom
-c-stubdom: mini-os-$(XEN_TARGET_ARCH)-c lwip-$(XEN_TARGET_ARCH) libxc c
+c-stubdom: mini-os-$(XEN_TARGET_ARCH)-c lwip-$(XEN_TARGET_ARCH) libxenguest c
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/c/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) APP_OBJS=$(CURDIR)/c/main.a
.PHONY: vtpm-stubdom
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/vtpmmgr/minios.cfg" $(MAKE) -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS="$(CURDIR)/vtpmmgr/vtpmmgr.a" APP_LDLIBS="-lm -lpolarssl"
.PHONY: pv-grub
-pv-grub: mini-os-$(XEN_TARGET_ARCH)-grub libxc grub
+pv-grub: mini-os-$(XEN_TARGET_ARCH)-grub libxenguest grub
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/grub/minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS=$(CURDIR)/grub-$(XEN_TARGET_ARCH)/main.a
.PHONY: xenstore-stubdom
-xenstore-stubdom: mini-os-$(XEN_TARGET_ARCH)-xenstore libxc xenstore
+xenstore-stubdom: mini-os-$(XEN_TARGET_ARCH)-xenstore libxenguest xenstore
DEF_CPPFLAGS="$(TARGET_CPPFLAGS)" DEF_CFLAGS="$(TARGET_CFLAGS)" DEF_LDFLAGS="$(TARGET_LDFLAGS)" MINIOS_CONFIG="$(CURDIR)/xenstore-minios.cfg" $(MAKE) DESTDIR= -C $(MINI_OS) OBJ_DIR=$(CURDIR)/$< APP_OBJS=$(CURDIR)/xenstore/xenstored.a
#########
rm -f $(STUBDOMPATH)
rm -f *-minios-config.mk
rm -fr pkg-config
- [ ! -e libxc-$(XEN_TARGET_ARCH)/Makefile ] || $(MAKE) DESTDIR= -C libxc-$(XEN_TARGET_ARCH) clean
-[ ! -d ioemu ] || $(MAKE) DESTDIR= -C ioemu clean
-[ ! -d xenstore ] || $(MAKE) DESTDIR= -C xenstore clean
rm -fr newlib-$(XEN_TARGET_ARCH)
rm -fr zlib-$(XEN_TARGET_ARCH) pciutils-$(XEN_TARGET_ARCH)
rm -fr libs-$(XEN_TARGET_ARCH)
- rm -fr libxc-$(XEN_TARGET_ARCH) ioemu xenstore
+ rm -fr ioemu xenstore
rm -fr gmp-$(XEN_TARGET_ARCH)
rm -fr polarssl-$(XEN_TARGET_ARCH)
rm -fr openssl-$(XEN_TARGET_ARCH)
DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/toollog/include
DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/ctrl/include
-DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc/include
+DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libs/guest/include
DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/include -I.
DEF_CPPFLAGS += -I../grub-upstream/stage1
DEF_CPPFLAGS += -I../grub-upstream/stage2
FOREIGNMEMORY_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/foreignmemory
DEVICEMODEL_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/devicemodel
CTRL_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/ctrl
-GUEST_PATH = $(XEN_ROOT)/stubdom/libxc-$(MINIOS_TARGET_ARCH)
+GUEST_PATH = $(XEN_ROOT)/stubdom/libs-$(MINIOS_TARGET_ARCH)/guest
SUBDIRS-y :=
SUBDIRS-y += libs
-SUBDIRS-y += libxc
SUBDIRS-y += flask
SUBDIRS-y += fuzz
SUBDIRS-y += xenstore
SUBDIRS-$(OCAML_TOOLS) += ocaml
ifeq ($(CONFIG_RUMP),y)
-SUBDIRS-y := libs libxc xenstore
+SUBDIRS-y := libs xenstore
endif
# For the sake of linking, set the sys-root
include $(XEN_ROOT)/tools/libs/uselibs.mk
-XEN_libxenguest = $(XEN_ROOT)/tools/libxc
XEN_libxenlight = $(XEN_ROOT)/tools/libxl
# Currently libxlutil lives in the same directory as libxenlight
XEN_libxlutil = $(XEN_libxenlight)
# code which compiles against libxenctrl get __XEN_TOOLS__ and
# therefore sees the unstable hypercall interfaces.
CFLAGS_libxenctrl += $(CFLAGS_libxentoollog) $(CFLAGS_libxenforeignmemory) $(CFLAGS_libxendevicemodel) -D__XEN_TOOLS__
-
-CFLAGS_libxenguest = -I$(XEN_libxenguest)/include $(CFLAGS_libxenevtchn) $(CFLAGS_libxenforeignmemory) $(CFLAGS_xeninclude)
-SHDEPS_libxenguest = $(SHLIB_libxenevtchn) $(SHLIB_libxenctrl)
-LDLIBS_libxenguest = $(SHDEPS_libxenguest) $(XEN_libxenguest)/libxenguest$(libextension)
-SHLIB_libxenguest = $(SHDEPS_libxenguest) -Wl,-rpath-link=$(XEN_libxenguest)
+CFLAGS_libxenguest += $(CFLAGS_libxenevtchn) $(CFLAGS_libxenforeignmemory)
CFLAGS_libxenstore = -I$(XEN_libxenstore)/include $(CFLAGS_xeninclude)
SHDEPS_libxenstore = $(SHLIB_libxentoolcore) $(SHLIB_libxenctrl)
SUBDIRS-y += foreignmemory
SUBDIRS-y += devicemodel
SUBDIRS-y += ctrl
+SUBDIRS-y += guest
SUBDIRS-y += hypfs
ifeq ($(CONFIG_RUMP),y)
--- /dev/null
+Note that the only valid version of the LGPL as far as the files in
+this directory (and its subdirectories) are concerned is _this_
+particular version of the license (i.e., *only* v2.1, not v2.2 or v3.x
+or whatever), unless explicitly otherwise stated.
+
+Where clause 3 is invoked in order to relicense under the GPL then
+this shall be considered to be GPL v2 only for files which have
+specified LGPL v2.1 only.
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+\f
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+\f
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+\f
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+\f
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+\f
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+\f
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+\f
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+\f
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
--- /dev/null
+XEN_ROOT = $(CURDIR)/../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+ifeq ($(CONFIG_LIBXC_MINIOS),y)
+# Save/restore of a domain is currently incompatible with a stubdom environment
+override CONFIG_MIGRATE := n
+endif
+
+LINK_FILES := xc_private.h xc_core.h xc_core_x86.h xc_core_arm.h xc_bitops.h
+
+$(LINK_FILES):
+ ln -sf $(XEN_ROOT)/tools/libs/ctrl/$(notdir $@) $@
+
+SRCS-y += xg_private.c
+SRCS-y += xg_domain.c
+SRCS-y += xg_suspend.c
+ifeq ($(CONFIG_MIGRATE),y)
+SRCS-y += xg_sr_common.c
+SRCS-$(CONFIG_X86) += xg_sr_common_x86.c
+SRCS-$(CONFIG_X86) += xg_sr_common_x86_pv.c
+SRCS-$(CONFIG_X86) += xg_sr_restore_x86_pv.c
+SRCS-$(CONFIG_X86) += xg_sr_restore_x86_hvm.c
+SRCS-$(CONFIG_X86) += xg_sr_save_x86_pv.c
+SRCS-$(CONFIG_X86) += xg_sr_save_x86_hvm.c
+SRCS-y += xg_sr_restore.c
+SRCS-y += xg_sr_save.c
+SRCS-y += xg_offline_page.c
+else
+SRCS-y += xg_nomigrate.c
+endif
+
+vpath %.c ../../../xen/common/libelf
+CFLAGS += -I../../../xen/common/libelf
+
+ELF_SRCS-y += libelf-tools.c libelf-loader.c
+ELF_SRCS-y += libelf-dominfo.c
+
+SRCS-y += $(ELF_SRCS-y)
+
+$(patsubst %.c,%.o,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign
+$(patsubst %.c,%.opic,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign
+
+ifeq ($(CONFIG_X86),y) # Add libx86 to the build
+vpath %.c ../../../xen/lib/x86
+
+SRCS-y += cpuid.c msr.c
+endif
+
+# new domain builder
+SRCS-y += xg_dom_core.c
+SRCS-y += xg_dom_boot.c
+SRCS-y += xg_dom_elfloader.c
+SRCS-$(CONFIG_X86) += xg_dom_bzimageloader.c
+SRCS-$(CONFIG_X86) += xg_dom_decompress_lz4.c
+SRCS-$(CONFIG_X86) += xg_dom_hvmloader.c
+SRCS-$(CONFIG_ARM) += xg_dom_armzimageloader.c
+SRCS-y += xg_dom_binloader.c
+SRCS-y += xg_dom_compat_linux.c
+
+SRCS-$(CONFIG_X86) += xg_dom_x86.c
+SRCS-$(CONFIG_X86) += xg_cpuid_x86.c
+SRCS-$(CONFIG_ARM) += xg_dom_arm.c
+
+ifeq ($(CONFIG_LIBXC_MINIOS),y)
+SRCS-y += xg_dom_decompress_unsafe.c
+SRCS-y += xg_dom_decompress_unsafe_bzip2.c
+SRCS-y += xg_dom_decompress_unsafe_lzma.c
+SRCS-y += xg_dom_decompress_unsafe_lzo1x.c
+SRCS-y += xg_dom_decompress_unsafe_xz.c
+endif
+
+-include $(XEN_TARGET_ARCH)/Makefile
+
+CFLAGS += -Werror -Wmissing-prototypes
+CFLAGS += -I. -I./include $(CFLAGS_xeninclude)
+CFLAGS += -D__XEN_TOOLS__
+CFLAGS += -include $(XEN_ROOT)/tools/config.h
+
+# Needed for posix_fadvise64() in xc_linux.c
+CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE
+
+CFLAGS += $(PTHREAD_CFLAGS)
+CFLAGS += $(CFLAGS_libxentoollog)
+CFLAGS += $(CFLAGS_libxenevtchn)
+CFLAGS += $(CFLAGS_libxendevicemodel)
+
+# libxenguest includes xc_private.h, so needs this despite not using
+# this functionality directly.
+CFLAGS += $(CFLAGS_libxencall) $(CFLAGS_libxenforeignmemory)
+
+ifeq ($(CONFIG_MiniOS),y)
+zlib-options =
+else
+zlib-options = $(ZLIB)
+endif
+
+xg_dom_bzimageloader.o: CFLAGS += $(filter -D%,$(zlib-options))
+xg_dom_bzimageloader.opic: CFLAGS += $(filter -D%,$(zlib-options))
+
+LIBHEADER := xenguest.h
+
+NO_HEADERS_CHK := y
+
+include $(XEN_ROOT)/tools/libs/libs.mk
+
+libxenguest.so.$(MAJOR).$(MINOR): COMPRESSION_LIBS = $(filter -l%,$(zlib-options))
+libxenguest.so.$(MAJOR).$(MINOR): APPEND_LDFLAGS += $(COMPRESSION_LIBS) -lz
+
+genpath-target = $(call buildmakevars2header,_paths.h)
+$(eval $(genpath-target))
+
+xc_private.h: _paths.h
+
+$(LIB_OBJS) $(PIC_OBJS): $(LINK_FILES)
+
+$(PKG_CONFIG_LOCAL): PKG_CONFIG_INCDIR = $(XEN_libxenctrl)/include
+$(PKG_CONFIG_LOCAL): PKG_CONFIG_CFLAGS_LOCAL = $(CFLAGS_xeninclude)
+
+.PHONY: cleanlocal
+cleanlocal:
+ rm -f libxenguest.map
--- /dev/null
+/******************************************************************************
+ * xenguest.h
+ *
+ * A library for guest domain management in Xen.
+ *
+ * Copyright (c) 2003-2004, K A Fraser.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XENGUEST_H
+#define XENGUEST_H
+
+#include <xenctrl_dom.h>
+
+#define XC_NUMA_NO_NODE (~0U)
+
+#define XCFLAGS_LIVE (1 << 0)
+#define XCFLAGS_DEBUG (1 << 1)
+
+#define X86_64_B_SIZE 64
+#define X86_32_B_SIZE 32
+
+/*
+ * Users not using xc_suspend_* / xc_await_suspend may not want to
+ * include the full libxenevtchn API here.
+ */
+struct xenevtchn_handle;
+
+/* For save's precopy_policy(). */
+struct precopy_stats
+{
+ unsigned int iteration;
+ unsigned int total_written;
+ long dirty_count; /* -1 if unknown */
+};
+
+/*
+ * A precopy_policy callback may not be running in the same address
+ * space as libxc, and so precopy_stats is passed by value.
+ */
+typedef int (*precopy_policy_t)(struct precopy_stats, void *);
+
+/* callbacks provided by xc_domain_save */
+struct save_callbacks {
+ /*
+     * Called after expiration of the checkpoint interval,
+ * to suspend the guest.
+ */
+ int (*suspend)(void *data);
+
+ /*
+ * Called before and after every batch of page data sent during
+ * the precopy phase of a live migration to ask the caller what
+ * to do next based on the current state of the precopy migration.
+ *
+ * Should return one of the values listed below:
+ */
+#define XGS_POLICY_ABORT (-1) /* Abandon the migration entirely
+ * and tidy up. */
+#define XGS_POLICY_CONTINUE_PRECOPY 0 /* Remain in the precopy phase. */
+#define XGS_POLICY_STOP_AND_COPY 1 /* Immediately suspend and transmit the
+ * remaining dirty pages. */
+ precopy_policy_t precopy_policy;
+
+ /*
+ * Called after the guest's dirty pages have been
+ * copied into an output buffer.
+     * The callback function resumes the guest and the device model,
+     * then returns to xc_domain_save.
+ * xc_domain_save then flushes the output buffer, while the
+ * guest continues to run.
+ */
+ int (*postcopy)(void *data);
+
+ /*
+ * Called after the memory checkpoint has been flushed
+ * out into the network. Typical actions performed in this
+ * callback include:
+ * (a) send the saved device model state (for HVM guests),
+     * (b) wait for the checkpoint ack,
+     * (c) release the network output buffer pertaining to the acked checkpoint,
+     * (d) sleep for the checkpoint interval.
+ *
+ * returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint
+ */
+ int (*checkpoint)(void *data);
+
+ /*
+ * Called after the checkpoint callback.
+ *
+ * returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint
+ */
+ int (*wait_checkpoint)(void *data);
+
+    /* Enable or disable qemu-dm logging of dirty pages to Xen. */
+ int (*switch_qemu_logdirty)(uint32_t domid, unsigned enable, void *data); /* HVM only */
+
+ /* to be provided as the last argument to each callback function */
+ void *data;
+};
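+
+/*
+ * Illustrative sketch only (not part of the API; the function name is
+ * hypothetical): a minimal precopy_policy which stops after five
+ * iterations, or once fewer than 50 pages remain dirty.
+ *
+ * static int simple_precopy_policy(struct precopy_stats stats, void *user)
+ * {
+ *     if ( stats.iteration >= 5 ||
+ *          (stats.dirty_count >= 0 && stats.dirty_count < 50) )
+ *         return XGS_POLICY_STOP_AND_COPY;
+ *
+ *     return XGS_POLICY_CONTINUE_PRECOPY;
+ * }
+ */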
+
+/* Type of stream. Plain, or using a continuous replication protocol? */
+typedef enum {
+ XC_STREAM_PLAIN,
+ XC_STREAM_REMUS,
+ XC_STREAM_COLO,
+} xc_stream_type_t;
+
+/**
+ * This function will save a running domain.
+ *
+ * @param xch a handle to an open hypervisor interface
+ * @param io_fd the file descriptor to save a domain to
+ * @param dom the id of the domain
+ * @param flags XCFLAGS_xxx
+ * @param callbacks hooks called at various points during the save
+ * @param stream_type XC_STREAM_PLAIN if the far end of the stream
+ * doesn't use checkpointing
+ * @param recv_fd Only used for XC_STREAM_COLO. Contains backchannel from
+ * the destination side.
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+ uint32_t flags, struct save_callbacks *callbacks,
+ xc_stream_type_t stream_type, int recv_fd);
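+
+/*
+ * Illustrative call sketch (variable names are hypothetical): save a
+ * domain live over fd as a plain stream, with no COLO backchannel:
+ *
+ *     struct save_callbacks cb = {
+ *         .suspend = my_suspend,
+ *         .precopy_policy = simple_precopy_policy,
+ *         .switch_qemu_logdirty = my_logdirty,
+ *         .data = my_state,
+ *     };
+ *     rc = xc_domain_save(xch, fd, domid, XCFLAGS_LIVE, &cb,
+ *                         XC_STREAM_PLAIN, -1);
+ */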
+
+/* callbacks provided by xc_domain_restore */
+struct restore_callbacks {
+ /*
+ * Called once the STATIC_DATA_END record has been received/inferred.
+ *
+     * For compatibility with older streams, provides a list of static data
+     * expected to be found in the stream but missing.  A higher level
+     * toolstack is responsible for providing any necessary compatibility.
+ */
+#define XGR_SDD_MISSING_CPUID (1 << 0)
+#define XGR_SDD_MISSING_MSR (1 << 1)
+ int (*static_data_done)(unsigned int missing, void *data);
+
+ /* Called after a new checkpoint to suspend the guest. */
+ int (*suspend)(void *data);
+
+ /*
+     * Called after the secondary VM is ready to resume.
+     * The callback function resumes the guest and the device model,
+     * then returns to xc_domain_restore.
+ */
+ int (*postcopy)(void *data);
+
+ /*
+     * Called when a checkpoint record has been found in the stream.
+     * Returns one of the values below:
+ */
+#define XGR_CHECKPOINT_ERROR 0 /* Terminate processing */
+#define XGR_CHECKPOINT_SUCCESS 1 /* Continue reading more data from the stream */
+#define XGR_CHECKPOINT_FAILOVER 2 /* Failover and resume VM */
+ int (*checkpoint)(void *data);
+
+ /*
+ * Called after the checkpoint callback.
+ *
+ * returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint
+ */
+ int (*wait_checkpoint)(void *data);
+
+ /*
+     * Callback to send the store gfn and console gfn to xl,
+     * if we want to resume the VM before xc_domain_restore()
+ * exits.
+ */
+ void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+ void *data);
+
+ /* to be provided as the last argument to each callback function */
+ void *data;
+};
+
+/**
+ * This function will restore a saved domain.
+ *
+ * Domain is restored in a suspended state ready to be unpaused.
+ *
+ * @param xch a handle to an open hypervisor interface
+ * @param io_fd the file descriptor to restore a domain from
+ * @param dom the id of the domain
+ * @param store_evtchn the xenstore event channel for this domain to use
+ * @param store_mfn filled with the gfn of the store page
+ * @param store_domid the backend domain for xenstore
+ * @param console_evtchn the console event channel for this domain to use
+ * @param console_mfn filled with the gfn of the console page
+ * @param console_domid the backend domain for xenconsole
+ * @param stream_type XC_STREAM_PLAIN if the far end of the stream
+ *                    doesn't use checkpointing
+ * @param callbacks non-NULL to receive a callback to restore toolstack
+ * specific data
+ * @param send_back_fd Only used for XC_STREAM_COLO. Contains backchannel to
+ * the source side.
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
+ unsigned int store_evtchn, unsigned long *store_mfn,
+ uint32_t store_domid, unsigned int console_evtchn,
+ unsigned long *console_mfn, uint32_t console_domid,
+ xc_stream_type_t stream_type,
+ struct restore_callbacks *callbacks, int send_back_fd);
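+
+/*
+ * Illustrative call sketch (variable names are hypothetical): restore
+ * a plain stream, with no COLO backchannel:
+ *
+ *     struct restore_callbacks cb = { .data = my_state };
+ *     rc = xc_domain_restore(xch, fd, domid, store_evtchn, &store_mfn,
+ *                            store_domid, console_evtchn, &console_mfn,
+ *                            console_domid, XC_STREAM_PLAIN, &cb, -1);
+ */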
+
+/**
+ * This function will create a domain for a paravirtualized Linux
+ * guest, using file names pointing to the kernel and ramdisk.
+ *
+ * @parm xch a handle to an open hypervisor interface
+ * @parm domid the id of the domain
+ * @parm mem_mb memory size in megabytes
+ * @parm image_name name of the kernel image file
+ * @parm ramdisk_name name of the ramdisk image file
+ * @parm cmdline command line string
+ * @parm features the guest features to enable
+ * @parm flags domain creation flags
+ * @parm store_evtchn the store event channel for this domain to use
+ * @parm store_mfn returned with the mfn of the store page
+ * @parm console_evtchn the console event channel for this domain to use
+ * @parm console_mfn returned with the mfn of the console page
+ * @return 0 on success, -1 on failure
+ */
+int xc_linux_build(xc_interface *xch,
+ uint32_t domid,
+ unsigned int mem_mb,
+ const char *image_name,
+ const char *ramdisk_name,
+ const char *cmdline,
+ const char *features,
+ unsigned long flags,
+ unsigned int store_evtchn,
+ unsigned long *store_mfn,
+ unsigned int console_evtchn,
+ unsigned long *console_mfn);
+
+/*
+ * Sets *lockfd to -1.
+ * Deallocates everything, even on error.
+ */
+int xc_suspend_evtchn_release(xc_interface *xch,
+ struct xenevtchn_handle *xce,
+ uint32_t domid, int suspend_evtchn, int *lockfd);
+
+/**
+ * This function eats the initial notification.
+ * xce must not be used for anything else.
+ * See xc_suspend_evtchn_init_sane regarding lockfd.
+ */
+int xc_suspend_evtchn_init_exclusive(xc_interface *xch,
+ struct xenevtchn_handle *xce,
+ uint32_t domid, int port, int *lockfd);
+
+/* xce must not be used for anything else */
+int xc_await_suspend(xc_interface *xch, struct xenevtchn_handle *xce,
+ int suspend_evtchn);
+
+/**
+ * The port will be signaled immediately after this call.
+ * The caller should check the domain status and look for the next event.
+ * On success, *lockfd will be set to >= 0 and must be preserved
+ * and fed to xc_suspend_evtchn_release.  (On error *lockfd is
+ * undefined and xc_suspend_evtchn_release must not be called.)
+ */
+int xc_suspend_evtchn_init_sane(xc_interface *xch,
+ struct xenevtchn_handle *xce,
+ uint32_t domid, int port, int *lockfd);
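+
+/*
+ * Illustrative lifecycle sketch (hypothetical names; assumes
+ * xc_suspend_evtchn_init_sane() returns the bound local port on
+ * success):
+ *
+ *     int lockfd;
+ *     int port = xc_suspend_evtchn_init_sane(xch, xce, domid,
+ *                                            suspend_port, &lockfd);
+ *     if ( port >= 0 )
+ *     {
+ *         ... trigger the guest suspend ...
+ *         xc_await_suspend(xch, xce, port);
+ *         xc_suspend_evtchn_release(xch, xce, domid, port, &lockfd);
+ *     }
+ */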
+
+int xc_mark_page_online(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status);
+
+int xc_mark_page_offline(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status);
+
+int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status);
+
+int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn);
+
+
+/**
+ * Memory-related information, such as PFN types, the P2M table,
+ * the guest word width and the guest page table levels.
+ */
+struct xc_domain_meminfo {
+ unsigned int pt_levels;
+ unsigned int guest_width;
+ xen_pfn_t *pfn_type;
+ xen_pfn_t *p2m_table;
+ unsigned long p2m_size;
+};
+
+int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid,
+ struct xc_domain_meminfo *minfo);
+
+int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *mem);
+
+/**
+ * This function maps the M2P table.
+ * @parm xch a handle to an open hypervisor interface
+ * @parm max_mfn the maximum MFN to map up to
+ * @parm prot the protection flags to map with, such as read/write
+ * @parm mfn0 returns the first mfn, can be NULL
+ * @return mapped m2p table on success, NULL on failure
+ */
+xen_pfn_t *xc_map_m2p(xc_interface *xch,
+ unsigned long max_mfn,
+ int prot,
+ unsigned long *mfn0);
+#endif /* XENGUEST_H */
--- /dev/null
+/******************************************************************************
+ * xc_cpuid_x86.c
+ *
+ * Compute cpuid of a domain.
+ *
+ * Copyright (c) 2008, Citrix Systems, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <limits.h>
+#include "xc_private.h"
+#include "xc_bitops.h"
+#include <xen/hvm/params.h>
+#include <xen-tools/libs.h>
+
+enum {
+#define XEN_CPUFEATURE(name, value) X86_FEATURE_##name = value,
+#include <xen/arch-x86/cpufeatureset.h>
+};
+
+#include <xen/asm/x86-vendors.h>
+
+#include <xen/lib/x86/cpu-policy.h>
+
+#define bitmaskof(idx) (1u << ((idx) & 31))
+#define featureword_of(idx) ((idx) >> 5)
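+
+/*
+ * For a feature bit index idx as enumerated above, the feature lives in
+ * featureset word featureword_of(idx), at the bit selected by
+ * bitmaskof(idx); i.e. a feature is present iff
+ * (fs[featureword_of(idx)] & bitmaskof(idx)) is non-zero.
+ */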
+
+int xc_get_cpu_levelling_caps(xc_interface *xch, uint32_t *caps)
+{
+ DECLARE_SYSCTL;
+ int ret;
+
+ sysctl.cmd = XEN_SYSCTL_get_cpu_levelling_caps;
+ ret = do_sysctl(xch, &sysctl);
+
+ if ( !ret )
+ *caps = sysctl.u.cpu_levelling_caps.caps;
+
+ return ret;
+}
+
+int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
+ uint32_t *nr_features, uint32_t *featureset)
+{
+ DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(featureset,
+ *nr_features * sizeof(*featureset),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ int ret;
+
+ if ( xc_hypercall_bounce_pre(xch, featureset) )
+ return -1;
+
+ sysctl.cmd = XEN_SYSCTL_get_cpu_featureset;
+ sysctl.u.cpu_featureset.index = index;
+ sysctl.u.cpu_featureset.nr_features = *nr_features;
+ set_xen_guest_handle(sysctl.u.cpu_featureset.features, featureset);
+
+ ret = do_sysctl(xch, &sysctl);
+
+ xc_hypercall_bounce_post(xch, featureset);
+
+ if ( !ret )
+ *nr_features = sysctl.u.cpu_featureset.nr_features;
+
+ return ret;
+}
+
+uint32_t xc_get_cpu_featureset_size(void)
+{
+ return FEATURESET_NR_ENTRIES;
+}
+
+const uint32_t *xc_get_static_cpu_featuremask(
+ enum xc_static_cpu_featuremask mask)
+{
+ static const uint32_t masks[][FEATURESET_NR_ENTRIES] = {
+#define MASK(x) [XC_FEATUREMASK_ ## x] = INIT_ ## x ## _FEATURES
+
+ MASK(KNOWN),
+ MASK(SPECIAL),
+ MASK(PV_MAX),
+ MASK(PV_DEF),
+ MASK(HVM_SHADOW_MAX),
+ MASK(HVM_SHADOW_DEF),
+ MASK(HVM_HAP_MAX),
+ MASK(HVM_HAP_DEF),
+
+#undef MASK
+ };
+
+ if ( (unsigned int)mask >= ARRAY_SIZE(masks) )
+ return NULL;
+
+ return masks[mask];
+}
+
+int xc_get_cpu_policy_size(xc_interface *xch, uint32_t *nr_leaves,
+ uint32_t *nr_msrs)
+{
+ struct xen_sysctl sysctl = {};
+ int ret;
+
+ sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
+
+ ret = do_sysctl(xch, &sysctl);
+
+ if ( !ret )
+ {
+ *nr_leaves = sysctl.u.cpu_policy.nr_leaves;
+ *nr_msrs = sysctl.u.cpu_policy.nr_msrs;
+ }
+
+ return ret;
+}
+
+int xc_get_system_cpu_policy(xc_interface *xch, uint32_t index,
+ uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves,
+ uint32_t *nr_msrs, xen_msr_entry_t *msrs)
+{
+ struct xen_sysctl sysctl = {};
+ DECLARE_HYPERCALL_BOUNCE(leaves,
+ *nr_leaves * sizeof(*leaves),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ DECLARE_HYPERCALL_BOUNCE(msrs,
+ *nr_msrs * sizeof(*msrs),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ int ret;
+
+ if ( xc_hypercall_bounce_pre(xch, leaves) ||
+ xc_hypercall_bounce_pre(xch, msrs) )
+ return -1;
+
+ sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
+ sysctl.u.cpu_policy.index = index;
+ sysctl.u.cpu_policy.nr_leaves = *nr_leaves;
+ set_xen_guest_handle(sysctl.u.cpu_policy.cpuid_policy, leaves);
+ sysctl.u.cpu_policy.nr_msrs = *nr_msrs;
+ set_xen_guest_handle(sysctl.u.cpu_policy.msr_policy, msrs);
+
+ ret = do_sysctl(xch, &sysctl);
+
+ xc_hypercall_bounce_post(xch, leaves);
+ xc_hypercall_bounce_post(xch, msrs);
+
+ if ( !ret )
+ {
+ *nr_leaves = sysctl.u.cpu_policy.nr_leaves;
+ *nr_msrs = sysctl.u.cpu_policy.nr_msrs;
+ }
+
+ return ret;
+}
+
+int xc_get_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+ uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves,
+ uint32_t *nr_msrs, xen_msr_entry_t *msrs)
+{
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BOUNCE(leaves,
+ *nr_leaves * sizeof(*leaves),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ DECLARE_HYPERCALL_BOUNCE(msrs,
+ *nr_msrs * sizeof(*msrs),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+ int ret;
+
+ if ( xc_hypercall_bounce_pre(xch, leaves) ||
+ xc_hypercall_bounce_pre(xch, msrs) )
+ return -1;
+
+ domctl.cmd = XEN_DOMCTL_get_cpu_policy;
+ domctl.domain = domid;
+ domctl.u.cpu_policy.nr_leaves = *nr_leaves;
+ set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
+ domctl.u.cpu_policy.nr_msrs = *nr_msrs;
+ set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
+
+ ret = do_domctl(xch, &domctl);
+
+ xc_hypercall_bounce_post(xch, leaves);
+ xc_hypercall_bounce_post(xch, msrs);
+
+ if ( !ret )
+ {
+ *nr_leaves = domctl.u.cpu_policy.nr_leaves;
+ *nr_msrs = domctl.u.cpu_policy.nr_msrs;
+ }
+
+ return ret;
+}
+
+int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+ uint32_t nr_leaves, xen_cpuid_leaf_t *leaves,
+ uint32_t nr_msrs, xen_msr_entry_t *msrs,
+ uint32_t *err_leaf_p, uint32_t *err_subleaf_p,
+ uint32_t *err_msr_p)
+{
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BOUNCE(leaves,
+ nr_leaves * sizeof(*leaves),
+ XC_HYPERCALL_BUFFER_BOUNCE_IN);
+ DECLARE_HYPERCALL_BOUNCE(msrs,
+ nr_msrs * sizeof(*msrs),
+ XC_HYPERCALL_BUFFER_BOUNCE_IN);
+ int ret;
+
+ if ( err_leaf_p )
+ *err_leaf_p = -1;
+ if ( err_subleaf_p )
+ *err_subleaf_p = -1;
+ if ( err_msr_p )
+ *err_msr_p = -1;
+
+ if ( xc_hypercall_bounce_pre(xch, leaves) )
+ return -1;
+
+ if ( xc_hypercall_bounce_pre(xch, msrs) )
+ return -1;
+
+ domctl.cmd = XEN_DOMCTL_set_cpu_policy;
+ domctl.domain = domid;
+ domctl.u.cpu_policy.nr_leaves = nr_leaves;
+ set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
+ domctl.u.cpu_policy.nr_msrs = nr_msrs;
+ set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
+ domctl.u.cpu_policy.err_leaf = -1;
+ domctl.u.cpu_policy.err_subleaf = -1;
+ domctl.u.cpu_policy.err_msr = -1;
+
+ ret = do_domctl(xch, &domctl);
+
+ xc_hypercall_bounce_post(xch, leaves);
+ xc_hypercall_bounce_post(xch, msrs);
+
+ if ( err_leaf_p )
+ *err_leaf_p = domctl.u.cpu_policy.err_leaf;
+ if ( err_subleaf_p )
+ *err_subleaf_p = domctl.u.cpu_policy.err_subleaf;
+ if ( err_msr_p )
+ *err_msr_p = domctl.u.cpu_policy.err_msr;
+
+ return ret;
+}
+
+static int compare_leaves(const void *l, const void *r)
+{
+ const xen_cpuid_leaf_t *lhs = l;
+ const xen_cpuid_leaf_t *rhs = r;
+
+ if ( lhs->leaf != rhs->leaf )
+ return lhs->leaf < rhs->leaf ? -1 : 1;
+
+ if ( lhs->subleaf != rhs->subleaf )
+ return lhs->subleaf < rhs->subleaf ? -1 : 1;
+
+ return 0;
+}
+
+static xen_cpuid_leaf_t *find_leaf(
+ xen_cpuid_leaf_t *leaves, unsigned int nr_leaves,
+ const struct xc_xend_cpuid *xend)
+{
+ const xen_cpuid_leaf_t key = { xend->leaf, xend->subleaf };
+
+ return bsearch(&key, leaves, nr_leaves, sizeof(*leaves), compare_leaves);
+}
+
+static int xc_cpuid_xend_policy(
+ xc_interface *xch, uint32_t domid, const struct xc_xend_cpuid *xend)
+{
+ int rc;
+ xc_dominfo_t di;
+ unsigned int nr_leaves, nr_msrs;
+ uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
+ /*
+ * Three full policies. The host, domain max, and domain current for the
+ * domain type.
+ */
+ xen_cpuid_leaf_t *host = NULL, *max = NULL, *cur = NULL;
+ unsigned int nr_host, nr_max, nr_cur;
+
+ if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
+ di.domid != domid )
+ {
+ ERROR("Failed to obtain d%d info", domid);
+ rc = -ESRCH;
+ goto fail;
+ }
+
+ rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
+ if ( rc )
+ {
+ PERROR("Failed to obtain policy info size");
+ rc = -errno;
+ goto fail;
+ }
+
+ rc = -ENOMEM;
+ if ( (host = calloc(nr_leaves, sizeof(*host))) == NULL ||
+ (max = calloc(nr_leaves, sizeof(*max))) == NULL ||
+ (cur = calloc(nr_leaves, sizeof(*cur))) == NULL )
+ {
+ ERROR("Unable to allocate memory for %u CPUID leaves", nr_leaves);
+ goto fail;
+ }
+
+ /* Get the domain's current policy. */
+ nr_msrs = 0;
+ nr_cur = nr_leaves;
+ rc = xc_get_domain_cpu_policy(xch, domid, &nr_cur, cur, &nr_msrs, NULL);
+ if ( rc )
+ {
+ PERROR("Failed to obtain d%d current policy", domid);
+ rc = -errno;
+ goto fail;
+ }
+
+ /* Get the domain's max policy. */
+ nr_msrs = 0;
+ nr_max = nr_leaves;
+ rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_max
+ : XEN_SYSCTL_cpu_policy_pv_max,
+ &nr_max, max, &nr_msrs, NULL);
+ if ( rc )
+ {
+ PERROR("Failed to obtain %s max policy", di.hvm ? "hvm" : "pv");
+ rc = -errno;
+ goto fail;
+ }
+
+ /* Get the host policy. */
+ nr_msrs = 0;
+ nr_host = nr_leaves;
+ rc = xc_get_system_cpu_policy(xch, XEN_SYSCTL_cpu_policy_host,
+ &nr_host, host, &nr_msrs, NULL);
+ if ( rc )
+ {
+ PERROR("Failed to obtain host policy");
+ rc = -errno;
+ goto fail;
+ }
+
+ rc = -EINVAL;
+ for ( ; xend->leaf != XEN_CPUID_INPUT_UNUSED; ++xend )
+ {
+ xen_cpuid_leaf_t *cur_leaf = find_leaf(cur, nr_cur, xend);
+ const xen_cpuid_leaf_t *max_leaf = find_leaf(max, nr_max, xend);
+ const xen_cpuid_leaf_t *host_leaf = find_leaf(host, nr_host, xend);
+
+ if ( cur_leaf == NULL || max_leaf == NULL || host_leaf == NULL )
+ {
+ ERROR("Missing leaf %#x, subleaf %#x", xend->leaf, xend->subleaf);
+ goto fail;
+ }
+
+ for ( unsigned int i = 0; i < ARRAY_SIZE(xend->policy); i++ )
+ {
+ uint32_t *cur_reg = &cur_leaf->a + i;
+ const uint32_t *max_reg = &max_leaf->a + i;
+ const uint32_t *host_reg = &host_leaf->a + i;
+
+ if ( xend->policy[i] == NULL )
+ continue;
+
+ for ( unsigned int j = 0; j < 32; j++ )
+ {
+ bool val;
+
+ if ( xend->policy[i][j] == '1' )
+ val = true;
+ else if ( xend->policy[i][j] == '0' )
+ val = false;
+ else if ( xend->policy[i][j] == 'x' )
+ val = test_bit(31 - j, max_reg);
+ else if ( xend->policy[i][j] == 'k' ||
+ xend->policy[i][j] == 's' )
+ val = test_bit(31 - j, host_reg);
+ else
+ {
+ ERROR("Bad character '%c' in policy[%d] string '%s'",
+ xend->policy[i][j], i, xend->policy[i]);
+ goto fail;
+ }
+
+ clear_bit(31 - j, cur_reg);
+ if ( val )
+ set_bit(31 - j, cur_reg);
+ }
+ }
+ }
+
+    /* Feed the transformed current policy back up to Xen. */
+ rc = xc_set_domain_cpu_policy(xch, domid, nr_cur, cur, 0, NULL,
+ &err_leaf, &err_subleaf, &err_msr);
+ if ( rc )
+ {
+ PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
+ domid, err_leaf, err_subleaf, err_msr);
+ rc = -errno;
+ goto fail;
+ }
+
+ /* Success! */
+
+ fail:
+ free(cur);
+ free(max);
+ free(host);
+
+ return rc;
+}
+
+int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid, bool restore,
+ const uint32_t *featureset, unsigned int nr_features,
+ bool pae,
+ const struct xc_xend_cpuid *xend)
+{
+ int rc;
+ xc_dominfo_t di;
+ unsigned int i, nr_leaves, nr_msrs;
+ xen_cpuid_leaf_t *leaves = NULL;
+ struct cpuid_policy *p = NULL;
+ uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
+ uint32_t host_featureset[FEATURESET_NR_ENTRIES] = {};
+ uint32_t len = ARRAY_SIZE(host_featureset);
+
+ if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
+ di.domid != domid )
+ {
+ ERROR("Failed to obtain d%d info", domid);
+ rc = -ESRCH;
+ goto out;
+ }
+
+ rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
+ if ( rc )
+ {
+ PERROR("Failed to obtain policy info size");
+ rc = -errno;
+ goto out;
+ }
+
+ rc = -ENOMEM;
+ if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
+ (p = calloc(1, sizeof(*p))) == NULL )
+ goto out;
+
+ /* Get the host policy. */
+ rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
+ &len, host_featureset);
+ if ( rc )
+ {
+ /* Tolerate "buffer too small", as we've got the bits we need. */
+ if ( errno == ENOBUFS )
+ rc = 0;
+ else
+ {
+ PERROR("Failed to obtain host featureset");
+ rc = -errno;
+ goto out;
+ }
+ }
+
+ /* Get the domain's default policy. */
+ nr_msrs = 0;
+ rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_default
+ : XEN_SYSCTL_cpu_policy_pv_default,
+ &nr_leaves, leaves, &nr_msrs, NULL);
+ if ( rc )
+ {
+ PERROR("Failed to obtain %s default policy", di.hvm ? "hvm" : "pv");
+ rc = -errno;
+ goto out;
+ }
+
+ rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
+ &err_leaf, &err_subleaf);
+ if ( rc )
+ {
+ ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
+ err_leaf, err_subleaf, -rc, strerror(-rc));
+ goto out;
+ }
+
+ /*
+     * Account for features which have been disabled by default since Xen 4.13,
+     * so migrated-in VMs don't risk seeing features disappear.
+ */
+ if ( restore )
+ {
+ p->basic.rdrand = test_bit(X86_FEATURE_RDRAND, host_featureset);
+
+ if ( di.hvm )
+ {
+ p->feat.mpx = test_bit(X86_FEATURE_MPX, host_featureset);
+ }
+ }
+
+ if ( featureset )
+ {
+ uint32_t disabled_features[FEATURESET_NR_ENTRIES],
+ feat[FEATURESET_NR_ENTRIES] = {};
+ static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
+ unsigned int i, b;
+
+ /*
+ * The user supplied featureset may be shorter or longer than
+ * FEATURESET_NR_ENTRIES. Shorter is fine, and we will zero-extend.
+         * Longer is fine, so long as it is only padded with zeros.
+ */
+ unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
+
+ /* Check for truncated set bits. */
+ rc = -EOPNOTSUPP;
+ for ( i = user_len; i < nr_features; ++i )
+ if ( featureset[i] != 0 )
+ goto out;
+
+ memcpy(feat, featureset, sizeof(*featureset) * user_len);
+
+ /* Disable deep dependencies of disabled features. */
+ for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+ disabled_features[i] = ~feat[i] & deep_features[i];
+
+ for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
+ {
+ const uint32_t *dfs;
+
+ if ( !test_bit(b, disabled_features) ||
+ !(dfs = x86_cpuid_lookup_deep_deps(b)) )
+ continue;
+
+ for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
+ {
+ feat[i] &= ~dfs[i];
+ disabled_features[i] &= ~dfs[i];
+ }
+ }
+
+ cpuid_featureset_to_policy(feat, p);
+ }
+ else
+ {
+ if ( di.hvm )
+ p->basic.pae = pae;
+ }
+
+ if ( !di.hvm )
+ {
+ /*
+ * On hardware without CPUID Faulting, PV guests see real topology.
+ * As a consequence, they also need to see the host htt/cmp fields.
+ */
+ p->basic.htt = test_bit(X86_FEATURE_HTT, host_featureset);
+ p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
+ }
+ else
+ {
+ /*
+ * Topology for HVM guests is entirely controlled by Xen. For now, we
+ * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
+ */
+ p->basic.htt = true;
+ p->extd.cmp_legacy = false;
+
+ /*
+ * Leaf 1 EBX[23:16] is Maximum Logical Processors Per Package.
+ * Update to reflect vLAPIC_ID = vCPU_ID * 2, but make sure to avoid
+ * overflow.
+ */
+ if ( !(p->basic.lppp & 0x80) )
+ p->basic.lppp *= 2;
+
+ switch ( p->x86_vendor )
+ {
+ case X86_VENDOR_INTEL:
+            for ( i = 0; (i < ARRAY_SIZE(p->cache.raw) &&
+                          p->cache.subleaf[i].type); ++i )
+ {
+ p->cache.subleaf[i].cores_per_package =
+ (p->cache.subleaf[i].cores_per_package << 1) | 1;
+ p->cache.subleaf[i].threads_per_cache = 0;
+ }
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ /*
+ * Leaf 0x80000008 ECX[15:12] is ApicIdCoreSize.
+ * Leaf 0x80000008 ECX[7:0] is NumberOfCores (minus one).
+ * Update to reflect vLAPIC_ID = vCPU_ID * 2. But avoid
+ * - overflow,
+ * - going out of sync with leaf 1 EBX[23:16],
+ * - incrementing ApicIdCoreSize when it's zero (which changes the
+ * meaning of bits 7:0).
+ *
+             * UPDATE: In addition to avoiding overflow, some
+ * proprietary operating systems have trouble with
+ * apic_id_size values greater than 7. Limit the value to
+ * 7 for now.
+ */
+ if ( p->extd.nc < 0x7f )
+ {
+ if ( p->extd.apic_id_size != 0 && p->extd.apic_id_size < 0x7 )
+ p->extd.apic_id_size++;
+
+ p->extd.nc = (p->extd.nc << 1) | 1;
+ }
+ break;
+ }
+
+ /*
+ * These settings are necessary to cause earlier HVM_PARAM_NESTEDHVM /
+ * XEN_DOMCTL_disable_migrate settings to be reflected correctly in
+ * CPUID. Xen will discard these bits if configuration hasn't been
+ * set for the domain.
+ */
+ p->extd.itsc = true;
+ p->basic.vmx = true;
+ p->extd.svm = true;
+ }
+
+ rc = x86_cpuid_copy_to_buffer(p, leaves, &nr_leaves);
+ if ( rc )
+ {
+ ERROR("Failed to serialise CPUID (%d = %s)", -rc, strerror(-rc));
+ goto out;
+ }
+
+ rc = xc_set_domain_cpu_policy(xch, domid, nr_leaves, leaves, 0, NULL,
+ &err_leaf, &err_subleaf, &err_msr);
+ if ( rc )
+ {
+ PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
+ domid, err_leaf, err_subleaf, err_msr);
+ rc = -errno;
+ goto out;
+ }
+
+ if ( xend && (rc = xc_cpuid_xend_policy(xch, domid, xend)) )
+ goto out;
+
+ rc = 0;
+
+out:
+ free(p);
+ free(leaves);
+
+ return rc;
+}
--- /dev/null
+/*
+ * Xen domain builder -- ARM
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (c) 2011, Citrix Systems
+ */
+#include <inttypes.h>
+#include <assert.h>
+
+#include <xen/xen.h>
+#include <xen/io/protocols.h>
+#include <xen-tools/libs.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+
+#define NR_MAGIC_PAGES 4
+#define CONSOLE_PFN_OFFSET 0
+#define XENSTORE_PFN_OFFSET 1
+#define MEMACCESS_PFN_OFFSET 2
+#define VUART_PFN_OFFSET 3
+
+#define LPAE_SHIFT 9
+
+#define PFN_4K_SHIFT (0)
+#define PFN_2M_SHIFT (PFN_4K_SHIFT+LPAE_SHIFT)
+#define PFN_1G_SHIFT (PFN_2M_SHIFT+LPAE_SHIFT)
+#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT)
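+
+/*
+ * With 4kB base pages and LPAE_SHIFT == 9, each level maps 512 times more
+ * than the one below: PFN_4K_SHIFT covers 4kB, PFN_2M_SHIFT 2MB,
+ * PFN_1G_SHIFT 1GB and PFN_512G_SHIFT 512GB.
+ */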
+
+/* get guest IO ABI protocol */
+const char *xc_domain_get_native_protocol(xc_interface *xch,
+ uint32_t domid)
+{
+ return XEN_IO_PROTO_ABI_ARM;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int alloc_magic_pages(struct xc_dom_image *dom)
+{
+ int rc, i;
+ const xen_pfn_t base = GUEST_MAGIC_BASE >> XC_PAGE_SHIFT;
+ xen_pfn_t p2m[NR_MAGIC_PAGES];
+
+ BUILD_BUG_ON(NR_MAGIC_PAGES > GUEST_MAGIC_SIZE >> XC_PAGE_SHIFT);
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ for (i = 0; i < NR_MAGIC_PAGES; i++)
+ p2m[i] = base + i;
+
+ rc = xc_domain_populate_physmap_exact(
+ dom->xch, dom->guest_domid, NR_MAGIC_PAGES,
+ 0, 0, p2m);
+ if ( rc < 0 )
+ return rc;
+
+ dom->console_pfn = base + CONSOLE_PFN_OFFSET;
+ dom->xenstore_pfn = base + XENSTORE_PFN_OFFSET;
+ dom->vuart_gfn = base + VUART_PFN_OFFSET;
+
+ xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, dom->vuart_gfn);
+
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN,
+ dom->console_pfn);
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN,
+ dom->xenstore_pfn);
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MONITOR_RING_PFN,
+ base + MEMACCESS_PFN_OFFSET);
+ /* allocated by toolstack */
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_EVTCHN,
+ dom->console_evtchn);
+ xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
+ dom->xenstore_evtchn);
+
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int start_info_arm(struct xc_dom_image *dom)
+{
+ DOMPRINTF_CALLED(dom->xch);
+ return 0;
+}
+
+static int shared_info_arm(struct xc_dom_image *dom, void *ptr)
+{
+ DOMPRINTF_CALLED(dom->xch);
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int vcpu_arm32(struct xc_dom_image *dom)
+{
+ vcpu_guest_context_any_t any_ctx;
+ vcpu_guest_context_t *ctxt = &any_ctx.c;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* clear everything */
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->user_regs.pc32 = dom->parms.virt_entry;
+
+ /* Linux boot protocol. See linux.Documentation/arm/Booting. */
+ ctxt->user_regs.r0_usr = 0; /* SBZ */
+ /* Machine ID: We use DTB therefore no machine id */
+ ctxt->user_regs.r1_usr = 0xffffffff;
+    /* ATAGS/DTB: We currently require the guest kernel to be built
+     * with CONFIG_ARM_APPENDED_DTB. Ensure that r2 does not look
+ * like a valid pointer to a set of ATAGS or a DTB.
+ */
+ ctxt->user_regs.r2_usr = dom->devicetree_blob ?
+ dom->devicetree_seg.vstart : 0xffffffff;
+
+ ctxt->sctlr = SCTLR_GUEST_INIT;
+
+ ctxt->ttbr0 = 0;
+ ctxt->ttbr1 = 0;
+ ctxt->ttbcr = 0; /* Defined Reset Value */
+
+ ctxt->user_regs.cpsr = PSR_GUEST32_INIT;
+
+ ctxt->flags = VGCF_online;
+
+ DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx32,
+ ctxt->user_regs.cpsr, ctxt->user_regs.pc32);
+
+ rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
+
+ return rc;
+}
+
+static int vcpu_arm64(struct xc_dom_image *dom)
+{
+ vcpu_guest_context_any_t any_ctx;
+ vcpu_guest_context_t *ctxt = &any_ctx.c;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+ /* clear everything */
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->user_regs.pc64 = dom->parms.virt_entry;
+
+ /* Linux boot protocol. See linux.Documentation/arm64/booting.txt. */
+ ctxt->user_regs.x0 = dom->devicetree_blob ?
+ dom->devicetree_seg.vstart : 0xffffffff;
+ ctxt->user_regs.x1 = 0;
+ ctxt->user_regs.x2 = 0;
+ ctxt->user_regs.x3 = 0;
+
+ DOMPRINTF("DTB %"PRIx64, ctxt->user_regs.x0);
+
+ ctxt->sctlr = SCTLR_GUEST_INIT;
+
+ ctxt->ttbr0 = 0;
+ ctxt->ttbr1 = 0;
+ ctxt->ttbcr = 0; /* Defined Reset Value */
+
+ ctxt->user_regs.cpsr = PSR_GUEST64_INIT;
+
+ ctxt->flags = VGCF_online;
+
+ DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx64,
+ ctxt->user_regs.cpsr, ctxt->user_regs.pc64);
+
+ rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
+
+ return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int set_mode(xc_interface *xch, uint32_t domid, char *guest_type)
+{
+ static const struct {
+ char *guest;
+ uint32_t size;
+ } types[] = {
+ { "xen-3.0-aarch64", 64 },
+ { "xen-3.0-armv7l", 32 },
+ };
+ DECLARE_DOMCTL;
+    int i, rc;
+
+ domctl.domain = domid;
+ domctl.cmd = XEN_DOMCTL_set_address_size;
+ domctl.u.address_size.size = 0;
+
+ for ( i = 0; i < ARRAY_SIZE(types); i++ )
+ if ( !strcmp(types[i].guest, guest_type) )
+ domctl.u.address_size.size = types[i].size;
+ if ( domctl.u.address_size.size == 0 )
+ {
+ xc_dom_printf(xch, "%s: warning: unknown guest type %s",
+ __FUNCTION__, guest_type);
+ return -EINVAL;
+ }
+
+ xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
+ guest_type, domctl.u.address_size.size);
+ rc = do_domctl(xch, &domctl);
+ if ( rc != 0 )
+ xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
+ __FUNCTION__, rc);
+ return rc;
+}
+
+/* >0: success, *nr_pfns set to number actually populated
+ * 0: didn't try with this pfn shift (e.g. misaligned base etc)
+ * <0: ERROR
+ */
+static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
+ xen_pfn_t base_pfn, xen_pfn_t *nr_pfns,
+ xen_pfn_t *extents)
+{
+ /* The mask for this level */
+ const uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
+ /* The shift, mask and next boundary for the level above this one */
+ const int next_shift = pfn_shift + LPAE_SHIFT;
+ const uint64_t next_mask = ((uint64_t)1<<next_shift)-1;
+ const xen_pfn_t next_boundary
+ = (base_pfn + ((uint64_t)1<<next_shift)) & ~next_mask;
+
+ int nr, i, count;
+ xen_pfn_t end_pfn = base_pfn + *nr_pfns;
+
+ /* No level zero super pages with current hardware */
+ if ( pfn_shift == PFN_512G_SHIFT )
+ return 0;
+
+ /* base is misaligned for this level */
+ if ( mask & base_pfn )
+ return 0;
+
+ /*
+ * If base is not aligned at the next level up then try and make
+ * it so for next time around.
+ */
+ if ( (base_pfn & next_mask) && end_pfn > next_boundary )
+ end_pfn = next_boundary;
+
+ count = ( end_pfn - base_pfn ) >> pfn_shift;
+
+ /* Nothing to allocate */
+ if ( !count )
+ return 0;
+
+    for ( i = 0; i < count; i++ )
+ extents[i] = base_pfn + (i<<pfn_shift);
+
+ nr = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
+ pfn_shift, 0, extents);
+ if ( nr <= 0 ) return nr;
+ DOMPRINTF("%s: populated %#x/%#x entries with shift %d",
+ __FUNCTION__, nr, count, pfn_shift);
+
+ *nr_pfns = nr << pfn_shift;
+
+ return 1;
+}
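+
+/*
+ * Worked example (illustrative numbers): base_pfn 0x80200 and *nr_pfns
+ * 0x400 at PFN_2M_SHIFT (9): mask is 0x1ff and base_pfn is 2MB-aligned;
+ * base_pfn is not 1GB-aligned but end_pfn (0x80600) is below the next 1GB
+ * boundary (0xc0000), so count = 0x400 >> 9 = 2 superpage extents are
+ * requested and, on full success, *nr_pfns is rewritten to 2 << 9 = 0x400
+ * pages.
+ */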
+
+static int populate_guest_memory(struct xc_dom_image *dom,
+ xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
+{
+ int rc = 0;
+ xen_pfn_t allocsz, pfn, *extents;
+
+    extents = calloc(1024*1024, sizeof(xen_pfn_t));
+ if ( extents == NULL )
+ {
+ DOMPRINTF("%s: Unable to allocate extent array", __FUNCTION__);
+ return -1;
+ }
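+
+    /*
+     * 1024*1024 extents cover up to 4GB of 4kB pages per batch in the loop
+     * below; the array itself is 8MB, assuming a 64-bit xen_pfn_t.
+     */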
+
+ DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
+ __FUNCTION__,
+ (uint64_t)base_pfn << XC_PAGE_SHIFT,
+ (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
+ (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
+
+ for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
+ {
+ allocsz = min_t(int, 1024*1024, nr_pfns - pfn);
+#if 0 /* Enable this to exercise/debug the code which tries to realign
+ * to a superpage boundary, by misaligning at the start. */
+ if ( pfn == 0 )
+ {
+ allocsz = 1;
+ rc = populate_one_size(dom, PFN_4K_SHIFT,
+ base_pfn + pfn, &allocsz, extents);
+ if (rc < 0) break;
+ if (rc > 0) continue;
+ /* Failed to allocate a single page? */
+ break;
+ }
+#endif
+
+ rc = populate_one_size(dom, PFN_512G_SHIFT,
+ base_pfn + pfn, &allocsz, extents);
+ if ( rc < 0 ) break;
+ if ( rc > 0 ) continue;
+
+ rc = populate_one_size(dom, PFN_1G_SHIFT,
+ base_pfn + pfn, &allocsz, extents);
+ if ( rc < 0 ) break;
+ if ( rc > 0 ) continue;
+
+ rc = populate_one_size(dom, PFN_2M_SHIFT,
+ base_pfn + pfn, &allocsz, extents);
+ if ( rc < 0 ) break;
+ if ( rc > 0 ) continue;
+
+ rc = populate_one_size(dom, PFN_4K_SHIFT,
+ base_pfn + pfn, &allocsz, extents);
+ if ( rc < 0 ) break;
+ if ( rc == 0 )
+ {
+ DOMPRINTF("%s: Not enough RAM", __FUNCTION__);
+ errno = ENOMEM;
+ rc = -1;
+ goto out;
+ }
+ }
+
+out:
+ free(extents);
+ return rc < 0 ? rc : 0;
+}
+
+static int meminit(struct xc_dom_image *dom)
+{
+ int i, rc;
+ uint64_t modbase;
+
+ uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT;
+
+ const uint64_t bankbase[] = GUEST_RAM_BANK_BASES;
+ const uint64_t bankmax[] = GUEST_RAM_BANK_SIZES;
+
+ /* Convenient */
+ const uint64_t kernbase = dom->kernel_seg.vstart;
+ const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/);
+ const uint64_t kernsize = kernend - kernbase;
+ const uint64_t dtb_size = dom->devicetree_blob ?
+ ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0;
+ const uint64_t ramdisk_size = dom->modules[0].blob ?
+ ROUNDUP(dom->modules[0].size, XC_PAGE_SHIFT) : 0;
+ const uint64_t modsize = dtb_size + ramdisk_size;
+ const uint64_t ram128mb = bankbase[0] + (128<<20);
+
+ xen_pfn_t p2m_size;
+ uint64_t bank0end;
+
+ assert(dom->rambase_pfn << XC_PAGE_SHIFT == bankbase[0]);
+
+ if ( modsize + kernsize > bankmax[0] )
+ {
+ DOMPRINTF("%s: Not enough memory for the kernel+dtb+initrd",
+ __FUNCTION__);
+ return -1;
+ }
+
+ if ( ramsize == 0 )
+ {
+ DOMPRINTF("%s: ram size is 0", __FUNCTION__);
+ return -1;
+ }
+
+ if ( ramsize > GUEST_RAM_MAX )
+ {
+ DOMPRINTF("%s: ram size is too large for guest address space: "
+ "%"PRIx64" > %llx",
+ __FUNCTION__, ramsize, GUEST_RAM_MAX);
+ return -1;
+ }
+
+ rc = set_mode(dom->xch, dom->guest_domid, dom->guest_type);
+ if ( rc )
+ return rc;
+
+ for ( i = 0; ramsize && i < GUEST_RAM_BANKS; i++ )
+ {
+ uint64_t banksize = ramsize > bankmax[i] ? bankmax[i] : ramsize;
+
+ ramsize -= banksize;
+
+ p2m_size = ( bankbase[i] + banksize - bankbase[0] ) >> XC_PAGE_SHIFT;
+
+ dom->rambank_size[i] = banksize >> XC_PAGE_SHIFT;
+ }
+
+ assert(dom->rambank_size[0] != 0);
+ assert(ramsize == 0); /* Too much RAM is rejected above */
+
+ dom->p2m_size = p2m_size;
+
+ /* setup initial p2m and allocate guest memory */
+ for ( i = 0; i < GUEST_RAM_BANKS && dom->rambank_size[i]; i++ )
+ {
+ if ((rc = populate_guest_memory(dom,
+ bankbase[i] >> XC_PAGE_SHIFT,
+ dom->rambank_size[i])))
+ return rc;
+ }
+
+ /*
+     * We try to place the dtb+initrd at 128MB, or as high as possible
+     * if we have less RAM than that. If there is no space there then
+     * fall back to just before the kernel.
+ *
+ * If changing this then consider
+ * xen/arch/arm/kernel.c:place_modules as well.
+ */
+ bank0end = bankbase[0] + ((uint64_t)dom->rambank_size[0] << XC_PAGE_SHIFT);
+
+ if ( bank0end >= ram128mb + modsize && kernend < ram128mb )
+ modbase = ram128mb;
+ else if ( bank0end - modsize > kernend )
+ modbase = bank0end - modsize;
+    else if ( kernbase - bankbase[0] > modsize )
+ modbase = kernbase - modsize;
+ else
+ return -1;
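+
+    /*
+     * Illustrative numbers, assuming bank0 is based at 0x40000000: with a
+     * 256MB bank0 (bank0end 0x50000000), kernend 0x40800000 and modsize
+     * 0x01000000, the first test succeeds (0x50000000 >= 0x49000000 and
+     * 0x40800000 < 0x48000000), so the modules land at modbase 0x48000000.
+     */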
+
+ DOMPRINTF("%s: placing boot modules at 0x%" PRIx64, __FUNCTION__, modbase);
+
+ /*
+ * Must map DTB *after* initrd, to satisfy order of calls to
+ * xc_dom_alloc_segment in xc_dom_build_image, which must map
+     * things at monotonically increasing addresses.
+ */
+ if ( ramdisk_size )
+ {
+ dom->modules[0].seg.vstart = modbase;
+ dom->modules[0].seg.vend = modbase + ramdisk_size;
+
+ DOMPRINTF("%s: ramdisk: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+ __FUNCTION__,
+ dom->modules[0].seg.vstart, dom->modules[0].seg.vend);
+
+ modbase += ramdisk_size;
+ }
+
+ if ( dtb_size )
+ {
+ dom->devicetree_seg.vstart = modbase;
+ dom->devicetree_seg.vend = modbase + dtb_size;
+
+ DOMPRINTF("%s: devicetree: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+ __FUNCTION__,
+ dom->devicetree_seg.vstart, dom->devicetree_seg.vend);
+
+ modbase += dtb_size;
+ }
+
+ return 0;
+}
+
+bool xc_dom_translated(const struct xc_dom_image *dom)
+{
+ return true;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int bootearly(struct xc_dom_image *dom)
+{
+ DOMPRINTF("%s: doing nothing", __FUNCTION__);
+ return 0;
+}
+
+static int bootlate(struct xc_dom_image *dom)
+{
+ /* XXX
+ * map shared info
+ * map grant tables
+ * setup shared info
+ */
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct xc_dom_arch xc_dom_32 = {
+ .guest_type = "xen-3.0-armv7l",
+ .native_protocol = XEN_IO_PROTO_ABI_ARM,
+ .page_shift = PAGE_SHIFT_ARM,
+ .sizeof_pfn = 8,
+ .alloc_magic_pages = alloc_magic_pages,
+ .start_info = start_info_arm,
+ .shared_info = shared_info_arm,
+ .vcpu = vcpu_arm32,
+ .meminit = meminit,
+ .bootearly = bootearly,
+ .bootlate = bootlate,
+};
+
+static struct xc_dom_arch xc_dom_64 = {
+ .guest_type = "xen-3.0-aarch64",
+ .native_protocol = XEN_IO_PROTO_ABI_ARM,
+ .page_shift = PAGE_SHIFT_ARM,
+ .sizeof_pfn = 8,
+ .alloc_magic_pages = alloc_magic_pages,
+ .start_info = start_info_arm,
+ .shared_info = shared_info_arm,
+ .vcpu = vcpu_arm64,
+ .meminit = meminit,
+ .bootearly = bootearly,
+ .bootlate = bootlate,
+};
+
+static void __init register_arch_hooks(void)
+{
+ xc_dom_register_arch_hooks(&xc_dom_32);
+ xc_dom_register_arch_hooks(&xc_dom_64);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- ARM zImage bits
+ *
+ * Parse and load ARM zImage kernel images.
+ *
+ * Copyright (C) 2012, Citrix Systems.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+
+#include <arpa/inet.h> /* XXX ntohl is not the right function... */
+
+struct minimal_dtb_header {
+ uint32_t magic;
+ uint32_t total_size;
+ /* There are other fields but we don't use them yet. */
+};
+
+#define DTB_MAGIC 0xd00dfeed
+
+/* ------------------------------------------------------------ */
+/* 32-bit zImage Support */
+/* ------------------------------------------------------------ */
+
+#define ZIMAGE32_MAGIC_OFFSET 0x24
+#define ZIMAGE32_START_OFFSET 0x28
+#define ZIMAGE32_END_OFFSET 0x2c
+
+#define ZIMAGE32_MAGIC 0x016f2818
+
+static int xc_dom_probe_zimage32_kernel(struct xc_dom_image *dom)
+{
+ uint32_t *zimage;
+
+ if ( dom->kernel_blob == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: no kernel image loaded", __FUNCTION__);
+ return -EINVAL;
+ }
+
+    if ( dom->kernel_size < 0x30 /* covers ZIMAGE32_END_OFFSET + 4 */ )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ zimage = (uint32_t *)dom->kernel_blob;
+ if ( zimage[ZIMAGE32_MAGIC_OFFSET/4] != ZIMAGE32_MAGIC )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel is not an arm32 zImage", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int xc_dom_parse_zimage32_kernel(struct xc_dom_image *dom)
+{
+ uint32_t *zimage;
+ uint32_t start, entry_addr;
+ uint64_t v_start, v_end;
+ uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ zimage = (uint32_t *)dom->kernel_blob;
+
+ /* Do not load kernel at the very first RAM address */
+ v_start = rambase + 0x8000;
+
+ if ( dom->kernel_size > UINT64_MAX - v_start )
+ {
+ DOMPRINTF("%s: kernel is too large\n", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ v_end = v_start + dom->kernel_size;
+
+ /*
+ * If start is invalid then the guest will start at some invalid
+ * address and crash, but this happens in guest context so doesn't
+ * concern us here.
+ */
+ start = zimage[ZIMAGE32_START_OFFSET/4];
+
+    if ( start == 0 )
+ entry_addr = v_start;
+ else
+ entry_addr = start;
+
+ /* find kernel segment */
+ dom->kernel_seg.vstart = v_start;
+ dom->kernel_seg.vend = v_end;
+
+ dom->parms.virt_entry = entry_addr;
+ dom->parms.virt_base = rambase;
+
+ dom->guest_type = "xen-3.0-armv7l";
+ DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+ __FUNCTION__, dom->guest_type,
+ dom->kernel_seg.vstart, dom->kernel_seg.vend);
+ return 0;
+}
+
+/* ------------------------------------------------------------ */
+/* 64-bit zImage Support */
+/* ------------------------------------------------------------ */
+
+#define ZIMAGE64_MAGIC_V0 0x14000008
+#define ZIMAGE64_MAGIC_V1 0x644d5241 /* "ARM\x64" */
+
+/* linux/Documentation/arm64/booting.txt */
+struct zimage64_hdr {
+ uint32_t magic0;
+ uint32_t res0;
+ uint64_t text_offset; /* Image load offset */
+ uint64_t res1;
+ uint64_t res2;
+ /* zImage V1 only from here */
+ uint64_t res3;
+ uint64_t res4;
+ uint64_t res5;
+ uint32_t magic1;
+ uint32_t res6;
+};
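+
+/*
+ * Two magics exist: v1 images carry "ARM\x64" in magic1 (at offset 0x38),
+ * while older v0 images are recognised by magic0, apparently the encoding
+ * of the branch instruction early arm64 kernels used as their first word.
+ */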
+static int xc_dom_probe_zimage64_kernel(struct xc_dom_image *dom)
+{
+ struct zimage64_hdr *zimage;
+
+ if ( dom->kernel_blob == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: no kernel image loaded", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( dom->kernel_size < sizeof(*zimage) )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ zimage = dom->kernel_blob;
+ if ( zimage->magic0 != ZIMAGE64_MAGIC_V0 &&
+ zimage->magic1 != ZIMAGE64_MAGIC_V1 )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel is not an arm64 Image", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int xc_dom_parse_zimage64_kernel(struct xc_dom_image *dom)
+{
+ struct zimage64_hdr *zimage;
+ uint64_t v_start, v_end;
+ uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ zimage = dom->kernel_blob;
+
+ if ( zimage->text_offset > UINT64_MAX - rambase )
+ {
+ DOMPRINTF("%s: kernel text offset is too large\n", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ v_start = rambase + zimage->text_offset;
+
+ if ( dom->kernel_size > UINT64_MAX - v_start )
+ {
+ DOMPRINTF("%s: kernel is too large\n", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ v_end = v_start + dom->kernel_size;
+
+ dom->kernel_seg.vstart = v_start;
+ dom->kernel_seg.vend = v_end;
+
+ /* Call the kernel at offset 0 */
+ dom->parms.virt_entry = v_start;
+ dom->parms.virt_base = rambase;
+
+ dom->guest_type = "xen-3.0-aarch64";
+ DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+ __FUNCTION__, dom->guest_type,
+ dom->kernel_seg.vstart, dom->kernel_seg.vend);
+
+ return 0;
+}
+
+/* ------------------------------------------------------------ */
+/* Common zImage Support */
+/* ------------------------------------------------------------ */
+
+static int xc_dom_load_zimage_kernel(struct xc_dom_image *dom)
+{
+ void *dst;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ dst = xc_dom_seg_to_ptr(dom, &dom->kernel_seg);
+ if ( dst == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->kernel_seg) => NULL",
+ __func__);
+ return -1;
+ }
+
+ DOMPRINTF("%s: kernel seg %#"PRIx64"-%#"PRIx64,
+ __func__, dom->kernel_seg.vstart, dom->kernel_seg.vend);
+ DOMPRINTF("%s: copy %zd bytes from blob %p to dst %p",
+ __func__, dom->kernel_size, dom->kernel_blob, dst);
+
+ memcpy(dst, dom->kernel_blob, dom->kernel_size);
+
+ return 0;
+}
+
+static struct xc_dom_loader zimage32_loader = {
+ .name = "Linux zImage (ARM32)",
+ .probe = xc_dom_probe_zimage32_kernel,
+ .parser = xc_dom_parse_zimage32_kernel,
+ .loader = xc_dom_load_zimage_kernel,
+};
+
+static struct xc_dom_loader zimage64_loader = {
+ .name = "Linux zImage (ARM64)",
+ .probe = xc_dom_probe_zimage64_kernel,
+ .parser = xc_dom_parse_zimage64_kernel,
+ .loader = xc_dom_load_zimage_kernel,
+};
+
+static void __init register_loader(void)
+{
+ xc_dom_register_loader(&zimage32_loader);
+ xc_dom_register_loader(&zimage64_loader);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Some of the field descriptions were copied from "The Multiboot
+ * Specification", Copyright 1995, 96 Bryan Ford <baford@cs.utah.edu>,
+ * Erich Stefan Boleyn <erich@uruk.org> Copyright 1999, 2000, 2001, 2002
+ * Free Software Foundation, Inc.
+ */
+
+/******************************************************************************
+ *
+ * Loads simple binary images. It's like a .COM file in MS-DOS. No headers are
+ * present. The only requirement is that it must have a xen_bin_image table
+ * somewhere in the first 8192 bytes, starting on a 32-bit aligned address.
+ * Those familiar with the multiboot specification should recognize this; it's
+ * (almost) the same as the multiboot header.
+ * The layout of the xen_bin_image table is:
+ *
+ * Offset Type Name Note
+ * 0 uint32_t magic required
+ * 4 uint32_t flags required
+ * 8 uint32_t checksum required
+ * 12 uint32_t header_addr required
+ * 16 uint32_t load_addr required
+ * 20 uint32_t load_end_addr required
+ * 24 uint32_t bss_end_addr required
+ * 28 uint32_t entry_addr required
+ *
+ * - magic
+ * Magic number identifying the table. For images to be loaded by Xen 3, the
+ * magic value is 0x336ec578 ("xEn3" with the 0x80 bit of the "E" set).
+ * - flags
+ * bit 0: indicates whether the image needs to be loaded on a page boundary
+ * bit 1: reserved, must be 0 (the multiboot spec uses this bit to indicate
+ * that memory info should be passed to the image)
+ * bit 2: reserved, must be 0 (the multiboot spec uses this bit to indicate
+ * that the bootloader should pass video mode info to the image)
+ * bit 16: reserved, must be 1 (the multiboot spec uses this bit to indicate
+ * that the values in the fields header_addr - entry_addr are
+ * valid)
+ * All other bits should be set to 0.
+ * - checksum
+ * When added to "magic" and "flags", the resulting value should be 0.
+ * - header_addr
+ * Contains the virtual address corresponding to the beginning of the
+ * table - the memory location at which the magic value is supposed to be
+ * loaded. This field serves to synchronize the mapping between OS image
+ * offsets and virtual memory addresses.
+ * - load_addr
+ * Contains the virtual address of the beginning of the text segment. The
+ * offset in the OS image file at which to start loading is defined by the
+ * offset at which the table was found, minus (header addr - load addr).
+ * load addr must be less than or equal to header addr.
+ * - load_end_addr
+ * Contains the virtual address of the end of the data segment.
+ * (load_end_addr - load_addr) specifies how much data to load. This implies
+ * that the text and data segments must be consecutive in the OS image. If
+ * this field is zero, the domain builder assumes that the text and data
+ * segments occupy the whole OS image file.
+ * - bss_end_addr
+ * Contains the virtual address of the end of the bss segment. The domain
+ * builder initializes this area to zero, and reserves the memory it occupies
+ * to avoid placing boot modules and other data relevant to the loaded image
+ * in that area. If this field is zero, the domain builder assumes that no bss
+ * segment is present.
+ * - entry_addr
+ * The virtual address at which to start execution of the loaded image.
+ *
+ */
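+
+/*
+ * Example of a well-formed table (illustrative values): magic 0x336ec578,
+ * flags 0x00010000 (only the bit-16 "addresses valid" flag set), and
+ * checksum 0xcc813a88, since 0x336ec578 + 0x00010000 + 0xcc813a88 == 0
+ * when truncated to 32 bits.
+ */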
+
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+
+#define round_pgup(_p) (((_p)+(PAGE_SIZE_X86-1))&PAGE_MASK_X86)
+#define round_pgdown(_p) ((_p)&PAGE_MASK_X86)
+
+struct xen_bin_image_table
+{
+ uint32_t magic;
+ uint32_t flags;
+ uint32_t checksum;
+ uint32_t header_addr;
+ uint32_t load_addr;
+ uint32_t load_end_addr;
+ uint32_t bss_end_addr;
+ uint32_t entry_addr;
+};
+
+#define XEN_MULTIBOOT_MAGIC3 0x336ec578
+
+#define XEN_MULTIBOOT_FLAG_ALIGN4K 0x00000001
+#define XEN_MULTIBOOT_FLAG_NEEDMEMINFO 0x00000002
+#define XEN_MULTIBOOT_FLAG_NEEDVIDINFO 0x00000004
+#define XEN_MULTIBOOT_FLAG_ADDRSVALID 0x00010000
+#define XEN_MULTIBOOT_FLAG_PAE_SHIFT 14
+#define XEN_MULTIBOOT_FLAG_PAE_MASK (3 << XEN_MULTIBOOT_FLAG_PAE_SHIFT)
+
+/* Flags we test for */
+#define FLAGS_MASK ((~ 0) & (~ XEN_MULTIBOOT_FLAG_ALIGN4K) & \
+ (~ XEN_MULTIBOOT_FLAG_PAE_MASK))
+#define FLAGS_REQUIRED XEN_MULTIBOOT_FLAG_ADDRSVALID
+
+/* --------------------------------------------------------------------- */
+
+static struct xen_bin_image_table *find_table(struct xc_dom_image *dom)
+{
+ struct xen_bin_image_table *table;
+ uint32_t *probe_ptr;
+ uint32_t *probe_end;
+
+ if ( dom->kernel_size < sizeof(*table) )
+ return NULL;
+ probe_ptr = dom->kernel_blob;
+ if ( dom->kernel_size > (8192 + sizeof(*table)) )
+ probe_end = dom->kernel_blob + 8192;
+ else
+ probe_end = dom->kernel_blob + dom->kernel_size - sizeof(*table);
+
+ for ( table = NULL; probe_ptr < probe_end; probe_ptr++ )
+ {
+ if ( *probe_ptr == XEN_MULTIBOOT_MAGIC3 )
+ {
+ table = (struct xen_bin_image_table *) probe_ptr;
+ /* Checksum correct? */
+ if ( (table->magic + table->flags + table->checksum) == 0 )
+ return table;
+ }
+ }
+ return NULL;
+}
+
+static int xc_dom_probe_bin_kernel(struct xc_dom_image *dom)
+{
+ return find_table(dom) ? 0 : -EINVAL;
+}
+
+static int xc_dom_parse_bin_kernel(struct xc_dom_image *dom)
+{
+ struct xen_bin_image_table *image_info;
+ char *image = dom->kernel_blob;
+ size_t image_size = dom->kernel_size;
+ uint32_t start_addr;
+ uint32_t load_end_addr;
+ uint32_t bss_end_addr;
+ uint32_t pae_flags;
+
+ image_info = find_table(dom);
+ if ( !image_info )
+ return -EINVAL;
+
+ DOMPRINTF("%s: multiboot header fields", __FUNCTION__);
+ DOMPRINTF(" flags: 0x%" PRIx32 "", image_info->flags);
+ DOMPRINTF(" header_addr: 0x%" PRIx32 "", image_info->header_addr);
+ DOMPRINTF(" load_addr: 0x%" PRIx32 "", image_info->load_addr);
+ DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", image_info->load_end_addr);
+ DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", image_info->bss_end_addr);
+ DOMPRINTF(" entry_addr: 0x%" PRIx32 "", image_info->entry_addr);
+
+ /* Check the flags */
+ if ( (image_info->flags & FLAGS_MASK) != FLAGS_REQUIRED )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: xen_bin_image_table flags required "
+ "0x%08" PRIx32 " found 0x%08" PRIx32 "",
+ __FUNCTION__, FLAGS_REQUIRED, image_info->flags & FLAGS_MASK);
+ return -EINVAL;
+ }
+
+ /* Sanity check on the addresses */
+ if ( (image_info->header_addr < image_info->load_addr) ||
+ ((char *) image_info - image) <
+ (image_info->header_addr - image_info->load_addr) )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid header_addr.",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ start_addr = image_info->header_addr - ((char *)image_info - image);
+ load_end_addr = image_info->load_end_addr ?: start_addr + image_size;
+ bss_end_addr = image_info->bss_end_addr ?: load_end_addr;
+
+ DOMPRINTF("%s: calculated addresses", __FUNCTION__);
+ DOMPRINTF(" start_addr: 0x%" PRIx32 "", start_addr);
+ DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", load_end_addr);
+ DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", bss_end_addr);
+
+ if ( (start_addr + image_size) < load_end_addr )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid load_end_addr.",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( bss_end_addr < load_end_addr)
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid bss_end_addr.",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ dom->kernel_seg.vstart = image_info->load_addr;
+ dom->kernel_seg.vend = bss_end_addr;
+ dom->parms.virt_base = start_addr;
+ dom->parms.virt_entry = image_info->entry_addr;
+
+ pae_flags = image_info->flags & XEN_MULTIBOOT_FLAG_PAE_MASK;
+ switch (pae_flags >> XEN_MULTIBOOT_FLAG_PAE_SHIFT) {
+ case 0:
+ dom->guest_type = "xen-3.0-x86_32";
+ break;
+ case 1:
+ dom->guest_type = "xen-3.0-x86_32p";
+ break;
+ case 2:
+ dom->guest_type = "xen-3.0-x86_64";
+ break;
+ case 3:
+        /* The kernel detects PAE at runtime, so try to figure out
+         * whether Xen supports PAE and advertise a PAE-capable kernel
+         * if it does. */
+ dom->guest_type = "xen-3.0-x86_32";
+ if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") )
+ {
+ DOMPRINTF("%s: PAE fixup", __FUNCTION__);
+ dom->guest_type = "xen-3.0-x86_32p";
+ dom->parms.pae = XEN_PAE_EXTCR3;
+ }
+ break;
+ }
+ return 0;
+}
+
+static int xc_dom_load_bin_kernel(struct xc_dom_image *dom)
+{
+ struct xen_bin_image_table *image_info;
+ char *image = dom->kernel_blob;
+ char *dest;
+ size_t image_size = dom->kernel_size;
+ size_t dest_size;
+ uint32_t start_addr;
+ uint32_t load_end_addr;
+ uint32_t bss_end_addr;
+ uint32_t skip, text_size, bss_size;
+
+ image_info = find_table(dom);
+ if ( !image_info )
+ return -EINVAL;
+
+ start_addr = image_info->header_addr - ((char *)image_info - image);
+ load_end_addr = image_info->load_end_addr ?: start_addr + image_size;
+ bss_end_addr = image_info->bss_end_addr ?: load_end_addr;
+
+ /* It's possible that we need to skip the first part of the image */
+ skip = image_info->load_addr - start_addr;
+ text_size = load_end_addr - image_info->load_addr;
+ bss_size = bss_end_addr - load_end_addr;
+
+ DOMPRINTF("%s: calculated sizes", __FUNCTION__);
+ DOMPRINTF(" skip: 0x%" PRIx32 "", skip);
+ DOMPRINTF(" text_size: 0x%" PRIx32 "", text_size);
+ DOMPRINTF(" bss_size: 0x%" PRIx32 "", bss_size);
+
+ dest = xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart, &dest_size);
+ if ( dest == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart)"
+ " => NULL", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( dest_size < text_size ||
+ dest_size - text_size < bss_size )
+ {
+ DOMPRINTF("%s: mapped region is too small for image", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( image_size < skip ||
+ image_size - skip < text_size )
+ {
+ DOMPRINTF("%s: image is too small for declared text size",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ memcpy(dest, image + skip, text_size);
+ memset(dest + text_size, 0, bss_size);
+
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct xc_dom_loader bin_loader = {
+ .name = "multiboot-binary",
+ .probe = xc_dom_probe_bin_kernel,
+ .parser = xc_dom_parse_bin_kernel,
+ .loader = xc_dom_load_bin_kernel,
+};
+
+static void __init register_loader(void)
+{
+ xc_dom_register_loader(&bin_loader);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- xen booter.
+ *
+ * This is the code which actually boots a fresh
+ * prepared domain image as xen guest domain.
+ *
+ * ==> this is the only domain builder code piece
+ * where xen hypercalls are allowed <==
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <zlib.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "xc_core.h"
+#include <xen/hvm/params.h>
+#include <xen/grant_table.h>
+
+/* ------------------------------------------------------------------------ */
+
+static int setup_hypercall_page(struct xc_dom_image *dom)
+{
+ DECLARE_DOMCTL;
+ xen_pfn_t pfn;
+ int rc;
+
+ if ( dom->parms.virt_hypercall == -1 )
+ return 0;
+ pfn = (dom->parms.virt_hypercall - dom->parms.virt_base)
+ >> XC_DOM_PAGE_SHIFT(dom);
+
+ DOMPRINTF("%s: vaddr=0x%" PRIx64 " pfn=0x%" PRIpfn "", __FUNCTION__,
+ dom->parms.virt_hypercall, pfn);
+ domctl.cmd = XEN_DOMCTL_hypercall_init;
+ domctl.domain = dom->guest_domid;
+ domctl.u.hypercall_init.gmfn = xc_dom_p2m(dom, pfn);
+ rc = do_domctl(dom->xch, &domctl);
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: HYPERCALL_INIT failed: %d - %s)",
+ __FUNCTION__, errno, strerror(errno));
+ return rc;
+}
+
+
+/* ------------------------------------------------------------------------ */
+
+int xc_dom_compat_check(struct xc_dom_image *dom)
+{
+ xen_capabilities_info_t xen_caps;
+ char *item, *ptr;
+ int match, found = 0;
+
+ strncpy(xen_caps, dom->xen_caps, XEN_CAPABILITIES_INFO_LEN - 1);
+ xen_caps[XEN_CAPABILITIES_INFO_LEN - 1] = '\0';
+
+ for ( item = strtok_r(xen_caps, " ", &ptr);
+ item != NULL ; item = strtok_r(NULL, " ", &ptr) )
+ {
+ match = !strcmp(dom->guest_type, item);
+ DOMPRINTF("%s: supported guest type: %s%s", __FUNCTION__,
+ item, match ? " <= matches" : "");
+ if ( match )
+ found++;
+ }
+ if ( !found )
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: guest type %s not supported by xen kernel, sorry",
+ __FUNCTION__, dom->guest_type);
+
+ return found;
+}
+
+int xc_dom_boot_xen_init(struct xc_dom_image *dom, xc_interface *xch, uint32_t domid)
+{
+ dom->xch = xch;
+ dom->guest_domid = domid;
+
+ dom->xen_version = xc_version(xch, XENVER_version, NULL);
+ if ( xc_version(xch, XENVER_capabilities, &dom->xen_caps) < 0 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR, "can't get xen capabilities");
+ return -1;
+ }
+ DOMPRINTF("%s: ver %d.%d, caps %s", __FUNCTION__,
+              dom->xen_version >> 16, dom->xen_version & 0xffff,
+ dom->xen_caps);
+ return 0;
+}
+
+int xc_dom_boot_mem_init(struct xc_dom_image *dom)
+{
+ long rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ rc = dom->arch_hooks->meminit(dom);
+ if ( rc != 0 )
+ {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "%s: can't allocate low memory for domain",
+ __FUNCTION__);
+ return rc;
+ }
+
+ return 0;
+}
+
+void *xc_dom_boot_domU_map(struct xc_dom_image *dom, xen_pfn_t pfn,
+ xen_pfn_t count)
+{
+ int page_shift = XC_DOM_PAGE_SHIFT(dom);
+ privcmd_mmap_entry_t *entries;
+ void *ptr;
+ int i;
+ int err;
+
+ entries = xc_dom_malloc(dom, count * sizeof(privcmd_mmap_entry_t));
+ if ( entries == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn
+ " [malloc]", __FUNCTION__, pfn, count);
+ return NULL;
+ }
+
+ for ( i = 0; i < count; i++ )
+ entries[i].mfn = xc_dom_p2m(dom, pfn + i);
+
+ ptr = xc_map_foreign_ranges(dom->xch, dom->guest_domid,
+ count << page_shift, PROT_READ | PROT_WRITE, 1 << page_shift,
+ entries, count);
+ if ( ptr == NULL )
+ {
+ err = errno;
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn
+ " [mmap, errno=%i (%s)]", __FUNCTION__, pfn, count,
+ err, strerror(err));
+ return NULL;
+ }
+
+ return ptr;
+}
+
+int xc_dom_boot_image(struct xc_dom_image *dom)
+{
+ xc_dominfo_t info;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* misc stuff*/
+ if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 )
+ return rc;
+
+ /* collect some info */
+ rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &info);
+ if ( rc < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc);
+ return rc;
+ }
+ if ( rc == 0 || info.domid != dom->guest_domid )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: Huh? No domains found (nr_domains=%d) "
+ "or domid mismatch (%d != %d)", __FUNCTION__,
+ rc, info.domid, dom->guest_domid);
+ return -1;
+ }
+ dom->shared_info_mfn = info.shared_info_frame;
+
+ /* sanity checks */
+ if ( !xc_dom_compat_check(dom) )
+ return -1;
+
+ /* initial mm setup */
+ if ( dom->arch_hooks->setup_pgtables &&
+ (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 )
+ return rc;
+
+ /* start info page */
+ if ( dom->arch_hooks->start_info )
+ dom->arch_hooks->start_info(dom);
+
+ /* hypercall page */
+ if ( (rc = setup_hypercall_page(dom)) != 0 )
+ return rc;
+ xc_dom_log_memory_footprint(dom);
+
+ /* misc x86 stuff */
+ if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 )
+ return rc;
+
+ /* let the vm run */
+ if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 )
+ return rc;
+ xc_dom_unmap_all(dom);
+
+ return rc;
+}
+
+static xen_pfn_t xc_dom_gnttab_setup(xc_interface *xch, uint32_t domid)
+{
+ gnttab_setup_table_t setup;
+ DECLARE_HYPERCALL_BUFFER(xen_pfn_t, gmfnp);
+ int rc;
+ xen_pfn_t gmfn;
+
+ gmfnp = xc_hypercall_buffer_alloc(xch, gmfnp, sizeof(*gmfnp));
+    if ( gmfnp == NULL )
+ return -1;
+
+ setup.dom = domid;
+ setup.nr_frames = 1;
+ set_xen_guest_handle(setup.frame_list, gmfnp);
+ setup.status = 0;
+
+ rc = xc_gnttab_op(xch, GNTTABOP_setup_table, &setup, sizeof(setup), 1);
+ gmfn = *gmfnp;
+ xc_hypercall_buffer_free(xch, gmfnp);
+
+ if ( rc != 0 || setup.status != GNTST_okay )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to setup domU grant table "
+ "[errno=%d, status=%" PRId16 "]\n",
+ __FUNCTION__, rc != 0 ? errno : 0, setup.status);
+ return -1;
+ }
+
+ return gmfn;
+}
+
+static void xc_dom_set_gnttab_entry(xc_interface *xch,
+ grant_entry_v1_t *gnttab,
+ unsigned int idx,
+ uint32_t guest_domid,
+ uint32_t backend_domid,
+ xen_pfn_t guest_gfn)
+{
+ if ( guest_domid == backend_domid || guest_gfn == -1 )
+ return;
+
+ xc_dom_printf(xch, "%s: d%d gnt[%u] -> d%d 0x%"PRI_xen_pfn,
+ __func__, guest_domid, idx, backend_domid, guest_gfn);
+
+ gnttab[idx].flags = GTF_permit_access;
+ gnttab[idx].domid = backend_domid;
+ gnttab[idx].frame = guest_gfn;
+}
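+
+/*
+ * The seeded entries use grant table v1 semantics: GTF_permit_access lets
+ * backend_domid map or copy the frame at guest_gfn.  The indices
+ * GNTTAB_RESERVED_CONSOLE and GNTTAB_RESERVED_XENSTORE are slots reserved
+ * by the ABI for the console and xenstore rings respectively.
+ */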
+
+static int compat_gnttab_seed(xc_interface *xch, uint32_t domid,
+ xen_pfn_t console_gfn,
+ xen_pfn_t xenstore_gfn,
+ uint32_t console_domid,
+ uint32_t xenstore_domid)
+{
+
+ xen_pfn_t gnttab_gfn;
+ grant_entry_v1_t *gnttab;
+
+ gnttab_gfn = xc_dom_gnttab_setup(xch, domid);
+ if ( gnttab_gfn == -1 )
+ return -1;
+
+ gnttab = xc_map_foreign_range(xch,
+ domid,
+ PAGE_SIZE,
+ PROT_READ|PROT_WRITE,
+ gnttab_gfn);
+ if ( gnttab == NULL )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to map d%d grant table "
+ "[errno=%d]\n",
+ __func__, domid, errno);
+ return -1;
+ }
+
+ xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_CONSOLE,
+ domid, console_domid, console_gfn);
+ xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_XENSTORE,
+ domid, xenstore_domid, xenstore_gfn);
+
+ if ( munmap(gnttab, PAGE_SIZE) == -1 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to unmap d%d grant table "
+ "[errno=%d]\n",
+ __func__, domid, errno);
+ return -1;
+ }
+
+    /* The guest shouldn't really touch its grant table until it has
+     * enabled its caches. But let's be nice. */
+ xc_domain_cacheflush(xch, domid, gnttab_gfn, 1);
+
+ return 0;
+}
+
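+/*
+ * For translated guests the grant table frame is not normally visible in
+ * the physmap, so: borrow a scratch gfn, add the gnttab frame there via
+ * XENMEM_add_to_physmap, seed the reserved entries through that mapping,
+ * then remove it from the physmap again.
+ */
+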
+static int compat_gnttab_hvm_seed(xc_interface *xch, uint32_t domid,
+ xen_pfn_t console_gfn,
+ xen_pfn_t xenstore_gfn,
+ uint32_t console_domid,
+ uint32_t xenstore_domid)
+{
+ int rc;
+ xen_pfn_t scratch_gfn;
+ struct xen_add_to_physmap xatp = {
+ .domid = domid,
+ .space = XENMAPSPACE_grant_table,
+ .idx = 0,
+ };
+ struct xen_remove_from_physmap xrfp = {
+ .domid = domid,
+ };
+
+ rc = xc_core_arch_get_scratch_gpfn(xch, domid, &scratch_gfn);
+ if ( rc < 0 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to get a scratch gfn from d%d"
+ "[errno=%d]\n",
+ __func__, domid, errno);
+ return -1;
+ }
+ xatp.gpfn = scratch_gfn;
+ xrfp.gpfn = scratch_gfn;
+
+ xc_dom_printf(xch, "%s: d%d: pfn=0x%"PRI_xen_pfn, __func__,
+ domid, scratch_gfn);
+
+ rc = do_memory_op(xch, XENMEM_add_to_physmap, &xatp, sizeof(xatp));
+ if ( rc != 0 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to add gnttab to d%d physmap "
+ "[errno=%d]\n",
+ __func__, domid, errno);
+ return -1;
+ }
+
+ rc = compat_gnttab_seed(xch, domid,
+ console_gfn, xenstore_gfn,
+ console_domid, xenstore_domid);
+    if ( rc != 0 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to seed gnttab entries for d%d\n",
+ __func__, domid);
+ (void) do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp,
+ sizeof(xrfp));
+ return -1;
+ }
+
+ rc = do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, sizeof(xrfp));
+    if ( rc != 0 )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to remove gnttab from d%d physmap "
+ "[errno=%d]\n",
+ __func__, domid, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+int xc_dom_gnttab_seed(xc_interface *xch, uint32_t guest_domid,
+ bool is_hvm, xen_pfn_t console_gfn,
+ xen_pfn_t xenstore_gfn, uint32_t console_domid,
+ uint32_t xenstore_domid)
+{
+ xenforeignmemory_handle* fmem = xch->fmem;
+ xenforeignmemory_resource_handle *fres;
+ void *addr = NULL;
+
+ fres = xenforeignmemory_map_resource(
+ fmem, guest_domid, XENMEM_resource_grant_table,
+ XENMEM_resource_grant_table_id_shared, 0, 1, &addr,
+ PROT_READ | PROT_WRITE, 0);
+ if ( !fres )
+ {
+ if ( errno == EOPNOTSUPP )
+ return is_hvm ?
+ compat_gnttab_hvm_seed(xch, guest_domid,
+ console_gfn, xenstore_gfn,
+ console_domid, xenstore_domid) :
+ compat_gnttab_seed(xch, guest_domid,
+ console_gfn, xenstore_gfn,
+ console_domid, xenstore_domid);
+
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: failed to acquire d%d grant table [errno=%d]\n",
+ __func__, guest_domid, errno);
+ return -1;
+ }
+
+ xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_CONSOLE,
+ guest_domid, console_domid, console_gfn);
+ xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_XENSTORE,
+ guest_domid, xenstore_domid, xenstore_gfn);
+
+ xenforeignmemory_unmap_resource(fmem, fres);
+
+ return 0;
+}
+
+int xc_dom_gnttab_init(struct xc_dom_image *dom)
+{
+ bool is_hvm = xc_dom_translated(dom);
+ xen_pfn_t console_gfn = xc_dom_p2m(dom, dom->console_pfn);
+ xen_pfn_t xenstore_gfn = xc_dom_p2m(dom, dom->xenstore_pfn);
+
+ return xc_dom_gnttab_seed(dom->xch, dom->guest_domid, is_hvm,
+ console_gfn, xenstore_gfn,
+ dom->console_domid, dom->xenstore_domid);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- bzImage bits
+ *
+ * Parse and load bzImage kernel images.
+ *
+ * This relies on version 2.08 of the boot protocol, which contains an
+ * ELF file embedded in the bzImage. The loader extracts this ELF
+ * image and passes it off to the standard ELF loader.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ * written 2007 by Jeremy Fitzhardinge <jeremy@xensource.com>
+ * written 2008 by Ian Campbell <ijc@hellion.org.uk>
+ * written 2009 by Chris Lalancette <clalance@redhat.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress.h"
+
+#include <xen-tools/libs.h>
+
+#ifndef __MINIOS__
+
+#if defined(HAVE_BZLIB)
+
+#include <bzlib.h>
+
+static int xc_try_bzip2_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ bz_stream stream;
+ int ret;
+ char *out_buf;
+ char *tmp_buf;
+ int retval = -1;
+ unsigned int outsize;
+ uint64_t total;
+
+ stream.bzalloc = NULL;
+ stream.bzfree = NULL;
+ stream.opaque = NULL;
+
+    if ( dom->kernel_size == 0 )
+ {
+ DOMPRINTF("BZIP2: Input is 0 size");
+ return -1;
+ }
+
+ ret = BZ2_bzDecompressInit(&stream, 0, 0);
+ if ( ret != BZ_OK )
+ {
+ DOMPRINTF("BZIP2: Error initting stream");
+ return -1;
+ }
+
+    /* Sigh. We don't know up-front how much memory we are going to need
+     * for the output buffer. Allocate it to be the same size as the
+     * input buffer to start with, and realloc as needed.
+ */
+ outsize = dom->kernel_size;
+
+ /*
+ * stream.avail_in and outsize are unsigned int, while kernel_size
+ * is a size_t. Check we aren't overflowing.
+ */
+ if ( outsize != dom->kernel_size )
+ {
+ DOMPRINTF("BZIP2: Input too large");
+ goto bzip2_cleanup;
+ }
+
+ out_buf = malloc(outsize);
+ if ( out_buf == NULL )
+ {
+ DOMPRINTF("BZIP2: Failed to alloc memory");
+ goto bzip2_cleanup;
+ }
+
+ stream.next_in = dom->kernel_blob;
+ stream.avail_in = dom->kernel_size;
+
+ stream.next_out = out_buf;
+ stream.avail_out = dom->kernel_size;
+
+ for ( ; ; )
+ {
+ ret = BZ2_bzDecompress(&stream);
+ if ( ret == BZ_STREAM_END )
+ {
+ DOMPRINTF("BZIP2: Saw data stream end");
+ retval = 0;
+ break;
+ }
+ if ( ret != BZ_OK )
+ {
+ DOMPRINTF("BZIP2: error %d", ret);
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+
+ if ( stream.avail_out == 0 )
+ {
+ /* Protect against output buffer overflow */
+ if ( outsize > UINT_MAX / 2 )
+ {
+ DOMPRINTF("BZIP2: output buffer overflow");
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+
+ if ( xc_dom_kernel_check_size(dom, outsize * 2) )
+ {
+ DOMPRINTF("BZIP2: output too large");
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+
+ tmp_buf = realloc(out_buf, outsize * 2);
+ if ( tmp_buf == NULL )
+ {
+ DOMPRINTF("BZIP2: Failed to realloc memory");
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+ out_buf = tmp_buf;
+
+ stream.next_out = out_buf + outsize;
+ stream.avail_out = (outsize * 2) - outsize;
+ outsize *= 2;
+ }
+ else if ( stream.avail_in == 0 )
+ {
+ /*
+ * If there is output buffer available then this indicates
+ * that BZ2_bzDecompress would like more input data to be
+ * provided. However our complete input buffer is in
+ * memory and provided upfront so if avail_in is zero this
+ * actually indicates a truncated input.
+ */
+ DOMPRINTF("BZIP2: not enough input");
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+ }
+
+ total = (((uint64_t)stream.total_out_hi32) << 32) | stream.total_out_lo32;
+
+ if ( xc_dom_register_external(dom, out_buf, total) )
+ {
+ DOMPRINTF("BZIP2: Error registering stream output");
+ free(out_buf);
+ goto bzip2_cleanup;
+ }
+
+ DOMPRINTF("%s: BZIP2 decompress OK, 0x%zx -> 0x%lx",
+ __FUNCTION__, *size, (long unsigned int) total);
+
+ *blob = out_buf;
+ *size = total;
+
+ bzip2_cleanup:
+ BZ2_bzDecompressEnd(&stream);
+
+ return retval;
+}
+
+#else /* !defined(HAVE_BZLIB) */
+
+static int xc_try_bzip2_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: BZIP2 decompress support unavailable",
+ __FUNCTION__);
+ return -1;
+}
+
+#endif
+
+#if defined(HAVE_LZMA)
+
+#include <lzma.h>
+
+static int _xc_try_lzma_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size,
+ lzma_stream *stream, const char *what)
+{
+ lzma_ret ret;
+ lzma_action action = LZMA_RUN;
+ unsigned char *out_buf;
+ unsigned char *tmp_buf;
+ int retval = -1;
+ size_t outsize;
+ const char *msg;
+
+    if ( dom->kernel_size == 0 )
+ {
+ DOMPRINTF("%s: Input is 0 size", what);
+ return -1;
+ }
+
+    /*
+     * Sigh.  We don't know up-front how much memory we are going to need
+     * for the output buffer.  Allocate it to match the input buffer to
+     * start with, and realloc as needed.
+     */
+ outsize = dom->kernel_size;
+ out_buf = malloc(outsize);
+ if ( out_buf == NULL )
+ {
+ DOMPRINTF("%s: Failed to alloc memory", what);
+ goto lzma_cleanup;
+ }
+
+ stream->next_in = dom->kernel_blob;
+ stream->avail_in = dom->kernel_size;
+
+ stream->next_out = out_buf;
+ stream->avail_out = dom->kernel_size;
+
+ for ( ; ; )
+ {
+ ret = lzma_code(stream, action);
+ if ( ret == LZMA_STREAM_END )
+ {
+ DOMPRINTF("%s: Saw data stream end", what);
+ retval = 0;
+ break;
+ }
+ if ( ret != LZMA_OK )
+ {
+ switch ( ret )
+ {
+ case LZMA_MEM_ERROR:
+ msg = strerror(ENOMEM);
+ break;
+
+ case LZMA_MEMLIMIT_ERROR:
+ msg = "Memory usage limit reached";
+ break;
+
+ case LZMA_FORMAT_ERROR:
+ msg = "File format not recognized";
+ break;
+
+ case LZMA_OPTIONS_ERROR:
+            /* FIXME: Better message? */
+ msg = "Unsupported compression options";
+ break;
+
+ case LZMA_DATA_ERROR:
+ msg = "File is corrupt";
+ break;
+
+ case LZMA_BUF_ERROR:
+ msg = "Unexpected end of input";
+ break;
+
+ default:
+ msg = "Internal program error (bug)";
+ break;
+ }
+ DOMPRINTF("%s: %s decompression error: %s",
+ __FUNCTION__, what, msg);
+ free(out_buf);
+ goto lzma_cleanup;
+ }
+
+ if ( stream->avail_out == 0 )
+ {
+ /* Protect against output buffer overflow */
+ if ( outsize > SIZE_MAX / 2 )
+ {
+ DOMPRINTF("%s: output buffer overflow", what);
+ free(out_buf);
+ goto lzma_cleanup;
+ }
+
+ if ( xc_dom_kernel_check_size(dom, outsize * 2) )
+ {
+ DOMPRINTF("%s: output too large", what);
+ free(out_buf);
+ goto lzma_cleanup;
+ }
+
+ tmp_buf = realloc(out_buf, outsize * 2);
+ if ( tmp_buf == NULL )
+ {
+ DOMPRINTF("%s: Failed to realloc memory", what);
+ free(out_buf);
+ goto lzma_cleanup;
+ }
+ out_buf = tmp_buf;
+
+ stream->next_out = out_buf + outsize;
+ stream->avail_out = (outsize * 2) - outsize;
+ outsize *= 2;
+ }
+ }
+
+ if ( xc_dom_register_external(dom, out_buf, stream->total_out) )
+ {
+ DOMPRINTF("%s: Error registering stream output", what);
+ free(out_buf);
+ goto lzma_cleanup;
+ }
+
+ DOMPRINTF("%s: %s decompress OK, 0x%zx -> 0x%zx",
+ __FUNCTION__, what, *size, (size_t)stream->total_out);
+
+ *blob = out_buf;
+ *size = stream->total_out;
+
+ lzma_cleanup:
+ lzma_end(stream);
+
+ return retval;
+}
+
+/* 128 Mb is the minimum size (half-way) documented to work for all inputs. */
+#define LZMA_BLOCK_SIZE (128*1024*1024)
+
+static int xc_try_xz_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ lzma_stream stream = LZMA_STREAM_INIT;
+
+ if ( lzma_stream_decoder(&stream, LZMA_BLOCK_SIZE, 0) != LZMA_OK )
+ {
+ DOMPRINTF("XZ: Failed to init decoder");
+ return -1;
+ }
+
+ return _xc_try_lzma_decode(dom, blob, size, &stream, "XZ");
+}
+
+static int xc_try_lzma_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ lzma_stream stream = LZMA_STREAM_INIT;
+
+ if ( lzma_alone_decoder(&stream, LZMA_BLOCK_SIZE) != LZMA_OK )
+ {
+ DOMPRINTF("LZMA: Failed to init decoder");
+ return -1;
+ }
+
+ return _xc_try_lzma_decode(dom, blob, size, &stream, "LZMA");
+}
+
+#else /* !defined(HAVE_LZMA) */
+
+static int xc_try_xz_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: XZ decompress support unavailable",
+ __FUNCTION__);
+ return -1;
+}
+
+static int xc_try_lzma_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: LZMA decompress support unavailable",
+ __FUNCTION__);
+ return -1;
+}
+
+#endif
+
+#if defined(HAVE_LZO1X)
+
+#include <lzo/lzo1x.h>
+
+#define LZOP_HEADER_HAS_FILTER 0x00000800
+#define LZOP_MAX_BLOCK_SIZE (64*1024*1024)
+
+static inline uint_fast16_t lzo_read_16(const unsigned char *buf)
+{
+ return buf[1] | (buf[0] << 8);
+}
+
+static inline uint_fast32_t lzo_read_32(const unsigned char *buf)
+{
+ return lzo_read_16(buf + 2) | ((uint32_t)lzo_read_16(buf) << 16);
+}
+
+static int xc_try_lzo1x_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ int ret;
+ const unsigned char *cur = dom->kernel_blob;
+ unsigned char *out_buf = NULL;
+ size_t left = dom->kernel_size;
+ const char *msg;
+ unsigned version;
+ static const unsigned char magic[] = {
+ 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a
+ };
+
+ /*
+ * lzo_uint should match size_t. Check that this is the case to be
+ * sure we won't overflow various lzo_uint fields.
+ */
+ BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t));
+
+ ret = lzo_init();
+ if ( ret != LZO_E_OK )
+ {
+ DOMPRINTF("LZO1x: Failed to init library (%d)\n", ret);
+ return -1;
+ }
+
+ if ( left < 16 || memcmp(cur, magic, 9) )
+ {
+ DOMPRINTF("LZO1x: Unrecognized magic\n");
+ return -1;
+ }
+
+ /* get version (2bytes), skip library version (2),
+ * 'need to be extracted' version (2) and method (1) */
+ version = lzo_read_16(cur + 9);
+ cur += 16;
+ left -= 16;
+
+ if ( version >= 0x0940 )
+ {
+ /* skip level */
+ ++cur;
+ if ( left )
+ --left;
+ }
+
+ if ( left >= 4 && (lzo_read_32(cur) & LZOP_HEADER_HAS_FILTER) )
+ ret = 8; /* flags + filter info */
+ else
+ ret = 4; /* flags */
+
+ /* skip mode and mtime_low */
+ ret += 8;
+ if ( version >= 0x0940 )
+ ret += 4; /* skip mtime_high */
+
+ /* don't care about the file name, and skip checksum */
+ if ( left > ret )
+ ret += 1 + cur[ret] + 4;
+
+ if ( left < ret )
+ {
+ DOMPRINTF("LZO1x: Incomplete header\n");
+ return -1;
+ }
+ cur += ret;
+ left -= ret;
+
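+    /*
+     * Each lzop block starts with a 4 byte big-endian uncompressed length
+     * (zero terminates the stream), followed by a 4 byte compressed length
+     * and a 4 byte block checksum, then the compressed data itself.
+     */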
+ for ( *size = 0; ; )
+ {
+ lzo_uint src_len, dst_len, out_len;
+ unsigned char *tmp_buf;
+
+ msg = "Short input";
+ if ( left < 4 )
+ break;
+
+ dst_len = lzo_read_32(cur);
+ if ( !dst_len )
+ {
+ msg = "Error registering stream output";
+ if ( xc_dom_register_external(dom, out_buf, *size) )
+ break;
+
+ return 0;
+ }
+
+ if ( dst_len > LZOP_MAX_BLOCK_SIZE )
+ {
+ msg = "Block size too large";
+ break;
+ }
+
+ if ( left < 12 )
+ break;
+
+ src_len = lzo_read_32(cur + 4);
+ cur += 12; /* also skip block checksum info */
+ left -= 12;
+
+ msg = "Bad source length";
+ if ( src_len <= 0 || src_len > dst_len || src_len > left )
+ break;
+
+ msg = "Output buffer overflow";
+ if ( *size > SIZE_MAX - dst_len )
+ break;
+
+ msg = "Decompressed image too large";
+ if ( xc_dom_kernel_check_size(dom, *size + dst_len) )
+ break;
+
+ msg = "Failed to (re)alloc memory";
+ tmp_buf = realloc(out_buf, *size + dst_len);
+ if ( tmp_buf == NULL )
+ break;
+
+ out_buf = tmp_buf;
+ out_len = dst_len;
+
+ ret = lzo1x_decompress_safe(cur, src_len,
+ out_buf + *size, &out_len, NULL);
+ switch ( ret )
+ {
+ case LZO_E_OK:
+ msg = "Input underrun";
+ if ( out_len != dst_len )
+ break;
+
+ *blob = out_buf;
+ *size += out_len;
+ cur += src_len;
+ left -= src_len;
+ continue;
+
+ case LZO_E_INPUT_NOT_CONSUMED:
+ msg = "Unconsumed input";
+ break;
+
+ case LZO_E_OUTPUT_OVERRUN:
+ msg = "Output overrun";
+ break;
+
+ case LZO_E_INPUT_OVERRUN:
+ msg = "Input overrun";
+ break;
+
+ case LZO_E_LOOKBEHIND_OVERRUN:
+ msg = "Look-behind overrun";
+ break;
+
+ case LZO_E_EOF_NOT_FOUND:
+ msg = "No EOF marker";
+ break;
+
+ case LZO_E_ERROR:
+ msg = "General error";
+ break;
+
+ default:
+ msg = "Internal program error (bug)";
+ break;
+ }
+
+ break;
+ }
+
+ free(out_buf);
+ DOMPRINTF("LZO1x decompression error: %s\n", msg);
+
+ return -1;
+}
+
+#else /* !defined(HAVE_LZO1X) */
+
+static int xc_try_lzo1x_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: LZO1x decompress support unavailable\n",
+ __FUNCTION__);
+ return -1;
+}
+
+#endif
+
+#else /* __MINIOS__ */
+
+int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size);
+int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size);
+int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size);
+int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size);
+
+#endif /* !__MINIOS__ */
+
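+/*
+ * Layout of the Linux boot protocol's setup header; only the fields needed
+ * to locate the compressed payload are named, everything else is padding.
+ */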
+struct setup_header {
+ uint8_t _pad0[0x1f1]; /* skip uninteresting stuff */
+ uint8_t setup_sects;
+ uint16_t root_flags;
+ uint32_t syssize;
+ uint16_t ram_size;
+ uint16_t vid_mode;
+ uint16_t root_dev;
+ uint16_t boot_flag;
+ uint16_t jump;
+ uint32_t header;
+#define HDR_MAGIC "HdrS"
+#define HDR_MAGIC_SZ 4
+ uint16_t version;
+#define VERSION(h,l) (((h)<<8) | (l))
+ uint32_t realmode_swtch;
+ uint16_t start_sys;
+ uint16_t kernel_version;
+ uint8_t type_of_loader;
+ uint8_t loadflags;
+ uint16_t setup_move_size;
+ uint32_t code32_start;
+ uint32_t ramdisk_image;
+ uint32_t ramdisk_size;
+ uint32_t bootsect_kludge;
+ uint16_t heap_end_ptr;
+ uint16_t _pad1;
+ uint32_t cmd_line_ptr;
+ uint32_t initrd_addr_max;
+ uint32_t kernel_alignment;
+ uint8_t relocatable_kernel;
+ uint8_t _pad2[3];
+ uint32_t cmdline_size;
+ uint32_t hardware_subarch;
+ uint64_t hardware_subarch_data;
+ uint32_t payload_offset;
+ uint32_t payload_length;
+} __attribute__((packed));
+
+extern struct xc_dom_loader elf_loader;
+
+static int check_magic(struct xc_dom_image *dom, const void *magic, size_t len)
+{
+ if (len > dom->kernel_size)
+ return 0;
+
+ return (memcmp(dom->kernel_blob, magic, len) == 0);
+}
+
+static int xc_dom_probe_bzimage_kernel(struct xc_dom_image *dom)
+{
+ struct setup_header *hdr;
+ uint64_t payload_offset, payload_length;
+ int ret;
+
+ if ( dom->kernel_blob == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: no kernel image loaded", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( dom->kernel_size < sizeof(struct setup_header) )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ hdr = dom->kernel_blob;
+
+ if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 )
+ {
+ xc_dom_printf(dom->xch, "%s: kernel is not a bzImage", __FUNCTION__);
+ return -EINVAL;
+ }
+
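+    /*
+     * The payload_offset/payload_length fields used below were introduced
+     * with boot protocol 2.08, so refuse anything older.
+     */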
+ if ( hdr->version < VERSION(2,8) )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: boot protocol"
+ " too old (%04x)", __FUNCTION__, hdr->version);
+ return -EINVAL;
+ }
+
+    /*
+     * Upcast to 64 bits to avoid overflow.  setup_sects is a u8, so the
+     * multiplication below cannot overflow.
+     */
+ payload_offset = (hdr->setup_sects + 1) * 512;
+ payload_offset += hdr->payload_offset;
+ payload_length = hdr->payload_length;
+
+ if ( payload_offset >= dom->kernel_size )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload offset overflow",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ if ( (payload_offset + payload_length) > dom->kernel_size )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload length overflow",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ dom->kernel_blob = dom->kernel_blob + payload_offset;
+ dom->kernel_size = payload_length;
+
+ if ( check_magic(dom, "\037\213", 2) )
+ {
+ ret = xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret == -1 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: unable to"
+ " gzip decompress kernel", __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else if ( check_magic(dom, "\102\132\150", 3) )
+ {
+ ret = xc_try_bzip2_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s unable to BZIP2 decompress kernel",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else if ( check_magic(dom, "\3757zXZ", 6) )
+ {
+ ret = xc_try_xz_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s unable to XZ decompress kernel",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else if ( check_magic(dom, "\135\000", 2) )
+ {
+ ret = xc_try_lzma_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s unable to LZMA decompress kernel",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else if ( check_magic(dom, "\x89LZO", 5) )
+ {
+ ret = xc_try_lzo1x_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s unable to LZO decompress kernel\n",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else if ( check_magic(dom, "\x02\x21", 2) )
+ {
+ ret = xc_try_lz4_decode(dom, &dom->kernel_blob, &dom->kernel_size);
+ if ( ret < 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s unable to LZ4 decompress kernel\n",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+ else
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: unknown compression format", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ return elf_loader.probe(dom);
+}
+
+static int xc_dom_parse_bzimage_kernel(struct xc_dom_image *dom)
+{
+ return elf_loader.parser(dom);
+}
+
+static int xc_dom_load_bzimage_kernel(struct xc_dom_image *dom)
+{
+ return elf_loader.loader(dom);
+}
+
+static struct xc_dom_loader bzimage_loader = {
+ .name = "Linux bzImage",
+ .probe = xc_dom_probe_bzimage_kernel,
+ .parser = xc_dom_parse_bzimage_kernel,
+ .loader = xc_dom_load_bzimage_kernel,
+};
+
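+/*
+ * __init expands to a constructor attribute in the dom headers, so the
+ * bzImage loader registers itself when the library is loaded.
+ */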
+static void __init register_loader(void)
+{
+ xc_dom_register_loader(&bzimage_loader);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- compatibility code.
+ *
+ * Replacements for xc_linux_build & friends,
+ * as example code and to make the new builder
+ * usable as drop-in replacement.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <zlib.h>
+
+#include "xenctrl.h"
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+
+/* ------------------------------------------------------------------------ */
+
+int xc_linux_build(xc_interface *xch, uint32_t domid,
+ unsigned int mem_mb,
+ const char *image_name,
+ const char *initrd_name,
+ const char *cmdline,
+ const char *features,
+ unsigned long flags,
+ unsigned int store_evtchn,
+ unsigned long *store_mfn,
+ unsigned int console_evtchn,
+ unsigned long *console_mfn)
+{
+ struct xc_dom_image *dom;
+ int rc;
+
+ xc_dom_loginit(xch);
+ dom = xc_dom_allocate(xch, cmdline, features);
+ if (dom == NULL)
+ return -1;
+ if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
+ goto out;
+ if ( initrd_name && strlen(initrd_name) &&
+ ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) )
+ goto out;
+
+ dom->flags |= flags;
+ dom->console_evtchn = console_evtchn;
+ dom->xenstore_evtchn = store_evtchn;
+
+ if ( (rc = xc_dom_boot_xen_init(dom, xch, domid)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_parse_image(dom)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_mem_init(dom, mem_mb)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_boot_mem_init(dom)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_build_image(dom)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_boot_image(dom)) != 0 )
+ goto out;
+ if ( (rc = xc_dom_gnttab_init(dom)) != 0)
+ goto out;
+
+ *console_mfn = xc_dom_p2m(dom, dom->console_pfn);
+ *store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
+
+ out:
+ xc_dom_release(dom);
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- core bits.
+ *
+ * The core code goes here:
+ * - allocate and release domain structs.
+ * - memory management functions.
+ * - misc helper functions.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <zlib.h>
+#include <assert.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "_paths.h"
+
+/* ------------------------------------------------------------------------ */
+/* debugging */
+
+static const char *default_logfile = XEN_LOG_DIR "/domain-builder-ng.log";
+
+int xc_dom_loginit(xc_interface *xch) {
+ if (xch->dombuild_logger) return 0;
+
+ if (!xch->dombuild_logger_file) {
+ xch->dombuild_logger_file = fopen(default_logfile, "a");
+ if (!xch->dombuild_logger_file) {
+ PERROR("Could not open logfile `%s'", default_logfile);
+ return -1;
+ }
+ }
+
+ xch->dombuild_logger = xch->dombuild_logger_tofree =
+ (xentoollog_logger*)
+ xtl_createlogger_stdiostream(xch->dombuild_logger_file, XTL_DETAIL,
+ XTL_STDIOSTREAM_SHOW_DATE|XTL_STDIOSTREAM_SHOW_PID);
+ if (!xch->dombuild_logger)
+ return -1;
+
+ xc_dom_printf(xch, "### ----- xc domain builder logfile opened -----");
+
+ return 0;
+}
+
+void xc_dom_printf(xc_interface *xch, const char *fmt, ...)
+{
+ va_list args;
+ if (!xch->dombuild_logger) return;
+ va_start(args, fmt);
+ xtl_logv(xch->dombuild_logger, XTL_DETAIL, -1, "domainbuilder", fmt, args);
+ va_end(args);
+}
+
+void xc_dom_panic_func(xc_interface *xch,
+ const char *file, int line, xc_error_code err,
+ const char *fmt, ...)
+{
+ va_list args;
+ char msg[XC_MAX_ERROR_MSG_LEN];
+
+ va_start(args, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, args);
+ va_end(args);
+ msg[sizeof(msg)-1] = 0;
+
+ xc_report(xch,
+ xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler,
+ XTL_ERROR, err, "panic: %s:%d: %s",
+ file, line, msg);
+}
+
+static void print_mem(struct xc_dom_image *dom, const char *name, size_t mem)
+{
+ if ( mem > (32 * 1024 * 1024) )
+ DOMPRINTF("%-24s : %zd MB", name, mem / (1024 * 1024));
+ else if ( mem > (32 * 1024) )
+ DOMPRINTF("%-24s : %zd kB", name, mem / 1024);
+ else
+ DOMPRINTF("%-24s : %zd bytes", name, mem);
+}
+
+void xc_dom_log_memory_footprint(struct xc_dom_image *dom)
+{
+ DOMPRINTF("domain builder memory footprint");
+ DOMPRINTF(" allocated");
+ print_mem(dom, " malloc", dom->alloc_malloc);
+ print_mem(dom, " anon mmap", dom->alloc_mem_map);
+ DOMPRINTF(" mapped");
+ print_mem(dom, " file mmap", dom->alloc_file_map);
+ print_mem(dom, " domU mmap", dom->alloc_domU_map);
+}
+
+/* ------------------------------------------------------------------------ */
+/* simple memory pool */
+
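+/*
+ * All allocations are chained on dom->memblocks so that xc_dom_free_all()
+ * can release everything in one sweep when the image is torn down.
+ */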
+void *xc_dom_malloc(struct xc_dom_image *dom, size_t size)
+{
+ struct xc_dom_mem *block;
+
+ if ( size > SIZE_MAX - sizeof(*block) )
+ {
+ DOMPRINTF("%s: unreasonable allocation size", __FUNCTION__);
+ return NULL;
+ }
+ block = malloc(sizeof(*block) + size);
+ if ( block == NULL )
+ {
+ DOMPRINTF("%s: allocation failed", __FUNCTION__);
+ return NULL;
+ }
+ memset(block, 0, sizeof(*block) + size);
+ block->type = XC_DOM_MEM_TYPE_MALLOC_INTERNAL;
+ block->next = dom->memblocks;
+ dom->memblocks = block;
+ dom->alloc_malloc += sizeof(*block) + size;
+ if ( size > (100 * 1024) )
+ print_mem(dom, __FUNCTION__, size);
+ return block->memory;
+}
+
+void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size)
+{
+ struct xc_dom_mem *block;
+
+ block = malloc(sizeof(*block));
+ if ( block == NULL )
+ {
+ DOMPRINTF("%s: allocation failed", __FUNCTION__);
+ return NULL;
+ }
+ memset(block, 0, sizeof(*block));
+ block->len = size;
+ block->ptr = mmap(NULL, block->len,
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+ -1, 0);
+ if ( block->ptr == MAP_FAILED )
+ {
+ DOMPRINTF("%s: mmap failed", __FUNCTION__);
+ free(block);
+ return NULL;
+ }
+ block->type = XC_DOM_MEM_TYPE_MMAP;
+ block->next = dom->memblocks;
+ dom->memblocks = block;
+ dom->alloc_malloc += sizeof(*block);
+ dom->alloc_mem_map += block->len;
+ if ( size > (100 * 1024) )
+ print_mem(dom, __FUNCTION__, size);
+ return block->ptr;
+}
+
+int xc_dom_register_external(struct xc_dom_image *dom, void *ptr, size_t size)
+{
+ struct xc_dom_mem *block;
+
+ block = malloc(sizeof(*block));
+ if ( block == NULL )
+ {
+ DOMPRINTF("%s: allocation failed", __FUNCTION__);
+ return -1;
+ }
+ memset(block, 0, sizeof(*block));
+ block->ptr = ptr;
+ block->len = size;
+ block->type = XC_DOM_MEM_TYPE_MALLOC_EXTERNAL;
+ block->next = dom->memblocks;
+ dom->memblocks = block;
+ dom->alloc_malloc += sizeof(*block);
+ dom->alloc_mem_map += block->len;
+ return 0;
+}
+
+void *xc_dom_malloc_filemap(struct xc_dom_image *dom,
+ const char *filename, size_t * size,
+ const size_t max_size)
+{
+ struct xc_dom_mem *block = NULL;
+ int fd = -1;
+ off_t offset;
+
+ fd = open(filename, O_RDONLY);
+ if ( fd == -1 ) {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "failed to open file '%s': %s",
+ filename, strerror(errno));
+ goto err;
+ }
+
+ if ( (lseek(fd, 0, SEEK_SET) == -1) ||
+ ((offset = lseek(fd, 0, SEEK_END)) == -1) ) {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "failed to seek on file '%s': %s",
+ filename, strerror(errno));
+ goto err;
+ }
+
+ *size = offset;
+
+ if ( max_size && *size > max_size )
+ {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "tried to map file which is too large");
+ goto err;
+ }
+
+ if ( !*size )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "'%s': zero length file", filename);
+ goto err;
+ }
+
+ block = malloc(sizeof(*block));
+ if ( block == NULL ) {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "failed to allocate block (%zu bytes)",
+ sizeof(*block));
+ goto err;
+ }
+
+ memset(block, 0, sizeof(*block));
+ block->len = *size;
+ block->ptr = mmap(NULL, block->len, PROT_READ,
+ MAP_SHARED, fd, 0);
+ if ( block->ptr == MAP_FAILED ) {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "failed to mmap file '%s': %s",
+ filename, strerror(errno));
+ goto err;
+ }
+
+ block->type = XC_DOM_MEM_TYPE_MMAP;
+ block->next = dom->memblocks;
+ dom->memblocks = block;
+ dom->alloc_malloc += sizeof(*block);
+ dom->alloc_file_map += block->len;
+ close(fd);
+ if ( *size > (100 * 1024) )
+ print_mem(dom, __FUNCTION__, *size);
+ return block->ptr;
+
+ err:
+ if ( fd != -1 )
+ close(fd);
+ free(block);
+ DOMPRINTF("%s: failed (on file `%s')", __FUNCTION__, filename);
+ return NULL;
+}
+
+static void xc_dom_free_all(struct xc_dom_image *dom)
+{
+ struct xc_dom_mem *block;
+
+ while ( (block = dom->memblocks) != NULL )
+ {
+ dom->memblocks = block->next;
+ switch ( block->type )
+ {
+ case XC_DOM_MEM_TYPE_MALLOC_INTERNAL:
+ break;
+ case XC_DOM_MEM_TYPE_MALLOC_EXTERNAL:
+ free(block->ptr);
+ break;
+ case XC_DOM_MEM_TYPE_MMAP:
+ munmap(block->ptr, block->len);
+ break;
+ }
+ free(block);
+ }
+}
+
+char *xc_dom_strdup(struct xc_dom_image *dom, const char *str)
+{
+ size_t len = strlen(str) + 1;
+ char *nstr = xc_dom_malloc(dom, len);
+
+ if ( nstr == NULL )
+ return NULL;
+ memcpy(nstr, str, len);
+ return nstr;
+}
+
+/* ------------------------------------------------------------------------ */
+/* decompression buffer sizing */
+int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz)
+{
+ /* No limit */
+ if ( !dom->max_kernel_size )
+ return 0;
+
+ if ( sz > dom->max_kernel_size )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "kernel image too large");
+ return 1;
+ }
+
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+/* read files, copy memory blocks, with transparent gunzip */
+
+size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen)
+{
+ unsigned char *gzlen;
+ size_t unziplen;
+
+ if ( ziplen < 6 )
+ /* Too small. We need (i.e. the subsequent code relies on)
+ * 2 bytes for the magic number plus 4 bytes length. */
+ return 0;
+
+ if ( strncmp(blob, "\037\213", 2) )
+ /* not gzipped */
+ return 0;
+
+ gzlen = blob + ziplen - 4;
+ unziplen = (size_t)gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0];
+ if ( unziplen > XC_DOM_DECOMPRESS_MAX )
+ {
+        xc_dom_printf(xch,
+                      "%s: size (zip %zd, unzip %zd) looks insane, skip gunzip",
+                      __FUNCTION__, ziplen, unziplen);
+ return 0;
+ }
+
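+    /* Slightly over-estimate to give the decompressor some headroom. */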
+ return unziplen + 16;
+}
+
+int xc_dom_do_gunzip(xc_interface *xch,
+ void *src, size_t srclen, void *dst, size_t dstlen)
+{
+ z_stream zStream;
+ int rc;
+
+ memset(&zStream, 0, sizeof(zStream));
+ zStream.next_in = src;
+ zStream.avail_in = srclen;
+ zStream.next_out = dst;
+ zStream.avail_out = dstlen;
+ rc = inflateInit2(&zStream, (MAX_WBITS + 32)); /* +32 means "handle gzip" */
+ if ( rc != Z_OK )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: inflateInit2 failed (rc=%d)", __FUNCTION__, rc);
+ return -1;
+ }
+ rc = inflate(&zStream, Z_FINISH);
+ inflateEnd(&zStream);
+ if ( rc != Z_STREAM_END )
+ {
+ xc_dom_panic(xch, XC_INTERNAL_ERROR,
+ "%s: inflate failed (rc=%d)", __FUNCTION__, rc);
+ return -1;
+ }
+
+ xc_dom_printf(xch, "%s: unzip ok, 0x%zx -> 0x%zx",
+ __FUNCTION__, srclen, dstlen);
+ return 0;
+}
+
+int xc_dom_try_gunzip(struct xc_dom_image *dom, void **blob, size_t * size)
+{
+ void *unzip;
+ size_t unziplen;
+
+ unziplen = xc_dom_check_gzip(dom->xch, *blob, *size);
+ if ( unziplen == 0 )
+ return 0;
+
+ if ( xc_dom_kernel_check_size(dom, unziplen) )
+ return 0;
+
+ unzip = xc_dom_malloc(dom, unziplen);
+ if ( unzip == NULL )
+ return -1;
+
+ if ( xc_dom_do_gunzip(dom->xch, *blob, *size, unzip, unziplen) == -1 )
+ return -1;
+
+ *blob = unzip;
+ *size = unziplen;
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+/* domain memory */
+
+void *xc_dom_pfn_to_ptr(struct xc_dom_image *dom, xen_pfn_t pfn,
+ xen_pfn_t count)
+{
+ xen_pfn_t count_out_dummy;
+ return xc_dom_pfn_to_ptr_retcount(dom, pfn, count, &count_out_dummy);
+}
+
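+/*
+ * Look up (or create) a mapping for the range [pfn, pfn + count).  With a
+ * non-zero count the range must either sit entirely inside one existing
+ * block or not overlap any block at all, in which case a new block is
+ * allocated (mapped from the domU if one exists, anonymous memory
+ * otherwise).  With count == 0 an existing block is merely looked up.
+ */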
+void *xc_dom_pfn_to_ptr_retcount(struct xc_dom_image *dom, xen_pfn_t pfn,
+ xen_pfn_t count, xen_pfn_t *count_out)
+{
+ struct xc_dom_phys *phys;
+ xen_pfn_t offset;
+ unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom);
+ char *mode = "unset";
+
+ *count_out = 0;
+
+ offset = pfn - dom->rambase_pfn;
+ if ( offset > dom->total_pages || /* multiple checks to avoid overflows */
+ count > dom->total_pages ||
+ offset > dom->total_pages - count )
+ {
+ DOMPRINTF("%s: pfn %"PRI_xen_pfn" out of range (0x%" PRIpfn " > 0x%" PRIpfn ")",
+ __FUNCTION__, pfn, offset, dom->total_pages);
+ return NULL;
+ }
+
+ /* already allocated? */
+ for ( phys = dom->phys_pages; phys != NULL; phys = phys->next )
+ {
+ if ( pfn >= (phys->first + phys->count) )
+ continue;
+ if ( count )
+ {
+ /* size given: must be completely within the already allocated block */
+ if ( (pfn + count) <= phys->first )
+ continue;
+ if ( (pfn < phys->first) ||
+ ((pfn + count) > (phys->first + phys->count)) )
+ {
+ DOMPRINTF("%s: request overlaps allocated block"
+ " (req 0x%" PRIpfn "+0x%" PRIpfn ","
+ " blk 0x%" PRIpfn "+0x%" PRIpfn ")",
+ __FUNCTION__, pfn, count, phys->first,
+ phys->count);
+ return NULL;
+ }
+ *count_out = count;
+ }
+ else
+ {
+ /* no size given: block must be allocated already,
+ just hand out a pointer to it */
+ if ( pfn < phys->first )
+ continue;
+ if ( pfn >= phys->first + phys->count )
+ continue;
+ *count_out = phys->count - (pfn - phys->first);
+ }
+ return phys->ptr + ((pfn - phys->first) << page_shift);
+ }
+
+ /* allocating is allowed with size specified only */
+ if ( count == 0 )
+ {
+ DOMPRINTF("%s: no block found, no size given,"
+ " can't malloc (pfn 0x%" PRIpfn ")",
+ __FUNCTION__, pfn);
+ return NULL;
+ }
+
+ /* not found, no overlap => allocate */
+ phys = xc_dom_malloc(dom, sizeof(*phys));
+ if ( phys == NULL )
+ return NULL;
+ memset(phys, 0, sizeof(*phys));
+ phys->first = pfn;
+ phys->count = count;
+
+ if ( dom->guest_domid )
+ {
+ mode = "domU mapping";
+ phys->ptr = xc_dom_boot_domU_map(dom, phys->first, phys->count);
+ if ( phys->ptr == NULL )
+ return NULL;
+ dom->alloc_domU_map += phys->count << page_shift;
+ }
+ else
+ {
+ int err;
+
+ mode = "anonymous memory";
+ phys->ptr = mmap(NULL, phys->count << page_shift,
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+ -1, 0);
+ if ( phys->ptr == MAP_FAILED )
+ {
+ err = errno;
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "%s: oom: can't allocate 0x%" PRIpfn " pages"
+ " [mmap, errno=%i (%s)]",
+ __FUNCTION__, count, err, strerror(err));
+ return NULL;
+ }
+ dom->alloc_mem_map += phys->count << page_shift;
+ }
+
+    DOMPRINTF("%s: %s: pfn 0x%" PRIpfn "+0x%" PRIpfn " at %p",
+              __FUNCTION__, mode, phys->first, phys->count, phys->ptr);
+ phys->next = dom->phys_pages;
+ dom->phys_pages = phys;
+ return phys->ptr;
+}
+
+static int xc_dom_chk_alloc_pages(struct xc_dom_image *dom, char *name,
+ xen_pfn_t pages)
+{
+ unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
+
+ if ( pages > dom->total_pages || /* multiple test avoids overflow probs */
+ dom->pfn_alloc_end - dom->rambase_pfn > dom->total_pages ||
+ pages > dom->total_pages - dom->pfn_alloc_end + dom->rambase_pfn )
+ {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "%s: segment %s too large (0x%"PRIpfn" > "
+ "0x%"PRIpfn" - 0x%"PRIpfn" pages)", __FUNCTION__, name,
+ pages, dom->total_pages,
+ dom->pfn_alloc_end - dom->rambase_pfn);
+ return -1;
+ }
+
+ dom->pfn_alloc_end += pages;
+ dom->virt_alloc_end += pages * page_size;
+
+ if ( dom->allocate )
+ dom->allocate(dom);
+
+ return 0;
+}
+
+static int xc_dom_alloc_pad(struct xc_dom_image *dom, xen_vaddr_t boundary)
+{
+ unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
+ xen_pfn_t pages;
+
+ if ( boundary & (page_size - 1) )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: segment boundary isn't page aligned (0x%" PRIx64 ")",
+ __FUNCTION__, boundary);
+ return -1;
+ }
+ if ( boundary < dom->virt_alloc_end )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: segment boundary too low (0x%" PRIx64 " < 0x%" PRIx64
+ ")", __FUNCTION__, boundary, dom->virt_alloc_end);
+ return -1;
+ }
+ pages = (boundary - dom->virt_alloc_end) / page_size;
+
+ return xc_dom_chk_alloc_pages(dom, "padding", pages);
+}
+
+int xc_dom_alloc_segment(struct xc_dom_image *dom,
+ struct xc_dom_seg *seg, char *name,
+ xen_vaddr_t start, xen_vaddr_t size)
+{
+ unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
+ xen_pfn_t pages;
+ void *ptr;
+
+ if ( start && xc_dom_alloc_pad(dom, start) )
+ return -1;
+
+ pages = (size + page_size - 1) / page_size;
+ start = dom->virt_alloc_end;
+
+ seg->pfn = dom->pfn_alloc_end;
+ seg->pages = pages;
+
+ if ( xc_dom_chk_alloc_pages(dom, name, pages) )
+ return -1;
+
+ /* map and clear pages */
+ ptr = xc_dom_seg_to_ptr(dom, seg);
+ if ( ptr == NULL )
+ return -1;
+ memset(ptr, 0, pages * page_size);
+
+ seg->vstart = start;
+ seg->vend = dom->virt_alloc_end;
+
+ DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " -> 0x%" PRIx64
+ " (pfn 0x%" PRIpfn " + 0x%" PRIpfn " pages)",
+ __FUNCTION__, name, seg->vstart, seg->vend, seg->pfn, pages);
+
+ return 0;
+}
+
+xen_pfn_t xc_dom_alloc_page(struct xc_dom_image *dom, char *name)
+{
+ xen_vaddr_t start;
+ xen_pfn_t pfn;
+
+ start = dom->virt_alloc_end;
+ pfn = dom->pfn_alloc_end - dom->rambase_pfn;
+
+ if ( xc_dom_chk_alloc_pages(dom, name, 1) )
+ return INVALID_PFN;
+
+ DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " (pfn 0x%" PRIpfn ")",
+ __FUNCTION__, name, start, pfn);
+ return pfn;
+}
+
+void xc_dom_unmap_one(struct xc_dom_image *dom, xen_pfn_t pfn)
+{
+ unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom);
+ struct xc_dom_phys *phys, *prev = NULL;
+
+ for ( phys = dom->phys_pages; phys != NULL; phys = phys->next )
+ {
+ if ( (pfn >= phys->first) && (pfn < (phys->first + phys->count)) )
+ break;
+ prev = phys;
+ }
+ if ( !phys )
+ {
+ DOMPRINTF("%s: Huh? no mapping with pfn 0x%" PRIpfn "",
+ __FUNCTION__, pfn);
+ return;
+ }
+
+ munmap(phys->ptr, phys->count << page_shift);
+ if ( prev )
+ prev->next = phys->next;
+ else
+ dom->phys_pages = phys->next;
+
+ xc_domain_cacheflush(dom->xch, dom->guest_domid, phys->first, phys->count);
+}
+
+void xc_dom_unmap_all(struct xc_dom_image *dom)
+{
+ while ( dom->phys_pages )
+ xc_dom_unmap_one(dom, dom->phys_pages->first);
+}
+
+/* ------------------------------------------------------------------------ */
+/* pluggable kernel loaders */
+
+static struct xc_dom_loader *first_loader = NULL;
+static struct xc_dom_arch *first_hook = NULL;
+
+void xc_dom_register_loader(struct xc_dom_loader *loader)
+{
+ loader->next = first_loader;
+ first_loader = loader;
+}
+
+static struct xc_dom_loader *xc_dom_find_loader(struct xc_dom_image *dom)
+{
+ struct xc_dom_loader *loader = first_loader;
+
+ while ( loader != NULL )
+ {
+ DOMPRINTF("%s: trying %s loader ... ", __FUNCTION__, loader->name);
+ if ( loader->probe(dom) == 0 )
+ {
+ DOMPRINTF("loader probe OK");
+ return loader;
+ }
+ DOMPRINTF("loader probe failed");
+ loader = loader->next;
+ }
+ xc_dom_panic(dom->xch,
+ XC_INVALID_KERNEL, "%s: no loader found", __FUNCTION__);
+ return NULL;
+}
+
+void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks)
+{
+ hooks->next = first_hook;
+ first_hook = hooks;
+}
+
+int xc_dom_set_arch_hooks(struct xc_dom_image *dom)
+{
+ struct xc_dom_arch *hooks = first_hook;
+
+ while ( hooks != NULL )
+ {
+ if ( !strcmp(hooks->guest_type, dom->guest_type) )
+ {
+ if ( hooks->arch_private_size )
+ {
+ dom->arch_private = malloc(hooks->arch_private_size);
+ if ( dom->arch_private == NULL )
+ return -1;
+ memset(dom->arch_private, 0, hooks->arch_private_size);
+ dom->alloc_malloc += hooks->arch_private_size;
+ }
+ dom->arch_hooks = hooks;
+ return 0;
+ }
+ hooks = hooks->next;
+ }
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: not found (type %s)", __FUNCTION__, dom->guest_type);
+ return -1;
+}
+
+/* ------------------------------------------------------------------------ */
+/* public interface */
+
+void xc_dom_release(struct xc_dom_image *dom)
+{
+ DOMPRINTF_CALLED(dom->xch);
+ if ( dom->phys_pages )
+ xc_dom_unmap_all(dom);
+ xc_dom_free_all(dom);
+ free(dom->arch_private);
+ free(dom);
+}
+
+struct xc_dom_image *xc_dom_allocate(xc_interface *xch,
+ const char *cmdline, const char *features)
+{
+ struct xc_dom_image *dom;
+
+ xc_dom_printf(xch, "%s: cmdline=\"%s\", features=\"%s\"",
+ __FUNCTION__, cmdline ? cmdline : "",
+ features ? features : "");
+ dom = malloc(sizeof(*dom));
+ if ( !dom )
+ goto err;
+
+ memset(dom, 0, sizeof(*dom));
+ dom->xch = xch;
+
+ dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX;
+ dom->max_module_size = XC_DOM_DECOMPRESS_MAX;
+ dom->max_devicetree_size = XC_DOM_DECOMPRESS_MAX;
+
+ if ( cmdline )
+ dom->cmdline = xc_dom_strdup(dom, cmdline);
+ if ( features )
+ elf_xen_parse_features(features, dom->f_requested, NULL);
+
+ dom->parms.virt_base = UNSET_ADDR;
+ dom->parms.virt_entry = UNSET_ADDR;
+ dom->parms.virt_hypercall = UNSET_ADDR;
+ dom->parms.virt_hv_start_low = UNSET_ADDR;
+ dom->parms.elf_paddr_offset = UNSET_ADDR;
+ dom->parms.p2m_base = UNSET_ADDR;
+
+ dom->flags = SIF_VIRT_P2M_4TOOLS;
+
+ dom->alloc_malloc += sizeof(*dom);
+ return dom;
+
+ err:
+ if ( dom )
+ xc_dom_release(dom);
+ return NULL;
+}
+
+int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz)
+{
+ DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz);
+ dom->max_kernel_size = sz;
+ return 0;
+}
+
+int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz)
+{
+ DOMPRINTF("%s: module_max_size=%zx", __FUNCTION__, sz);
+ dom->max_module_size = sz;
+ return 0;
+}
+
+int xc_dom_devicetree_max_size(struct xc_dom_image *dom, size_t sz)
+{
+ DOMPRINTF("%s: devicetree_max_size=%zx", __FUNCTION__, sz);
+ dom->max_devicetree_size = sz;
+ return 0;
+}
+
+int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename)
+{
+ DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
+ dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size,
+ dom->max_kernel_size);
+ if ( dom->kernel_blob == NULL )
+ return -1;
+ return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
+}
+
+int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, const char *cmdline)
+{
+ unsigned int mod = dom->num_modules++;
+
+ DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
+ dom->modules[mod].blob =
+ xc_dom_malloc_filemap(dom, filename, &dom->modules[mod].size,
+ dom->max_module_size);
+
+ if ( dom->modules[mod].blob == NULL )
+ return -1;
+
+ if ( cmdline )
+ {
+ dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline);
+
+ if ( dom->modules[mod].cmdline == NULL )
+ return -1;
+ }
+ else
+ {
+ dom->modules[mod].cmdline = NULL;
+ }
+
+ return 0;
+}
+
+int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename)
+{
+#if defined (__arm__) || defined(__aarch64__)
+ DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
+ dom->devicetree_blob =
+ xc_dom_malloc_filemap(dom, filename, &dom->devicetree_size,
+ dom->max_devicetree_size);
+
+ if ( dom->devicetree_blob == NULL )
+ return -1;
+ return 0;
+#else
+    errno = EINVAL;
+ return -1;
+#endif
+}
+
+int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, size_t memsize)
+{
+ DOMPRINTF_CALLED(dom->xch);
+ dom->kernel_blob = (void *)mem;
+ dom->kernel_size = memsize;
+ return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
+}
+
+int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem,
+ size_t memsize, const char *cmdline)
+{
+ unsigned int mod = dom->num_modules++;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ dom->modules[mod].blob = (void *)mem;
+ dom->modules[mod].size = memsize;
+
+ if ( cmdline )
+ {
+ dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline);
+
+ if ( dom->modules[mod].cmdline == NULL )
+ return -1;
+ }
+ else
+ {
+ dom->modules[mod].cmdline = NULL;
+ }
+
+ return 0;
+}
+
+int xc_dom_devicetree_mem(struct xc_dom_image *dom, const void *mem,
+ size_t memsize)
+{
+ DOMPRINTF_CALLED(dom->xch);
+ dom->devicetree_blob = (void *)mem;
+ dom->devicetree_size = memsize;
+ return 0;
+}
+
+int xc_dom_parse_image(struct xc_dom_image *dom)
+{
+ int i;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* parse kernel image */
+ dom->kernel_loader = xc_dom_find_loader(dom);
+ if ( dom->kernel_loader == NULL )
+ goto err;
+ if ( dom->kernel_loader->parser(dom) != 0 )
+ goto err;
+ if ( dom->guest_type == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: guest_type not set", __FUNCTION__);
+ goto err;
+ }
+
+ /* check features */
+ for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
+ {
+ dom->f_active[i] |= dom->f_requested[i]; /* cmd line */
+ dom->f_active[i] |= dom->parms.f_required[i]; /* kernel */
+ if ( (dom->f_active[i] & dom->parms.f_supported[i]) !=
+ dom->f_active[i] )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_PARAM,
+ "%s: unsupported feature requested", __FUNCTION__);
+ goto err;
+ }
+ }
+ return 0;
+
+ err:
+ return -1;
+}
+
+int xc_dom_rambase_init(struct xc_dom_image *dom, uint64_t rambase)
+{
+ dom->rambase_pfn = rambase >> XC_PAGE_SHIFT;
+ dom->pfn_alloc_end = dom->rambase_pfn;
+ DOMPRINTF("%s: RAM starts at %"PRI_xen_pfn,
+ __FUNCTION__, dom->rambase_pfn);
+ return 0;
+}
+
+int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb)
+{
+ unsigned int page_shift;
+ xen_pfn_t nr_pages;
+
+ if ( xc_dom_set_arch_hooks(dom) )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set",
+ __FUNCTION__);
+ return -1;
+ }
+
+ page_shift = XC_DOM_PAGE_SHIFT(dom);
+ nr_pages = mem_mb << (20 - page_shift);
+
+ DOMPRINTF("%s: mem %d MB, pages 0x%" PRIpfn " pages, %dk each",
+ __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10));
+ dom->total_pages = nr_pages;
+
+ DOMPRINTF("%s: 0x%" PRIpfn " pages",
+ __FUNCTION__, dom->total_pages);
+
+ return 0;
+}
+
+static int xc_dom_build_module(struct xc_dom_image *dom, unsigned int mod)
+{
+ size_t unziplen, modulelen;
+ void *modulemap;
+ char name[10];
+
+ if ( !dom->modules[mod].seg.vstart )
+ unziplen = xc_dom_check_gzip(dom->xch,
+ dom->modules[mod].blob, dom->modules[mod].size);
+ else
+ unziplen = 0;
+
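+    /*
+     * Reserve room for the larger of the compressed and decompressed
+     * forms.  If that exceeds max_module_size, fall back to the smaller
+     * of the two and give up on decompressing in place.
+     */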
+ modulelen = max(unziplen, dom->modules[mod].size);
+ if ( dom->max_module_size )
+ {
+ if ( unziplen && modulelen > dom->max_module_size )
+ {
+ modulelen = min(unziplen, dom->modules[mod].size);
+ if ( unziplen > modulelen )
+ unziplen = 0;
+ }
+ if ( modulelen > dom->max_module_size )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "module %u image too large", mod);
+ goto err;
+ }
+ }
+
+ snprintf(name, sizeof(name), "module%u", mod);
+ if ( xc_dom_alloc_segment(dom, &dom->modules[mod].seg, name,
+ dom->modules[mod].seg.vstart, modulelen) != 0 )
+ goto err;
+ modulemap = xc_dom_seg_to_ptr(dom, &dom->modules[mod].seg);
+ if ( modulemap == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->modules[%u].seg) => NULL",
+ __FUNCTION__, mod);
+ goto err;
+ }
+ if ( unziplen )
+ {
+ if ( xc_dom_do_gunzip(dom->xch, dom->modules[mod].blob, dom->modules[mod].size,
+ modulemap, unziplen) != -1 )
+ return 0;
+ if ( dom->modules[mod].size > modulelen )
+ goto err;
+ }
+
+ /* Fall back to handing over the raw blob. */
+ memcpy(modulemap, dom->modules[mod].blob, dom->modules[mod].size);
+ /* If an unzip attempt was made, the buffer may no longer be all zero. */
+ if ( unziplen > dom->modules[mod].size )
+ memset(modulemap + dom->modules[mod].size, 0,
+ unziplen - dom->modules[mod].size);
+
+ return 0;
+
+ err:
+ return -1;
+}
+
+static int populate_acpi_pages(struct xc_dom_image *dom,
+ xen_pfn_t *extents,
+ unsigned int num_pages)
+{
+ int rc;
+ xc_interface *xch = dom->xch;
+ uint32_t domid = dom->guest_domid;
+ unsigned long idx;
+ unsigned long first_high_idx = 4UL << (30 - PAGE_SHIFT); /* 4GB */
+
+ for ( ; num_pages; num_pages--, extents++ )
+ {
+
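+        /*
+         * Try to populate a fresh page at the requested address first.
+         * If that fails (e.g. because the GFN is already backed), steal a
+         * page from the top of high memory, or failing that low memory,
+         * and remap it to the wanted address instead.
+         */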
+ if ( xc_domain_populate_physmap(xch, domid, 1, 0, 0, extents) == 1 )
+ continue;
+
+ if ( dom->highmem_end )
+ {
+ idx = --dom->highmem_end;
+ if ( idx == first_high_idx )
+ dom->highmem_end = 0;
+ }
+ else
+ {
+ idx = --dom->lowmem_end;
+ }
+
+ rc = xc_domain_add_to_physmap(xch, domid,
+ XENMAPSPACE_gmfn,
+ idx, *extents);
+ if ( rc )
+ return rc;
+ }
+
+ return 0;
+}
+
+static int xc_dom_load_acpi(struct xc_dom_image *dom)
+{
+ int j, i = 0;
+ unsigned num_pages;
+ xen_pfn_t *extents, base;
+ void *ptr;
+
+ while ( (i < MAX_ACPI_MODULES) && dom->acpi_modules[i].length )
+ {
+ DOMPRINTF("%s: %d bytes at address %" PRIx64, __FUNCTION__,
+ dom->acpi_modules[i].length,
+ dom->acpi_modules[i].guest_addr_out);
+
+ num_pages = (dom->acpi_modules[i].length +
+ (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK) +
+ (XC_PAGE_SIZE - 1)) >> XC_PAGE_SHIFT;
+ extents = malloc(num_pages * sizeof(*extents));
+ if ( !extents )
+ {
+ DOMPRINTF("%s: Out of memory", __FUNCTION__);
+ goto err;
+ }
+
+ base = dom->acpi_modules[i].guest_addr_out >> XC_PAGE_SHIFT;
+ for ( j = 0; j < num_pages; j++ )
+ extents[j] = base + j;
+ if ( populate_acpi_pages(dom, extents, num_pages) )
+ {
+ DOMPRINTF("%s: Can populate ACPI pages", __FUNCTION__);
+ goto err;
+ }
+
+ ptr = xc_map_foreign_range(dom->xch, dom->guest_domid,
+ XC_PAGE_SIZE * num_pages,
+ PROT_READ | PROT_WRITE, base);
+ if ( !ptr )
+ {
+ DOMPRINTF("%s: Can't map %d pages at 0x%"PRI_xen_pfn,
+ __FUNCTION__, num_pages, base);
+ goto err;
+ }
+
+ memcpy((uint8_t *)ptr +
+ (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK),
+ dom->acpi_modules[i].data, dom->acpi_modules[i].length);
+ munmap(ptr, XC_PAGE_SIZE * num_pages);
+
+ free(extents);
+ i++;
+ }
+
+ return 0;
+
+err:
+ free(extents);
+ return -1;
+}
+
+int xc_dom_build_image(struct xc_dom_image *dom)
+{
+ unsigned int page_size;
+ bool unmapped_initrd;
+ unsigned int mod;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* check for arch hooks */
+ if ( dom->arch_hooks == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set",
+ __FUNCTION__);
+ goto err;
+ }
+ page_size = XC_DOM_PAGE_SIZE(dom);
+ if ( dom->parms.virt_base != UNSET_ADDR )
+ dom->virt_alloc_end = dom->parms.virt_base;
+
+ /* load kernel */
+ if ( xc_dom_alloc_segment(dom, &dom->kernel_seg, "kernel",
+ dom->kernel_seg.vstart,
+ dom->kernel_seg.vend -
+ dom->kernel_seg.vstart) != 0 )
+ goto err;
+ if ( dom->kernel_loader->loader(dom) != 0 )
+ goto err;
+
+ /* Don't load ramdisk / other modules now if no initial mapping required. */
+ for ( mod = 0; mod < dom->num_modules; mod++ )
+ {
+ unmapped_initrd = (dom->parms.unmapped_initrd &&
+ !dom->modules[mod].seg.vstart);
+
+ if ( dom->modules[mod].blob && !unmapped_initrd )
+ {
+ if ( xc_dom_build_module(dom, mod) != 0 )
+ goto err;
+
+ if ( mod == 0 )
+ {
+ dom->initrd_start = dom->modules[mod].seg.vstart;
+ dom->initrd_len =
+ dom->modules[mod].seg.vend - dom->modules[mod].seg.vstart;
+ }
+ }
+ }
+
+ /* load devicetree */
+ if ( dom->devicetree_blob )
+ {
+ void *devicetreemap;
+
+ if ( xc_dom_alloc_segment(dom, &dom->devicetree_seg, "devicetree",
+ dom->devicetree_seg.vstart,
+ dom->devicetree_size) != 0 )
+ goto err;
+ devicetreemap = xc_dom_seg_to_ptr(dom, &dom->devicetree_seg);
+ if ( devicetreemap == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->devicetree_seg) => NULL",
+ __FUNCTION__);
+ goto err;
+ }
+ memcpy(devicetreemap, dom->devicetree_blob, dom->devicetree_size);
+ }
+
+ /* load ACPI tables */
+ if ( xc_dom_load_acpi(dom) != 0 )
+ goto err;
+
+ /* allocate other pages */
+ if ( !dom->arch_hooks->p2m_base_supported ||
+ dom->parms.p2m_base >= dom->parms.virt_base ||
+ (dom->parms.p2m_base & (XC_DOM_PAGE_SIZE(dom) - 1)) )
+ dom->parms.p2m_base = UNSET_ADDR;
+ if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base == UNSET_ADDR &&
+ dom->arch_hooks->alloc_p2m_list(dom) != 0 )
+ goto err;
+ if ( dom->arch_hooks->alloc_magic_pages(dom) != 0 )
+ goto err;
+ if ( dom->arch_hooks->alloc_pgtables &&
+ dom->arch_hooks->alloc_pgtables(dom) != 0 )
+ goto err;
+ if ( dom->alloc_bootstack )
+ {
+ dom->bootstack_pfn = xc_dom_alloc_page(dom, "boot stack");
+ if ( dom->bootstack_pfn == INVALID_PFN )
+ goto err;
+ }
+
+ DOMPRINTF("%-20s: virt_alloc_end : 0x%" PRIx64 "",
+ __FUNCTION__, dom->virt_alloc_end);
+ DOMPRINTF("%-20s: virt_pgtab_end : 0x%" PRIx64 "",
+ __FUNCTION__, dom->virt_pgtab_end);
+
+ /* Make sure all memory mapped by initial page tables is available */
+ if ( dom->virt_pgtab_end && xc_dom_alloc_pad(dom, dom->virt_pgtab_end) )
+ return -1;
+
+ for ( mod = 0; mod < dom->num_modules; mod++ )
+ {
+ unmapped_initrd = (dom->parms.unmapped_initrd &&
+ !dom->modules[mod].seg.vstart);
+
+ /* Load ramdisk / other modules if no initial mapping required. */
+ if ( dom->modules[mod].blob && unmapped_initrd )
+ {
+ if ( xc_dom_build_module(dom, mod) != 0 )
+ goto err;
+
+ if ( mod == 0 )
+ {
+ dom->flags |= SIF_MOD_START_PFN;
+ dom->initrd_start = dom->modules[mod].seg.pfn;
+ dom->initrd_len = page_size * dom->modules[mod].seg.pages;
+ }
+ }
+ }
+
+ /* Allocate p2m list if outside of initial kernel mapping. */
+ if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base != UNSET_ADDR )
+ {
+ if ( dom->arch_hooks->alloc_p2m_list(dom) != 0 )
+ goto err;
+ dom->p2m_seg.vstart = dom->parms.p2m_base;
+ }
+
+ return 0;
+
+ err:
+ return -1;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#ifndef __MINIOS__
+# include "xenctrl_dom.h"
+#else
+# include "xg_dom_decompress_unsafe.h"
+#endif
+
+int xc_try_lz4_decode(struct xc_dom_image *dom, void **blob, size_t *size);
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress.h"
+
+#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define likely(a) a
+#define unlikely(a) a
+
+static inline uint_fast16_t le16_to_cpup(const unsigned char *buf)
+{
+ return buf[0] | (buf[1] << 8);
+}
+
+static inline uint_fast32_t le32_to_cpup(const unsigned char *buf)
+{
+ return le16_to_cpup(buf) | ((uint32_t)le16_to_cpup(buf + 2) << 16);
+}
+
+#include "../../xen/include/xen/lz4.h"
+#include "../../xen/common/decompress.h"
+
+#ifndef __MINIOS__
+
+#include "../../xen/common/lz4/decompress.c"
+
+#define ARCHIVE_MAGICNUMBER 0x184C2102
+
+int xc_try_lz4_decode(
+ struct xc_dom_image *dom, void **blob, size_t *psize)
+{
+ int ret = -1;
+ unsigned char *inp = *blob, *output, *outp;
+ ssize_t size = *psize - 4;
+ size_t out_len, dest_len, chunksize;
+ const char *msg;
+
+ if (size < 4) {
+ msg = "input too small";
+ goto exit_0;
+ }
+
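+    /* The image carries its uncompressed size in its final 4 bytes. */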
+ out_len = get_unaligned_le32(inp + size);
+ if (xc_dom_kernel_check_size(dom, out_len)) {
+ msg = "Decompressed image too large";
+ goto exit_0;
+ }
+
+ output = malloc(out_len);
+ if (!output) {
+ msg = "Could not allocate output buffer";
+ goto exit_0;
+ }
+ outp = output;
+
+ chunksize = get_unaligned_le32(inp);
+ if (chunksize == ARCHIVE_MAGICNUMBER) {
+ inp += 4;
+ size -= 4;
+ } else {
+ msg = "invalid header";
+ goto exit_2;
+ }
+
+ for (;;) {
+ if (size < 4) {
+ msg = "missing data";
+ goto exit_2;
+ }
+ chunksize = get_unaligned_le32(inp);
+ if (chunksize == ARCHIVE_MAGICNUMBER) {
+ inp += 4;
+ size -= 4;
+ continue;
+ }
+ inp += 4;
+ size -= 4;
+ if (chunksize > size) {
+ msg = "insufficient input data";
+ goto exit_2;
+ }
+
+ dest_len = out_len - (outp - output);
+ ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp,
+ &dest_len);
+ if (ret < 0) {
+ msg = "decoding failed";
+ goto exit_2;
+ }
+
+ ret = -1;
+ outp += dest_len;
+ size -= chunksize;
+
+ if (size == 0)
+ {
+ if ( xc_dom_register_external(dom, output, out_len) )
+ {
+ msg = "Error registering stream output";
+ goto exit_2;
+ }
+ *blob = output;
+ *psize = out_len;
+ return 0;
+ }
+
+ if (size < 0) {
+ msg = "data corrupted";
+ goto exit_2;
+ }
+
+ inp += chunksize;
+ }
+
+exit_2:
+ free(output);
+exit_0:
+ DOMPRINTF("LZ4 decompression error: %s\n", msg);
+ return ret;
+}
+
+#else /* __MINIOS__ */
+
+#include "../../xen/common/unlz4.c"
+
+int xc_try_lz4_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ return xc_dom_decompress_unsafe(unlz4, dom, blob, size);
+}
+
+#endif
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress_unsafe.h"
+
+static struct xc_dom_image *unsafe_dom;
+static unsigned char *output_blob;
+static unsigned int output_size;
+
+static void unsafe_error(const char *msg)
+{
+ xc_dom_panic(unsafe_dom->xch, XC_INVALID_KERNEL, "%s", msg);
+}
+
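+/*
+ * flush() callback handed to the borrowed kernel decompressors: grow the
+ * output blob and append the freshly produced data.
+ */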
+static int unsafe_flush(void *src, unsigned int size)
+{
+ void *n = realloc(output_blob, output_size + size);
+ if (!n)
+ return -1;
+ output_blob = n;
+
+ memcpy(&output_blob[output_size], src, size);
+ output_size += size;
+ return size;
+}
+
+int xc_dom_decompress_unsafe(
+ decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ int ret;
+
+ unsafe_dom = dom;
+ output_blob = NULL;
+ output_size = 0;
+
+ ret = fn(dom->kernel_blob, dom->kernel_size, NULL, unsafe_flush, NULL, NULL, unsafe_error);
+
+ if (ret)
+ free(output_blob);
+ else {
+ *blob = output_blob;
+ *size = output_size;
+ }
+
+ return ret;
+}
--- /dev/null
+#include "xenctrl_dom.h"
+
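+/*
+ * Signature of the decompressors borrowed from xen/common, which in turn
+ * follow Linux's lib/decompress_*() calling convention.
+ */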
+typedef int decompress_fn(unsigned char *inbuf, unsigned int len,
+ int (*fill)(void*, unsigned int),
+ int (*flush)(void*, unsigned int),
+ unsigned char *outbuf, unsigned int *posp,
+ void (*error)(const char *x));
+
+int xc_dom_decompress_unsafe(
+ decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size)
+ __attribute__((visibility("internal")));
+
+int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size)
+ __attribute__((visibility("internal")));
+int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size)
+ __attribute__((visibility("internal")));
+int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size)
+ __attribute__((visibility("internal")));
+int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size)
+ __attribute__((visibility("internal")));
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress_unsafe.h"
+
+#include "../../xen/common/bunzip2.c"
+
+int xc_try_bzip2_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ return xc_dom_decompress_unsafe(bunzip2, dom, blob, size);
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress_unsafe.h"
+
+#include "../../xen/common/unlzma.c"
+
+int xc_try_lzma_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ return xc_dom_decompress_unsafe(unlzma, dom, blob, size);
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <endian.h>
+#include <stdint.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress_unsafe.h"
+
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint64_t u64;
+
+#define likely(a) a
+#define noinline
+#define unlikely(a) a
+
+static inline u16 be16_to_cpup(const u16 *p)
+{
+ u16 v = *p;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ return (((v & 0x00ffU) << 8) |
+ ((v & 0xff00U) >> 8));
+#else
+ return v;
+#endif
+}
+
+static inline u32 be32_to_cpup(const u32 *p)
+{
+ u32 v = *p;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ return (((v & 0x000000ffUL) << 24) |
+ ((v & 0x0000ff00UL) << 8) |
+ ((v & 0x00ff0000UL) >> 8) |
+ ((v & 0xff000000UL) >> 24));
+#else
+ return v;
+#endif
+}
+
+#include "../../xen/common/lzo.c"
+#include "../../xen/common/unlzo.c"
+
+int xc_try_lzo1x_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ return xc_dom_decompress_unsafe(unlzo, dom, blob, size);
+}
--- /dev/null
+#include <stdio.h>
+#include <endian.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xg_dom_decompress_unsafe.h"
+
+// TODO
+#define XZ_DEC_X86
+
+typedef char bool_t;
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint32_t __le32;
+
+static inline u32 cpu_to_le32(const u32 v)
+{
+#if BYTE_ORDER == BIG_ENDIAN
+ return (((v & 0x000000ffUL) << 24) |
+ ((v & 0x0000ff00UL) << 8) |
+ ((v & 0x00ff0000UL) >> 8) |
+ ((v & 0xff000000UL) >> 24));
+#else
+ return v;
+#endif
+}
+
+static inline u32 le32_to_cpup(const u32 *p)
+{
+ return cpu_to_le32(*p);
+}
+
+#define __force
+#define always_inline
+
+#include "../../xen/common/unxz.c"
+
+int xc_try_xz_decode(
+ struct xc_dom_image *dom, void **blob, size_t *size)
+{
+ return xc_dom_decompress_unsafe(unxz, dom, blob, size);
+}
--- /dev/null
+/*
+ * Xen domain builder -- ELF bits.
+ *
+ * Parse and load ELF kernel images.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <inttypes.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "xc_bitops.h"
+
+#define XEN_VER "xen-3.0"
+
+/* ------------------------------------------------------------------------ */
+
+static void log_callback(struct elf_binary *elf, void *caller_data,
+ bool iserr, const char *fmt, va_list al) {
+ xc_interface *xch = caller_data;
+
+ xc_reportv(xch,
+ xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler,
+ iserr ? XTL_ERROR : XTL_DETAIL,
+ iserr ? XC_INVALID_KERNEL : XC_ERROR_NONE,
+ fmt, al);
+}
+
+void xc_elf_set_logfile(xc_interface *xch, struct elf_binary *elf,
+ int verbose) {
+ elf_set_log(elf, log_callback, xch, verbose /* convert to bool */);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static char *xc_dom_guest_type(struct xc_dom_image *dom,
+ struct elf_binary *elf)
+{
+ uint64_t machine = elf_uval(elf, elf->ehdr, e_machine);
+
+ if ( dom->container_type == XC_DOM_HVM_CONTAINER &&
+ dom->parms.phys_entry != UNSET_ADDR32 )
+ return "hvm-3.0-x86_32";
+ if ( dom->container_type == XC_DOM_HVM_CONTAINER )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: image not capable of booting inside a HVM container",
+ __FUNCTION__);
+ return NULL;
+ }
+
+ switch ( machine )
+ {
+ case EM_386:
+ switch ( dom->parms.pae )
+ {
+ case XEN_PAE_BIMODAL:
+ if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") )
+ return "xen-3.0-x86_32p";
+ return "xen-3.0-x86_32";
+ case XEN_PAE_EXTCR3:
+ case XEN_PAE_YES:
+ return "xen-3.0-x86_32p";
+ case XEN_PAE_NO:
+ default:
+ return "xen-3.0-x86_32";
+ }
+ case EM_X86_64:
+ return "xen-3.0-x86_64";
+ default:
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: unknown image type %"PRIu64,
+ __FUNCTION__, machine);
+ return NULL;
+ }
+}
+
+/* ------------------------------------------------------------------------ */
+/* parse elf binary */
+
+static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose)
+{
+ if ( dom->kernel_blob == NULL )
+ {
+ if ( verbose )
+ xc_dom_panic(dom->xch,
+ XC_INTERNAL_ERROR, "%s: no kernel image loaded",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) )
+ {
+ if ( verbose )
+ xc_dom_panic(dom->xch,
+ XC_INVALID_KERNEL, "%s: kernel is not an ELF image",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static elf_negerrnoval xc_dom_probe_elf_kernel(struct xc_dom_image *dom)
+{
+ struct elf_binary elf;
+ int rc;
+
+ rc = check_elf_kernel(dom, 0);
+ if ( rc != 0 )
+ return rc;
+
+ rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size);
+ if ( rc != 0 )
+ return rc;
+
+ /*
+ * We need to check that it contains Xen ELFNOTES,
+ * or else we might be trying to load a plain ELF.
+ */
+ elf_parse_binary(&elf);
+ rc = elf_xen_parse(&elf, &dom->parms);
+ if ( rc != 0 )
+ return rc;
+
+ return 0;
+}
+
+static elf_negerrnoval xc_dom_parse_elf_kernel(struct xc_dom_image *dom)
+{
+ struct elf_binary *elf;
+ elf_negerrnoval rc;
+
+ rc = check_elf_kernel(dom, 1);
+ if ( rc != 0 )
+ return rc;
+
+ elf = xc_dom_malloc(dom, sizeof(*elf));
+ if ( elf == NULL )
+ return -ENOMEM;
+ dom->private_loader = elf;
+ rc = elf_init(elf, dom->kernel_blob, dom->kernel_size) != 0 ? -EINVAL : 0;
+ xc_elf_set_logfile(dom->xch, elf, 1);
+ if ( rc != 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image",
+ __FUNCTION__);
+ return rc;
+ }
+
+ /* parse binary and get xen meta info */
+ elf_parse_binary(elf);
+ if ( elf_xen_parse(elf, &dom->parms) != 0 )
+ {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if ( elf_xen_feature_get(XENFEAT_dom0, dom->parms.f_required) )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Kernel does not"
+ " support unprivileged (DomU) operation", __FUNCTION__);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* find kernel segment */
+ dom->kernel_seg.vstart = dom->parms.virt_kstart;
+ dom->kernel_seg.vend = dom->parms.virt_kend;
+
+ dom->guest_type = xc_dom_guest_type(dom, elf);
+ if ( dom->guest_type == NULL )
+ return -EINVAL;
+ DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
+ __FUNCTION__, dom->guest_type,
+ dom->kernel_seg.vstart, dom->kernel_seg.vend);
+ rc = 0;
+out:
+ if ( elf_check_broken(elf) )
+ DOMPRINTF("%s: ELF broken: %s", __FUNCTION__,
+ elf_check_broken(elf));
+
+ return rc;
+}
+
+static elf_errorstatus xc_dom_load_elf_kernel(struct xc_dom_image *dom)
+{
+ struct elf_binary *elf = dom->private_loader;
+ elf_errorstatus rc;
+ xen_pfn_t pages;
+
+ elf->dest_base = xc_dom_seg_to_ptr_pages(dom, &dom->kernel_seg, &pages);
+ if ( elf->dest_base == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom,dom->kernel_seg)"
+ " => NULL", __FUNCTION__);
+ return -1;
+ }
+ elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom);
+
+ rc = elf_load_binary(elf);
+ if ( rc < 0 )
+ {
+ DOMPRINTF("%s: failed to load elf binary", __FUNCTION__);
+ return rc;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+
+struct xc_dom_loader elf_loader = {
+ .name = "ELF-generic",
+ .probe = xc_dom_probe_elf_kernel,
+ .parser = xc_dom_parse_elf_kernel,
+ .loader = xc_dom_load_elf_kernel,
+};
+
+static void __init register_loader(void)
+{
+ xc_dom_register_loader(&elf_loader);
+}
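+/*
+ * Note: __init is expected to expand to a constructor attribute here,
+ * so the loader adds itself to the global loader list automatically
+ * when the library is loaded.
+ */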
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- HVM specific bits.
+ *
+ * Parse and load ELF firmware images for HVM domains.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "xc_bitops.h"
+
+/* ------------------------------------------------------------------------ */
+/* parse elf binary */
+
+static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose)
+{
+ if ( dom->kernel_blob == NULL )
+ {
+ if ( verbose )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: no kernel image loaded", __func__);
+ return -EINVAL;
+ }
+
+ if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) )
+ {
+ if ( verbose )
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
+ "%s: kernel is not an ELF image", __func__);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static elf_negerrnoval xc_dom_probe_hvm_kernel(struct xc_dom_image *dom)
+{
+ struct elf_binary elf;
+ int rc;
+
+ /* This loader is designed for HVM guest firmware. */
+ if ( dom->container_type != XC_DOM_HVM_CONTAINER )
+ return -EINVAL;
+
+ rc = check_elf_kernel(dom, 0);
+ if ( rc != 0 )
+ return rc;
+
+ rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size);
+ if ( rc != 0 )
+ return rc;
+
+ /*
+ * We need to check that there are no Xen ELFNOTES, or
+ * else we might be trying to load a PV kernel.
+ */
+ elf_parse_binary(&elf);
+ rc = elf_xen_parse(&elf, &dom->parms);
+ if ( rc == 0 )
+ return -EINVAL;
+
+ return 0;
+}
+
+static elf_errorstatus xc_dom_parse_hvm_kernel(struct xc_dom_image *dom)
+    /*
+     * This function sometimes returns -1 for error and sometimes an
+     * errno value; the error convention is regrettably inconsistent.
+     */
+{
+ struct elf_binary *elf;
+ elf_errorstatus rc;
+
+ rc = check_elf_kernel(dom, 1);
+ if ( rc != 0 )
+ return rc;
+
+ elf = xc_dom_malloc(dom, sizeof(*elf));
+ if ( elf == NULL )
+ return -1;
+ dom->private_loader = elf;
+ rc = elf_init(elf, dom->kernel_blob, dom->kernel_size);
+ xc_elf_set_logfile(dom->xch, elf, 1);
+ if ( rc != 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image",
+ __func__);
+ return rc;
+ }
+
+ if ( !elf_32bit(elf) )
+ {
+ xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: ELF image is not 32bit",
+ __func__);
+ return -EINVAL;
+ }
+
+ /* parse binary and get xen meta info */
+ elf_parse_binary(elf);
+
+ /* find kernel segment */
+ dom->kernel_seg.vstart = elf->pstart;
+ dom->kernel_seg.vend = elf->pend;
+
+ dom->guest_type = "hvm-3.0-x86_32";
+
+ if ( elf_check_broken(elf) )
+ DOMPRINTF("%s: ELF broken: %s", __func__, elf_check_broken(elf));
+
+ return rc;
+}
+
+static int module_init_one(struct xc_dom_image *dom,
+ struct xc_hvm_firmware_module *module,
+ char *name)
+{
+ struct xc_dom_seg seg;
+ void *dest;
+
+ if ( module->length && !module->guest_addr_out )
+ {
+ if ( xc_dom_alloc_segment(dom, &seg, name, 0, module->length) )
+ goto err;
+ dest = xc_dom_seg_to_ptr(dom, &seg);
+ if ( dest == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &seg) => NULL",
+ __FUNCTION__);
+ goto err;
+ }
+ memcpy(dest, module->data, module->length);
+ module->guest_addr_out = seg.vstart;
+
+ assert(dom->mmio_start > 0 && dom->mmio_start < UINT32_MAX);
+ if ( module->guest_addr_out > dom->mmio_start ||
+ module->guest_addr_out + module->length > dom->mmio_start )
+ {
+ DOMPRINTF("%s: Module %s would be loaded abrove 4GB",
+ __FUNCTION__, name);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ return -1;
+}
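+/*
+ * module_init_one() copies one firmware module into a freshly
+ * allocated segment and reports its guest-physical address via
+ * guest_addr_out; modules that would reach into the MMIO hole (and
+ * hence past usable low memory) are rejected.
+ */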
+
+static int modules_init(struct xc_dom_image *dom)
+{
+ int rc;
+
+ rc = module_init_one(dom, &dom->system_firmware_module,
+ "System Firmware module");
+ if ( rc ) goto err;
+ /* Only one module can be added */
+ rc = module_init_one(dom, &dom->acpi_modules[0], "ACPI module");
+ if ( rc ) goto err;
+ rc = module_init_one(dom, &dom->smbios_module, "SMBIOS module");
+ if ( rc ) goto err;
+
+ return 0;
+err:
+ return -1;
+}
+
+static elf_errorstatus xc_dom_load_hvm_kernel(struct xc_dom_image *dom)
+{
+ struct elf_binary *elf = dom->private_loader;
+ privcmd_mmap_entry_t *entries = NULL;
+ size_t pages = (elf->pend - elf->pstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ elf_errorstatus rc;
+ int i;
+
+ /* Map address space for initial elf image. */
+ entries = calloc(pages, sizeof(privcmd_mmap_entry_t));
+ if ( entries == NULL )
+ return -ENOMEM;
+
+ for ( i = 0; i < pages; i++ )
+ entries[i].mfn = (elf->pstart >> PAGE_SHIFT) + i;
+
+ elf->dest_base = xc_map_foreign_ranges(
+ dom->xch, dom->guest_domid, pages << PAGE_SHIFT,
+ PROT_READ | PROT_WRITE, 1 << PAGE_SHIFT,
+ entries, pages);
+ if ( elf->dest_base == NULL )
+ {
+ DOMPRINTF("%s: unable to map guest memory space", __func__);
+ rc = -EFAULT;
+ goto error;
+ }
+
+ elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom);
+
+ rc = elf_load_binary(elf);
+ if ( rc < 0 )
+ {
+ DOMPRINTF("%s: failed to load elf binary", __func__);
+ goto error;
+ }
+
+ munmap(elf->dest_base, elf->dest_size);
+
+ rc = modules_init(dom);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("%s: unable to load modules.", __func__);
+ goto error;
+ }
+
+ dom->parms.phys_entry = elf_uval(elf, elf->ehdr, e_entry);
+
+ free(entries);
+ return 0;
+
+ error:
+ assert(rc != 0);
+ free(entries);
+ return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+struct xc_dom_loader hvm_loader = {
+ .name = "HVM-generic",
+ .probe = xc_dom_probe_hvm_kernel,
+ .parser = xc_dom_parse_hvm_kernel,
+ .loader = xc_dom_load_hvm_kernel,
+};
+
+static void __init register_loader(void)
+{
+ xc_dom_register_loader(&hvm_loader);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * Xen domain builder -- i386 and x86_64 bits.
+ *
+ * Most architecture-specific code for x86 goes here.
+ * - prepare page tables.
+ * - fill architecture-specific structs.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include <xen/xen.h>
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+#include <xen/hvm/hvm_info_table.h>
+#include <xen/arch-x86/hvm/start_info.h>
+#include <xen/io/protocols.h>
+
+#include <xen-tools/libs.h>
+
+#include "xg_private.h"
+#include "xenctrl_dom.h"
+#include "xenctrl.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define SUPERPAGE_BATCH_SIZE 512
+
+#define SUPERPAGE_2MB_SHIFT 9
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
+#define SUPERPAGE_1GB_SHIFT 18
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
+
+#define X86_CR0_PE 0x01
+#define X86_CR0_ET 0x10
+
+#define X86_DR6_DEFAULT 0xffff0ff0u
+#define X86_DR7_DEFAULT 0x00000400u
+
+#define MTRR_TYPE_WRBACK 6
+#define MTRR_DEF_TYPE_ENABLE (1u << 11)
+
+#define SPECIALPAGE_PAGING 0
+#define SPECIALPAGE_ACCESS 1
+#define SPECIALPAGE_SHARING 2
+#define SPECIALPAGE_BUFIOREQ 3
+#define SPECIALPAGE_XENSTORE 4
+#define SPECIALPAGE_IOREQ 5
+#define SPECIALPAGE_IDENT_PT 6
+#define SPECIALPAGE_CONSOLE 7
+#define special_pfn(x) \
+ (X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x))
+
+#define NR_IOREQ_SERVER_PAGES 8
+#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x))
+
+#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1)
+#define round_down(addr, mask) ((addr) & ~(mask))
+#define round_up(addr, mask) ((addr) | (mask))
+#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1))
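+/*
+ * Note: round_up() sets the low bits rather than advancing to the next
+ * boundary, so it yields the last address inside the aligned region
+ * (e.g. round_up(0x1000, bits_to_mask(22)) == 0x3fffff); callers add 1
+ * to obtain the first address beyond the region.
+ */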
+
+#define HVMLOADER_MODULE_MAX_COUNT 2
+#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE
+
+struct xc_dom_params {
+ unsigned levels;
+ xen_vaddr_t vaddr_mask;
+ x86_pgentry_t lvl_prot[4];
+};
+
+struct xc_dom_x86_mapping_lvl {
+ xen_vaddr_t from;
+ xen_vaddr_t to;
+ xen_pfn_t pfn;
+ unsigned int pgtables;
+};
+
+struct xc_dom_x86_mapping {
+ struct xc_dom_x86_mapping_lvl area;
+ struct xc_dom_x86_mapping_lvl lvls[4];
+};
+
+struct xc_dom_image_x86 {
+ unsigned n_mappings;
+#define MAPPING_MAX 2
+ struct xc_dom_x86_mapping maps[MAPPING_MAX];
+ const struct xc_dom_params *params;
+
+ /* PV: Pointer to the in-guest P2M. */
+ void *p2m_guest;
+};
+
+/* get guest IO ABI protocol */
+const char *xc_domain_get_native_protocol(xc_interface *xch,
+ uint32_t domid)
+{
+ int ret;
+ uint32_t guest_width;
+ const char *protocol;
+
+ ret = xc_domain_get_guest_width(xch, domid, &guest_width);
+
+ if ( ret )
+ return NULL;
+
+    switch ( guest_width )
+    {
+ case 4: /* 32 bit guest */
+ protocol = XEN_IO_PROTO_ABI_X86_32;
+ break;
+ case 8: /* 64 bit guest */
+ protocol = XEN_IO_PROTO_ABI_X86_64;
+ break;
+ default:
+ protocol = NULL;
+ }
+
+ return protocol;
+}
+
+static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from,
+ xen_vaddr_t to, xen_pfn_t pfn)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map, *map_cmp;
+ xen_pfn_t pfn_end;
+ xen_vaddr_t mask;
+ unsigned bits;
+ int l, m;
+
+ if ( domx86->n_mappings == MAPPING_MAX )
+ {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "%s: too many mappings\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+ map = domx86->maps + domx86->n_mappings;
+
+ pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86);
+ if ( pfn_end >= dom->p2m_size )
+ {
+ xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+ "%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")",
+ __FUNCTION__, pfn_end, dom->p2m_size);
+ return -ENOMEM;
+ }
+ for ( m = 0; m < domx86->n_mappings; m++ )
+ {
+ map_cmp = domx86->maps + m;
+ if ( from < map_cmp->area.to && to > map_cmp->area.from )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: overlapping mappings\n", __FUNCTION__);
+ return -EINVAL;
+ }
+ }
+
+ memset(map, 0, sizeof(*map));
+ map->area.from = from & domx86->params->vaddr_mask;
+ map->area.to = to & domx86->params->vaddr_mask;
+
+ for ( l = domx86->params->levels - 1; l >= 0; l-- )
+ {
+ map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables;
+ if ( l == domx86->params->levels - 1 )
+ {
+ /* Top level page table in first mapping only. */
+ if ( domx86->n_mappings == 0 )
+ {
+ map->lvls[l].from = 0;
+ map->lvls[l].to = domx86->params->vaddr_mask;
+ map->lvls[l].pgtables = 1;
+ map->area.pgtables++;
+ }
+ continue;
+ }
+
+ bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86;
+ mask = bits_to_mask(bits);
+ map->lvls[l].from = map->area.from & ~mask;
+ map->lvls[l].to = map->area.to | mask;
+
+ if ( domx86->params->levels == PGTBL_LEVELS_I386 &&
+ domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 )
+ {
+ DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__);
+ map->lvls[l].to = domx86->params->vaddr_mask;
+ }
+
+ for ( m = 0; m < domx86->n_mappings; m++ )
+ {
+ map_cmp = domx86->maps + m;
+ if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to )
+ continue;
+ if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
+ map->lvls[l].to <= map_cmp->lvls[l].to )
+ {
+ map->lvls[l].from = 0;
+ map->lvls[l].to = 0;
+ break;
+ }
+ assert(map->lvls[l].from >= map_cmp->lvls[l].from ||
+ map->lvls[l].to <= map_cmp->lvls[l].to);
+ if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
+ map->lvls[l].from <= map_cmp->lvls[l].to )
+ map->lvls[l].from = map_cmp->lvls[l].to + 1;
+ if ( map->lvls[l].to >= map_cmp->lvls[l].from &&
+ map->lvls[l].to <= map_cmp->lvls[l].to )
+ map->lvls[l].to = map_cmp->lvls[l].from - 1;
+ }
+ if ( map->lvls[l].from < map->lvls[l].to )
+ map->lvls[l].pgtables =
+ ((map->lvls[l].to - map->lvls[l].from) >> bits) + 1;
+ DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64
+ ", %d table(s)", __FUNCTION__, mask, bits,
+ map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables);
+ map->area.pgtables += map->lvls[l].pgtables;
+ }
+
+ return 0;
+}
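+/*
+ * count_pgtables() computes, for each paging level, the virtual range
+ * the new mapping needs tables for and how many table pages that
+ * takes, trimming away sub-ranges already covered by earlier mappings
+ * so shared page tables are not counted twice.
+ */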
+
+static int alloc_pgtables_pv(struct xc_dom_image *dom)
+{
+ int pages, extra_pages;
+ xen_vaddr_t try_virt_end;
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
+
+ extra_pages = dom->alloc_bootstack ? 1 : 0;
+ extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */
+ pages = extra_pages;
+ for ( ; ; )
+ {
+ try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86,
+ bits_to_mask(22)); /* 4MB alignment */
+
+ if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) )
+ return -1;
+
+ pages = map->area.pgtables + extra_pages;
+ if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 )
+ break;
+ }
+ map->area.pfn = 0;
+ domx86->n_mappings++;
+ dom->virt_pgtab_end = try_virt_end + 1;
+
+ return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0,
+ map->area.pgtables * PAGE_SIZE_X86);
+}
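+/*
+ * The loop above iterates to a fixed point: the page tables themselves
+ * consume virtual address space, which can in turn require more page
+ * tables, so the count is recomputed until the allocation fits below
+ * the rounded-up end address.
+ */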
+
+/* ------------------------------------------------------------------------ */
+/* i386 pagetables */
+
+static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom)
+{
+ static const struct xc_dom_params x86_32_params = {
+ .levels = PGTBL_LEVELS_I386,
+ .vaddr_mask = bits_to_mask(VIRT_BITS_I386),
+ .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
+ /*
+ * 64bit Xen runs 32bit PV guests with the PAE entries in an L3
+ * pagetable. They don't behave exactly like native PAE paging.
+ */
+ .lvl_prot[1 ... 2] =
+ _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
+ };
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+
+ domx86->params = &x86_32_params;
+
+ return alloc_pgtables_pv(dom);
+}
+
+#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
+#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86))
+
+/*
+ * Move the l3 page table page below 4G for guests which do not
+ * support the extended-cr3 format. The l3 is currently empty so we
+ * do not need to preserve the current contents.
+ */
+static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom,
+ xen_pfn_t l3pfn,
+ xen_pfn_t l3mfn)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ uint32_t *p2m_guest = domx86->p2m_guest;
+ xen_pfn_t new_l3mfn;
+ struct xc_mmu *mmu;
+ void *l3tab;
+
+ mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid);
+ if ( mmu == NULL )
+ {
+ DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__);
+ return l3mfn;
+ }
+
+ xc_dom_unmap_one(dom, l3pfn);
+
+ new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn);
+ if ( !new_l3mfn )
+ goto out;
+
+ p2m_guest[l3pfn] = dom->pv_p2m[l3pfn] = new_l3mfn;
+
+ if ( xc_add_mmu_update(dom->xch, mmu,
+ (((unsigned long long)new_l3mfn)
+ << XC_DOM_PAGE_SHIFT(dom)) |
+ MMU_MACHPHYS_UPDATE, l3pfn) )
+ goto out;
+
+ if ( xc_flush_mmu_updates(dom->xch, mmu) )
+ goto out;
+
+ /*
+ * This ensures that the entire pgtables_seg is mapped by a single
+ * mmap region. arch_setup_bootlate() relies on this to be able to
+ * unmap and pin the pagetables.
+ */
+ if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL )
+ goto out;
+
+ l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1);
+ if ( l3tab == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL",
+ __FUNCTION__);
+ goto out; /* our one call site will call xc_dom_panic and fail */
+ }
+ memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom));
+
+ DOMPRINTF("%s: successfully relocated L3 below 4G. "
+ "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")",
+ __FUNCTION__, l3pfn, l3mfn, new_l3mfn);
+
+ l3mfn = new_l3mfn;
+
+ out:
+ free(mmu);
+
+ return l3mfn;
+}
+
+static x86_pgentry_t *get_pg_table(struct xc_dom_image *dom, int m, int l)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map;
+ x86_pgentry_t *pg;
+
+ map = domx86->maps + m;
+ pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0);
+ if ( pg )
+ return pg;
+
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: xc_dom_pfn_to_ptr failed", __FUNCTION__);
+ return NULL;
+}
+
+static x86_pgentry_t get_pg_prot(struct xc_dom_image *dom, int l, xen_pfn_t pfn)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map;
+ xen_pfn_t pfn_s, pfn_e;
+ x86_pgentry_t prot;
+ unsigned m;
+
+ prot = domx86->params->lvl_prot[l];
+ if ( l > 0 )
+ return prot;
+
+ for ( m = 0; m < domx86->n_mappings; m++ )
+ {
+ map = domx86->maps + m;
+ pfn_s = map->lvls[domx86->params->levels - 1].pfn;
+ pfn_e = map->area.pgtables + pfn_s;
+ if ( pfn >= pfn_s && pfn < pfn_e )
+ return prot & ~_PAGE_RW;
+ }
+
+ return prot;
+}
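+/*
+ * get_pg_prot() clears _PAGE_RW on leaf entries that map page-table
+ * frames: Xen rejects PV page tables that remain reachable through
+ * writable mappings.
+ */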
+
+static int setup_pgtables_pv(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map1, *map2;
+ struct xc_dom_x86_mapping_lvl *lvl;
+ xen_vaddr_t from, to;
+ xen_pfn_t pfn, p, p_s, p_e;
+ x86_pgentry_t *pg;
+ unsigned m1, m2;
+ int l;
+
+ for ( l = domx86->params->levels - 1; l >= 0; l-- )
+ for ( m1 = 0; m1 < domx86->n_mappings; m1++ )
+ {
+ map1 = domx86->maps + m1;
+ from = map1->lvls[l].from;
+ to = map1->lvls[l].to;
+ pg = get_pg_table(dom, m1, l);
+ if ( !pg )
+ return -1;
+ for ( m2 = 0; m2 < domx86->n_mappings; m2++ )
+ {
+ map2 = domx86->maps + m2;
+ lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area;
+ if ( l > 0 && lvl->pgtables == 0 )
+ continue;
+ if ( lvl->from >= to || lvl->to <= from )
+ continue;
+ p_s = (max(from, lvl->from) - from) >>
+ (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
+ p_e = (min(to, lvl->to) - from) >>
+ (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
+ pfn = ((max(from, lvl->from) - lvl->from) >>
+ (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) + lvl->pfn;
+ for ( p = p_s; p <= p_e; p++ )
+ {
+ pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) |
+ get_pg_prot(dom, l, pfn);
+ pfn++;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ uint32_t *p2m_guest = domx86->p2m_guest;
+ xen_pfn_t l3mfn, l3pfn, i;
+
+ /* Copy dom->pv_p2m[] into the guest. */
+ for ( i = 0; i < dom->p2m_size; ++i )
+ {
+ if ( dom->pv_p2m[i] != INVALID_PFN )
+ p2m_guest[i] = dom->pv_p2m[i];
+ else
+ p2m_guest[i] = -1;
+ }
+
+ l3pfn = domx86->maps[0].lvls[2].pfn;
+ l3mfn = xc_dom_p2m(dom, l3pfn);
+ if ( dom->parms.pae == XEN_PAE_YES )
+ {
+ if ( l3mfn >= 0x100000 )
+ l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn);
+
+ if ( l3mfn >= 0x100000 )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,"%s: cannot move L3"
+ " below 4G. extended-cr3 not supported by guest. "
+ "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn")",
+ __FUNCTION__, l3pfn, l3mfn);
+ return -EINVAL;
+ }
+ }
+
+ return setup_pgtables_pv(dom);
+}
+
+/* ------------------------------------------------------------------------ */
+/* x86_64 pagetables */
+
+static int alloc_pgtables_x86_64(struct xc_dom_image *dom)
+{
+    static const struct xc_dom_params x86_64_params = {
+ .levels = PGTBL_LEVELS_X86_64,
+ .vaddr_mask = bits_to_mask(VIRT_BITS_X86_64),
+ .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
+ .lvl_prot[1 ... 3] =
+ _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
+ };
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+
+ domx86->params = &x86_64_params;
+
+ return alloc_pgtables_pv(dom);
+}
+
+static int setup_pgtables_x86_64(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ uint64_t *p2m_guest = domx86->p2m_guest;
+ xen_pfn_t i;
+
+ /* Copy dom->pv_p2m[] into the guest. */
+ for ( i = 0; i < dom->p2m_size; ++i )
+ {
+ if ( dom->pv_p2m[i] != INVALID_PFN )
+ p2m_guest[i] = dom->pv_p2m[i];
+ else
+ p2m_guest[i] = -1;
+ }
+
+ return setup_pgtables_pv(dom);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+
+ if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach",
+ 0, p2m_alloc_size) )
+ return -1;
+
+ domx86->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
+ if ( domx86->p2m_guest == NULL )
+ return -1;
+
+ return 0;
+}
+
+static int alloc_p2m_list_x86_32(struct xc_dom_image *dom)
+{
+ size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
+
+ p2m_alloc_size = round_pg_up(p2m_alloc_size);
+ return alloc_p2m_list(dom, p2m_alloc_size);
+}
+
+static int alloc_p2m_list_x86_64(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
+ size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
+ xen_vaddr_t from, to;
+ unsigned lvl;
+
+ p2m_alloc_size = round_pg_up(p2m_alloc_size);
+ if ( dom->parms.p2m_base != UNSET_ADDR )
+ {
+ from = dom->parms.p2m_base;
+ to = from + p2m_alloc_size - 1;
+ if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) )
+ return -1;
+
+ map->area.pfn = dom->pfn_alloc_end;
+ for ( lvl = 0; lvl < 4; lvl++ )
+ map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86;
+ domx86->n_mappings++;
+ p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86;
+ }
+
+ return alloc_p2m_list(dom, p2m_alloc_size);
+}
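+/*
+ * When the kernel supplies a fixed virtual address for the p2m list
+ * (p2m_base), the page tables mapping it are placed immediately after
+ * the list within the same segment: each level's table pfn is shifted
+ * past the p2m pages and the table pages are added to the allocation
+ * size.
+ */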
+
+/* ------------------------------------------------------------------------ */
+
+static int alloc_magic_pages_pv(struct xc_dom_image *dom)
+{
+ dom->start_info_pfn = xc_dom_alloc_page(dom, "start info");
+ if ( dom->start_info_pfn == INVALID_PFN )
+ return -1;
+
+ dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore");
+ if ( dom->xenstore_pfn == INVALID_PFN )
+ return -1;
+ xc_clear_domain_page(dom->xch, dom->guest_domid,
+ xc_dom_p2m(dom, dom->xenstore_pfn));
+
+ dom->console_pfn = xc_dom_alloc_page(dom, "console");
+ if ( dom->console_pfn == INVALID_PFN )
+ return -1;
+ xc_clear_domain_page(dom->xch, dom->guest_domid,
+ xc_dom_p2m(dom, dom->console_pfn));
+
+ dom->alloc_bootstack = 1;
+
+ return 0;
+}
+
+static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom)
+{
+ struct hvm_info_table *hvm_info = (struct hvm_info_table *)
+ (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
+ uint8_t sum;
+ int i;
+
+ memset(hvm_info_page, 0, PAGE_SIZE);
+
+ /* Fill in the header. */
+ memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature));
+ hvm_info->length = sizeof(struct hvm_info_table);
+
+ /* Sensible defaults: these can be overridden by the caller. */
+ hvm_info->apic_mode = 1;
+ hvm_info->nr_vcpus = 1;
+ memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online));
+
+ /* Memory parameters. */
+ hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT;
+ hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT;
+ hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);
+
+ /* Finish with the checksum. */
+ for ( i = 0, sum = 0; i < hvm_info->length; i++ )
+ sum += ((uint8_t *)hvm_info)[i];
+ hvm_info->checksum = -sum;
+}
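+/*
+ * The checksum is chosen so that all bytes of the table (including the
+ * checksum field itself) sum to zero modulo 256, letting consumers
+ * validate the table with a simple byte sum.
+ */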
+
+static int alloc_magic_pages_hvm(struct xc_dom_image *dom)
+{
+ unsigned long i;
+ uint32_t *ident_pt, domid = dom->guest_domid;
+ int rc;
+ xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES];
+ xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
+ xc_interface *xch = dom->xch;
+ size_t start_info_size = sizeof(struct hvm_start_info);
+
+ /* Allocate and clear special pages. */
+ for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ )
+ special_array[i] = special_pfn(i);
+
+ rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES,
+ 0, 0, special_array);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not allocate special pages.");
+ goto error_out;
+ }
+
+ if ( xc_clear_domain_pages(xch, domid, special_pfn(0),
+ X86_HVM_NR_SPECIAL_PAGES) )
+ goto error_out;
+
+ xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN,
+ special_pfn(SPECIALPAGE_XENSTORE));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN,
+ special_pfn(SPECIALPAGE_BUFIOREQ));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN,
+ special_pfn(SPECIALPAGE_IOREQ));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN,
+ special_pfn(SPECIALPAGE_CONSOLE));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN,
+ special_pfn(SPECIALPAGE_PAGING));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN,
+ special_pfn(SPECIALPAGE_ACCESS));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN,
+ special_pfn(SPECIALPAGE_SHARING));
+
+ start_info_size +=
+ sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
+
+ start_info_size +=
+ HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT;
+
+ start_info_size +=
+ dom->e820_entries * sizeof(struct hvm_memmap_table_entry);
+
+ if ( !dom->device_model )
+ {
+ if ( dom->cmdline )
+ {
+ dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8);
+ start_info_size += dom->cmdline_size;
+ }
+ }
+ else
+ {
+ /*
+ * Allocate and clear additional ioreq server pages. The default
+ * server will use the IOREQ and BUFIOREQ special pages above.
+ */
+ for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ )
+ ioreq_server_array[i] = ioreq_server_pfn(i);
+
+ rc = xc_domain_populate_physmap_exact(xch, domid, NR_IOREQ_SERVER_PAGES, 0,
+ 0, ioreq_server_array);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not allocate ioreq server pages.");
+ goto error_out;
+ }
+
+ if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0),
+ NR_IOREQ_SERVER_PAGES) )
+ goto error_out;
+
+ /* Tell the domain where the pages are and how many there are */
+ xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN,
+ ioreq_server_pfn(0));
+ xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
+ NR_IOREQ_SERVER_PAGES);
+ }
+
+ rc = xc_dom_alloc_segment(dom, &dom->start_info_seg,
+ "HVM start info", 0, start_info_size);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Unable to reserve memory for the start info");
+ goto out;
+ }
+
+ /*
+ * Identity-map page table is required for running with CR0.PG=0 when
+ * using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
+ */
+ if ( (ident_pt = xc_map_foreign_range(
+ xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
+ goto error_out;
+ for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+ ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ munmap(ident_pt, PAGE_SIZE);
+ xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT,
+ special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
+
+ dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
+
+ dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE);
+ xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
+
+ dom->parms.virt_hypercall = -1;
+
+ rc = 0;
+ goto out;
+ error_out:
+ rc = -1;
+ out:
+
+ return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int start_info_x86_32(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ start_info_x86_32_t *start_info =
+ xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
+ xen_pfn_t shinfo =
+ xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ if ( start_info == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
+ return -1; /* our caller throws away our return value :-/ */
+ }
+
+ memset(start_info, 0, sizeof(*start_info));
+ strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
+ start_info->magic[sizeof(start_info->magic) - 1] = '\0';
+ start_info->nr_pages = dom->total_pages;
+ start_info->shared_info = shinfo << PAGE_SHIFT_X86;
+ start_info->pt_base = dom->pgtables_seg.vstart;
+ start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
+ start_info->mfn_list = dom->p2m_seg.vstart;
+
+ start_info->flags = dom->flags;
+ start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
+ start_info->store_evtchn = dom->xenstore_evtchn;
+ start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
+ start_info->console.domU.evtchn = dom->console_evtchn;
+
+ if ( dom->modules[0].blob )
+ {
+ start_info->mod_start = dom->initrd_start;
+ start_info->mod_len = dom->initrd_len;
+ }
+
+ if ( dom->cmdline )
+ {
+ strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
+ start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
+ }
+
+ return 0;
+}
+
+static int start_info_x86_64(struct xc_dom_image *dom)
+{
+ struct xc_dom_image_x86 *domx86 = dom->arch_private;
+ start_info_x86_64_t *start_info =
+ xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
+ xen_pfn_t shinfo =
+ xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ if ( start_info == NULL )
+ {
+ DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
+ return -1; /* our caller throws away our return value :-/ */
+ }
+
+ memset(start_info, 0, sizeof(*start_info));
+ strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
+ start_info->magic[sizeof(start_info->magic) - 1] = '\0';
+ start_info->nr_pages = dom->total_pages;
+ start_info->shared_info = shinfo << PAGE_SHIFT_X86;
+ start_info->pt_base = dom->pgtables_seg.vstart;
+ start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
+ start_info->mfn_list = dom->p2m_seg.vstart;
+ if ( dom->parms.p2m_base != UNSET_ADDR )
+ {
+ start_info->first_p2m_pfn = dom->p2m_seg.pfn;
+ start_info->nr_p2m_frames = dom->p2m_seg.pages;
+ }
+
+ start_info->flags = dom->flags;
+ start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
+ start_info->store_evtchn = dom->xenstore_evtchn;
+ start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
+ start_info->console.domU.evtchn = dom->console_evtchn;
+
+ if ( dom->modules[0].blob )
+ {
+ start_info->mod_start = dom->initrd_start;
+ start_info->mod_len = dom->initrd_len;
+ }
+
+ if ( dom->cmdline )
+ {
+ strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
+ start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
+ }
+
+ return 0;
+}
+
+static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr)
+{
+ shared_info_x86_32_t *shared_info = ptr;
+ int i;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ memset(shared_info, 0, sizeof(*shared_info));
+ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+ shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
+ return 0;
+}
+
+static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr)
+{
+ shared_info_x86_64_t *shared_info = ptr;
+ int i;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ memset(shared_info, 0, sizeof(*shared_info));
+ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+ shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
+ return 0;
+}
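+/*
+ * Both variants start every legacy vCPU with event delivery masked;
+ * the guest clears evtchn_upcall_mask itself once it is ready to
+ * handle event upcalls.
+ */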
+
+/* ------------------------------------------------------------------------ */
+
+static int vcpu_x86_32(struct xc_dom_image *dom)
+{
+ vcpu_guest_context_any_t any_ctx;
+ vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32;
+ xen_pfn_t cr3_pfn;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* clear everything */
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->user_regs.eip = dom->parms.virt_entry;
+ ctxt->user_regs.esp =
+ dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
+ ctxt->user_regs.esi =
+ dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
+ ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */
+
+ ctxt->debugreg[6] = X86_DR6_DEFAULT;
+ ctxt->debugreg[7] = X86_DR7_DEFAULT;
+
+ ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32;
+ if ( dom->parms.pae == XEN_PAE_EXTCR3 ||
+ dom->parms.pae == XEN_PAE_BIMODAL )
+ ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
+
+ cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn);
+ DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
+ __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
+
+ ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32;
+ ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32;
+ ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32;
+ ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32;
+ ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32;
+ ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32;
+
+ ctxt->kernel_ss = ctxt->user_regs.ss;
+ ctxt->kernel_sp = ctxt->user_regs.esp;
+
+ rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
+
+ return rc;
+}
+
+static int vcpu_x86_64(struct xc_dom_image *dom)
+{
+ vcpu_guest_context_any_t any_ctx;
+ vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64;
+ xen_pfn_t cr3_pfn;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ /* clear everything */
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->user_regs.rip = dom->parms.virt_entry;
+ ctxt->user_regs.rsp =
+ dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
+ ctxt->user_regs.rsi =
+ dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
+ ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */
+
+ ctxt->debugreg[6] = X86_DR6_DEFAULT;
+ ctxt->debugreg[7] = X86_DR7_DEFAULT;
+
+ ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64;
+ cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn);
+ DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
+ __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
+
+ ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64;
+ ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64;
+ ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64;
+ ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64;
+ ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64;
+ ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64;
+
+ ctxt->kernel_ss = ctxt->user_regs.ss;
+ ctxt->kernel_sp = ctxt->user_regs.esp;
+
+ rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
+
+ return rc;
+}
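+/*
+ * In outline, the PV boot ABI set up by both variants above is: entry
+ * at the kernel's virtual entry point with ESI/RSI pointing at the
+ * start_info page, ESP/RSP at the top of the boot stack, the interrupt
+ * flag set in EFLAGS/RFLAGS, and CR3 loaded with the top-level page
+ * table.
+ */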
+
+static const void *hvm_get_save_record(const void *ctx, unsigned int type,
+ unsigned int instance)
+{
+ const struct hvm_save_descriptor *header;
+
+ for ( header = ctx;
+ header->typecode != HVM_SAVE_CODE(END);
+ ctx += sizeof(*header) + header->length, header = ctx )
+ if ( header->typecode == type && header->instance == instance )
+ return ctx + sizeof(*header);
+
+ return NULL;
+}
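+/*
+ * The HVM context blob is a sequence of (descriptor, payload) pairs
+ * terminated by an END record; hvm_get_save_record() walks it by
+ * advancing past each descriptor plus its declared payload length.
+ */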
+
+static int vcpu_hvm(struct xc_dom_image *dom)
+{
+ struct {
+ struct hvm_save_descriptor header_d;
+ HVM_SAVE_TYPE(HEADER) header;
+ struct hvm_save_descriptor cpu_d;
+ HVM_SAVE_TYPE(CPU) cpu;
+ struct hvm_save_descriptor end_d;
+ HVM_SAVE_TYPE(END) end;
+ } bsp_ctx;
+ uint8_t *full_ctx = NULL;
+ int rc;
+
+ DOMPRINTF_CALLED(dom->xch);
+
+ assert(dom->max_vcpus);
+
+ /*
+ * Get the full HVM context in order to have the header, it is not
+ * possible to get the header with getcontext_partial, and crafting one
+ * from userspace is also not an option since cpuid is trapped and
+ * modified by Xen.
+ */
+
+ rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0);
+ if ( rc <= 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: unable to fetch HVM context size (rc=%d)",
+ __func__, rc);
+ goto out;
+ }
+
+ full_ctx = calloc(1, rc);
+ if ( full_ctx == NULL )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: unable to allocate memory for HVM context (rc=%d)",
+ __func__, rc);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc);
+ if ( rc <= 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: unable to fetch HVM context (rc=%d)",
+ __func__, rc);
+ goto out;
+ }
+
+ /* Copy the header to our partial context. */
+ memset(&bsp_ctx, 0, sizeof(bsp_ctx));
+ memcpy(&bsp_ctx, full_ctx,
+ sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER));
+
+ /* Set the CPU descriptor. */
+ bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU);
+ bsp_ctx.cpu_d.instance = 0;
+ bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU);
+
+ /* Set the cached part of the relevant segment registers. */
+ bsp_ctx.cpu.cs_base = 0;
+ bsp_ctx.cpu.ds_base = 0;
+ bsp_ctx.cpu.es_base = 0;
+ bsp_ctx.cpu.ss_base = 0;
+ bsp_ctx.cpu.tr_base = 0;
+ bsp_ctx.cpu.cs_limit = ~0u;
+ bsp_ctx.cpu.ds_limit = ~0u;
+ bsp_ctx.cpu.es_limit = ~0u;
+ bsp_ctx.cpu.ss_limit = ~0u;
+ bsp_ctx.cpu.tr_limit = 0x67;
+ bsp_ctx.cpu.cs_arbytes = 0xc9b;
+ bsp_ctx.cpu.ds_arbytes = 0xc93;
+ bsp_ctx.cpu.es_arbytes = 0xc93;
+ bsp_ctx.cpu.ss_arbytes = 0xc93;
+ bsp_ctx.cpu.tr_arbytes = 0x8b;
+
+ /* Set the control registers. */
+ bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET;
+
+ /* Set the IP. */
+ bsp_ctx.cpu.rip = dom->parms.phys_entry;
+
+ bsp_ctx.cpu.dr6 = X86_DR6_DEFAULT;
+ bsp_ctx.cpu.dr7 = X86_DR7_DEFAULT;
+
+ if ( dom->start_info_seg.pfn )
+ bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT;
+
+ /* Set the end descriptor. */
+ bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END);
+ bsp_ctx.end_d.instance = 0;
+ bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END);
+
+ /* TODO: maybe this should be a firmware option instead? */
+ if ( !dom->device_model )
+ {
+ struct {
+ struct hvm_save_descriptor header_d;
+ HVM_SAVE_TYPE(HEADER) header;
+ struct hvm_save_descriptor mtrr_d;
+ HVM_SAVE_TYPE(MTRR) mtrr;
+ struct hvm_save_descriptor end_d;
+ HVM_SAVE_TYPE(END) end;
+ } mtrr = {
+ .header_d = bsp_ctx.header_d,
+ .header = bsp_ctx.header,
+ .mtrr_d.typecode = HVM_SAVE_CODE(MTRR),
+ .mtrr_d.length = HVM_SAVE_LENGTH(MTRR),
+ .end_d = bsp_ctx.end_d,
+ .end = bsp_ctx.end,
+ };
+ const HVM_SAVE_TYPE(MTRR) *mtrr_record =
+ hvm_get_save_record(full_ctx, HVM_SAVE_CODE(MTRR), 0);
+ unsigned int i;
+
+ if ( !mtrr_record )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: unable to get MTRR save record", __func__);
+ goto out;
+ }
+
+ memcpy(&mtrr.mtrr, mtrr_record, sizeof(mtrr.mtrr));
+
+ /*
+ * Enable MTRR, set default type to WB.
+ * TODO: add MMIO areas as UC when passthrough is supported.
+ */
+ mtrr.mtrr.msr_mtrr_def_type = MTRR_TYPE_WRBACK | MTRR_DEF_TYPE_ENABLE;
+
+ for ( i = 0; i < dom->max_vcpus; i++ )
+ {
+ mtrr.mtrr_d.instance = i;
+ rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
+ (uint8_t *)&mtrr, sizeof(mtrr));
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);
+ }
+ }
+
+ /*
+ * Loading the BSP context should be done in the last call to setcontext,
+ * since each setcontext call will put all vCPUs down.
+ */
+ rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
+ (uint8_t *)&bsp_ctx, sizeof(bsp_ctx));
+ if ( rc != 0 )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);
+
+ out:
+ free(full_ctx);
+ return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type)
+{
+ static const struct {
+ char *guest;
+ uint32_t size;
+ } types[] = {
+ { "xen-3.0-x86_32p", 32 },
+ { "xen-3.0-x86_64", 64 },
+ };
+ DECLARE_DOMCTL;
+    int i, rc;
+
+ memset(&domctl, 0, sizeof(domctl));
+ domctl.domain = domid;
+ domctl.cmd = XEN_DOMCTL_set_address_size;
+ for ( i = 0; i < ARRAY_SIZE(types); i++ )
+ if ( !strcmp(types[i].guest, guest_type) )
+ domctl.u.address_size.size = types[i].size;
+ if ( domctl.u.address_size.size == 0 )
+ /* nothing to do */
+ return 0;
+
+ xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
+ guest_type, domctl.u.address_size.size);
+ rc = do_domctl(xch, &domctl);
+ if ( rc != 0 )
+ xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
+ __FUNCTION__, rc);
+ return rc;
+}
+
+static int meminit_pv(struct xc_dom_image *dom)
+{
+ int rc;
+ xen_pfn_t pfn, allocsz, mfn, total, pfn_base;
+ int i, j, k;
+ xen_vmemrange_t dummy_vmemrange[1];
+ unsigned int dummy_vnode_to_pnode[1];
+ xen_vmemrange_t *vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vmemranges, nr_vnodes;
+
+ rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
+ if ( rc )
+ return rc;
+
+    /* Try to claim pages up front for early warning of insufficient memory. */
+ if ( dom->claim_enabled )
+ {
+ rc = xc_domain_claim_pages(dom->xch, dom->guest_domid,
+ dom->total_pages);
+ if ( rc )
+ return rc;
+ }
+
+    /* Set up dummy vNUMA information when none is provided; libxl
+     * legitimately may supply no vNUMA configuration at all.
+     *
+     * The dummy values make libxc allocate all pages from arbitrary
+     * physical nodes, which is the expected behaviour in that case.
+     *
+     * This hunk exists purely for the convenience of the allocation
+     * code below; no defaulting happens in libxc.
+     */
+ if ( dom->nr_vmemranges == 0 )
+ {
+ nr_vmemranges = 1;
+ vmemranges = dummy_vmemrange;
+ vmemranges[0].start = 0;
+ vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT;
+ vmemranges[0].flags = 0;
+ vmemranges[0].nid = 0;
+
+ nr_vnodes = 1;
+ vnode_to_pnode = dummy_vnode_to_pnode;
+ vnode_to_pnode[0] = XC_NUMA_NO_NODE;
+ }
+ else
+ {
+ nr_vmemranges = dom->nr_vmemranges;
+ nr_vnodes = dom->nr_vnodes;
+ vmemranges = dom->vmemranges;
+ vnode_to_pnode = dom->vnode_to_pnode;
+ }
+
+ total = dom->p2m_size = 0;
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT);
+ dom->p2m_size = max(dom->p2m_size,
+ (xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT));
+ }
+ if ( total != dom->total_pages )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")",
+ __func__, total, dom->total_pages);
+ return -EINVAL;
+ }
+
+ dom->pv_p2m = xc_dom_malloc(dom, sizeof(*dom->pv_p2m) * dom->p2m_size);
+ if ( dom->pv_p2m == NULL )
+ return -EINVAL;
+ for ( pfn = 0; pfn < dom->p2m_size; pfn++ )
+ dom->pv_p2m[pfn] = INVALID_PFN;
+
+ /* allocate guest memory */
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ unsigned int memflags;
+ uint64_t pages, super_pages;
+ unsigned int pnode = vnode_to_pnode[vmemranges[i].nid];
+ xen_pfn_t extents[SUPERPAGE_BATCH_SIZE];
+ xen_pfn_t pfn_base_idx;
+
+ memflags = 0;
+ if ( pnode != XC_NUMA_NO_NODE )
+ memflags |= XENMEMF_exact_node(pnode);
+
+ pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT;
+ super_pages = pages >> SUPERPAGE_2MB_SHIFT;
+ pfn_base = vmemranges[i].start >> PAGE_SHIFT;
+
+ for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ )
+ dom->pv_p2m[pfn] = pfn;
+
+ pfn_base_idx = pfn_base;
+ while ( super_pages ) {
+ uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE);
+ super_pages -= count;
+
+ for ( pfn = pfn_base_idx, j = 0;
+ pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT);
+ pfn += SUPERPAGE_2MB_NR_PFNS, j++ )
+ extents[j] = dom->pv_p2m[pfn];
+ rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
+ SUPERPAGE_2MB_SHIFT, memflags,
+ extents);
+ if ( rc < 0 )
+ return rc;
+
+ /* Expand the returned mfns into the p2m array. */
+ pfn = pfn_base_idx;
+ for ( j = 0; j < rc; j++ )
+ {
+ mfn = extents[j];
+ for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ )
+ dom->pv_p2m[pfn] = mfn + k;
+ }
+ pfn_base_idx = pfn;
+ }
+
+ for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz )
+ {
+ allocsz = min_t(uint64_t, 1024 * 1024, pages - j);
+ rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
+ allocsz, 0, memflags, &dom->pv_p2m[pfn_base + j]);
+
+ if ( rc )
+ {
+ if ( pnode != XC_NUMA_NO_NODE )
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)",
+ __func__, pages, i, pnode);
+ else
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: failed to allocate 0x%"PRIx64" pages",
+ __func__, pages);
+ return rc;
+ }
+ }
+ rc = 0;
+ }
+
+    /* Ensure no unclaimed pages are left unused. This is safe to call
+     * even if the earlier claim call was not made. */
+ xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */);
+
+ return rc;
+}
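+/*
+ * Note: meminit_pv() populates each vmemrange with 2MB superpage
+ * extents first, in batches of SUPERPAGE_BATCH_SIZE, and then fills
+ * whatever remains with order-0 allocations.
+ */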
+
+/*
+ * Check whether the MMIO hole overlaps the specified memory range.
+ * Returns 1 if it does, else 0.
+ */
+static int check_mmio_hole(uint64_t start, uint64_t memsize,
+ uint64_t mmio_start, uint64_t mmio_size)
+{
+ if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size )
+ return 0;
+ else
+ return 1;
+}
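+/*
+ * This is the usual half-open interval overlap test: [start,
+ * start+memsize) and [mmio_start, mmio_start+mmio_size) intersect
+ * unless one range ends at or before the point where the other begins.
+ */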
+
+static int meminit_hvm(struct xc_dom_image *dom)
+{
+ unsigned long i, vmemid, nr_pages = dom->total_pages;
+ unsigned long p2m_size;
+ unsigned long target_pages = dom->target_pages;
+ unsigned long cur_pages, cur_pfn;
+ int rc;
+ unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
+ stat_1gb_pages = 0;
+ unsigned int memflags = 0;
+ int claim_enabled = dom->claim_enabled;
+ uint64_t total_pages;
+ xen_vmemrange_t dummy_vmemrange[2];
+ unsigned int dummy_vnode_to_pnode[1];
+ xen_vmemrange_t *vmemranges;
+ unsigned int *vnode_to_pnode;
+ unsigned int nr_vmemranges, nr_vnodes;
+ xc_interface *xch = dom->xch;
+ uint32_t domid = dom->guest_domid;
+
+ if ( nr_pages > target_pages )
+ memflags |= XENMEMF_populate_on_demand;
+
+ if ( dom->nr_vmemranges == 0 )
+ {
+ /* Build dummy vnode information
+ *
+ * Guest physical address space layout:
+ * [0, hole_start) [hole_start, 4G) [4G, highmem_end)
+ *
+ * Of course if there is no high memory, the second vmemrange
+ * has no effect on the actual result.
+ */
+
+ dummy_vmemrange[0].start = 0;
+ dummy_vmemrange[0].end = dom->lowmem_end;
+ dummy_vmemrange[0].flags = 0;
+ dummy_vmemrange[0].nid = 0;
+ nr_vmemranges = 1;
+
+ if ( dom->highmem_end > (1ULL << 32) )
+ {
+ dummy_vmemrange[1].start = 1ULL << 32;
+ dummy_vmemrange[1].end = dom->highmem_end;
+ dummy_vmemrange[1].flags = 0;
+ dummy_vmemrange[1].nid = 0;
+
+ nr_vmemranges++;
+ }
+
+ dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
+ nr_vnodes = 1;
+ vmemranges = dummy_vmemrange;
+ vnode_to_pnode = dummy_vnode_to_pnode;
+ }
+ else
+ {
+ if ( nr_pages > target_pages )
+ {
+ DOMPRINTF("Cannot enable vNUMA and PoD at the same time");
+ goto error_out;
+ }
+
+ nr_vmemranges = dom->nr_vmemranges;
+ nr_vnodes = dom->nr_vnodes;
+ vmemranges = dom->vmemranges;
+ vnode_to_pnode = dom->vnode_to_pnode;
+ }
+
+ total_pages = 0;
+ p2m_size = 0;
+ for ( i = 0; i < nr_vmemranges; i++ )
+ {
+ DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64, vmemranges[i].start, vmemranges[i].end);
+
+ total_pages += ((vmemranges[i].end - vmemranges[i].start)
+ >> PAGE_SHIFT);
+ p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
+ p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
+ }
+
+ if ( total_pages != nr_pages )
+ {
+ DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)",
+ total_pages, nr_pages);
+ goto error_out;
+ }
+
+ dom->p2m_size = p2m_size;
+
+ /*
+ * Try to claim pages for early warning of insufficient memory available.
+     * This should go before xc_domain_set_pod_target, because that function
+ * actually allocates memory for the guest. Claiming after memory has been
+ * allocated is pointless.
+ */
+ if ( claim_enabled ) {
+ rc = xc_domain_claim_pages(xch, domid,
+ target_pages - dom->vga_hole_size);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!");
+ goto error_out;
+ }
+ }
+
+ if ( memflags & XENMEMF_populate_on_demand )
+ {
+ /*
+ * Subtract VGA_HOLE_SIZE from target_pages for the VGA
+ * "hole". Xen will adjust the PoD cache size so that domain
+ * tot_pages will be target_pages - VGA_HOLE_SIZE after
+ * this call.
+ */
+ rc = xc_domain_set_pod_target(xch, domid,
+ target_pages - dom->vga_hole_size,
+ NULL, NULL, NULL);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not set PoD target for HVM guest.\n");
+ goto error_out;
+ }
+ }
+
+ /*
+ * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
+ *
+     * We attempt to allocate 1GB pages if possible, falling back to
+     * 2MB pages if that fails, and finally to 4KB pages if both fail.
+ */
+ if ( dom->device_model )
+ {
+ xen_pfn_t extents[0xa0];
+
+ for ( i = 0; i < ARRAY_SIZE(extents); ++i )
+ extents[i] = i;
+
+ rc = xc_domain_populate_physmap_exact(
+ xch, domid, 0xa0, 0, memflags, extents);
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not populate low memory (< 0xA0).\n");
+ goto error_out;
+ }
+ }
+
+ stat_normal_pages = 0;
+ for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
+ {
+ unsigned int new_memflags = memflags;
+ uint64_t end_pages;
+ unsigned int vnode = vmemranges[vmemid].nid;
+ unsigned int pnode = vnode_to_pnode[vnode];
+
+ if ( pnode != XC_NUMA_NO_NODE )
+ new_memflags |= XENMEMF_exact_node(pnode);
+
+ end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
+        /*
+         * Consider the VGA hole as belonging to the vmemrange that covers
+         * 0xA0000-0xC0000. Note that 0x00000-0xA0000 is populated just
+         * before this loop.
+         */
+ if ( vmemranges[vmemid].start == 0 && dom->device_model )
+ {
+ cur_pages = 0xc0;
+ stat_normal_pages += 0xc0;
+ }
+ else
+ cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;
+
+ rc = 0;
+ while ( (rc == 0) && (end_pages > cur_pages) )
+ {
+ /* Clip count to maximum 1GB extent. */
+ unsigned long count = end_pages - cur_pages;
+ unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
+
+ if ( count > max_pages )
+ count = max_pages;
+
+ cur_pfn = cur_pages;
+
+            /* Handle the corner cases of unaligned superpage heads and tails */
+ if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+ (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
+ count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
+ else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
+ (count > SUPERPAGE_1GB_NR_PFNS) )
+ count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
+
+            /* Attempt to allocate a 1GB superpage. Because each pass
+             * allocates at most 1GB, no clipping to superpage
+             * boundaries is needed.
+             */
+ if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
+                 /* and only if no MMIO hole intersects this
+                  * 1GB range */
+ !check_mmio_hole(cur_pfn << PAGE_SHIFT,
+ SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
+ dom->mmio_start, dom->mmio_size) )
+ {
+ long done;
+ unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
+ xen_pfn_t sp_extents[nr_extents];
+
+ for ( i = 0; i < nr_extents; i++ )
+ sp_extents[i] = cur_pages + (i << SUPERPAGE_1GB_SHIFT);
+
+ done = xc_domain_populate_physmap(xch, domid, nr_extents,
+ SUPERPAGE_1GB_SHIFT,
+ new_memflags, sp_extents);
+
+ if ( done > 0 )
+ {
+ stat_1gb_pages += done;
+ done <<= SUPERPAGE_1GB_SHIFT;
+ cur_pages += done;
+ count -= done;
+ }
+ }
+
+ if ( count != 0 )
+ {
+ /* Clip count to maximum 8MB extent. */
+ max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
+ if ( count > max_pages )
+ count = max_pages;
+
+ /* Clip partial superpage extents to superpage
+ * boundaries. */
+ if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+ (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
+ count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
+ else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
+ (count > SUPERPAGE_2MB_NR_PFNS) )
+ count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
+
+ /* Attempt to allocate superpage extents. */
+ if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
+ {
+ long done;
+ unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
+ xen_pfn_t sp_extents[nr_extents];
+
+ for ( i = 0; i < nr_extents; i++ )
+ sp_extents[i] = cur_pages + (i << SUPERPAGE_2MB_SHIFT);
+
+ done = xc_domain_populate_physmap(xch, domid, nr_extents,
+ SUPERPAGE_2MB_SHIFT,
+ new_memflags, sp_extents);
+
+ if ( done > 0 )
+ {
+ stat_2mb_pages += done;
+ done <<= SUPERPAGE_2MB_SHIFT;
+ cur_pages += done;
+ count -= done;
+ }
+ }
+ }
+
+ /* Fall back to 4kB extents. */
+ if ( count != 0 )
+ {
+ xen_pfn_t extents[count];
+
+ for ( i = 0; i < count; ++i )
+ extents[i] = cur_pages + i;
+
+ rc = xc_domain_populate_physmap_exact(
+ xch, domid, count, 0, new_memflags, extents);
+ cur_pages += count;
+ stat_normal_pages += count;
+ }
+ }
+
+ if ( rc != 0 )
+ {
+ DOMPRINTF("Could not allocate memory for HVM guest.");
+ goto error_out;
+ }
+ }
+
+ DPRINTF("PHYSICAL MEMORY ALLOCATION:\n");
+ DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages);
+ DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages);
+ DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages);
+
+ rc = 0;
+ goto out;
+ error_out:
+ rc = -1;
+ out:
+
+ /* ensure no unclaimed pages are left unused */
+ xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */);
+
+ return rc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static int bootearly(struct xc_dom_image *dom)
+{
+ if ( dom->container_type == XC_DOM_PV_CONTAINER &&
+ elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active) )
+ {
+ DOMPRINTF("PV Autotranslate guests no longer supported");
+ errno = EOPNOTSUPP;
+ return -1;
+ }
+
+ return 0;
+}
+
+static int bootlate_pv(struct xc_dom_image *dom)
+{
+ static const struct {
+ char *guest;
+ unsigned long pgd_type;
+ } types[] = {
+ { "xen-3.0-x86_32", MMUEXT_PIN_L2_TABLE},
+ { "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE},
+ { "xen-3.0-x86_64", MMUEXT_PIN_L4_TABLE},
+ };
+ unsigned long pgd_type = 0;
+ shared_info_t *shared_info;
+ xen_pfn_t shinfo;
+ int i, rc;
+
+ for ( i = 0; i < ARRAY_SIZE(types); i++ )
+ if ( !strcmp(types[i].guest, dom->guest_type) )
+ pgd_type = types[i].pgd_type;
+
+ /* Drop references to all initial page tables before pinning. */
+ xc_dom_unmap_one(dom, dom->pgtables_seg.pfn);
+ xc_dom_unmap_one(dom, dom->p2m_seg.pfn);
+ rc = pin_table(dom->xch, pgd_type,
+ xc_dom_p2m(dom, dom->pgtables_seg.pfn),
+ dom->guest_domid);
+ if ( rc != 0 )
+ {
+ xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+ "%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)",
+ __FUNCTION__, dom->pgtables_seg.pfn, rc);
+ return rc;
+ }
+ shinfo = dom->shared_info_mfn;
+
+ /* setup shared_info page */
+ DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "",
+ __FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn);
+ shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid,
+ PAGE_SIZE_X86,
+ PROT_READ | PROT_WRITE,
+ shinfo);
+ if ( shared_info == NULL )
+ return -1;
+ dom->arch_hooks->shared_info(dom, shared_info);
+ munmap(shared_info, PAGE_SIZE_X86);
+
+ return 0;
+}
+
+/*
+ * The memory layout of the start_info page and the modules, and where the
+ * addresses are stored:
+ *
+ * /----------------------------------\
+ * | struct hvm_start_info |
+ * +----------------------------------+ <- start_info->modlist_paddr
+ * | struct hvm_modlist_entry[0] |
+ * +----------------------------------+
+ * | struct hvm_modlist_entry[1] |
+ * +----------------------------------+ <- modlist[0].cmdline_paddr
+ * | cmdline of module 0 |
+ * | char[HVMLOADER_MODULE_CMDLINE_SIZE]
+ * +----------------------------------+ <- modlist[1].cmdline_paddr
+ * | cmdline of module 1 |
+ * +----------------------------------+
+ */
+static void add_module_to_list(struct xc_dom_image *dom,
+ struct xc_hvm_firmware_module *module,
+ const char *cmdline,
+ struct hvm_modlist_entry *modlist,
+ struct hvm_start_info *start_info)
+{
+ uint32_t index = start_info->nr_modules;
+ void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT;
+ uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
+ ((uintptr_t)modlist - (uintptr_t)start_info);
+ uint64_t modules_cmdline_paddr = modlist_paddr +
+ sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
+
+ if ( module->length == 0 )
+ return;
+
+ assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT);
+
+ modlist[index].paddr = module->guest_addr_out;
+ modlist[index].size = module->length;
+
+ if ( cmdline )
+ {
+ assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE)
+ < HVMLOADER_MODULE_CMDLINE_SIZE);
+ strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index,
+ cmdline, HVMLOADER_MODULE_CMDLINE_SIZE);
+ modlist[index].cmdline_paddr = modules_cmdline_paddr +
+ HVMLOADER_MODULE_CMDLINE_SIZE * index;
+ }
+
+ start_info->nr_modules++;
+}
+
+static int bootlate_hvm(struct xc_dom_image *dom)
+{
+ uint32_t domid = dom->guest_domid;
+ xc_interface *xch = dom->xch;
+ struct hvm_start_info *start_info;
+ size_t modsize;
+ struct hvm_modlist_entry *modlist;
+ struct hvm_memmap_table_entry *memmap;
+ unsigned int i;
+
+ start_info = xc_map_foreign_range(xch, domid, dom->start_info_seg.pages <<
+ XC_DOM_PAGE_SHIFT(dom),
+ PROT_READ | PROT_WRITE,
+ dom->start_info_seg.pfn);
+ if ( start_info == NULL )
+ {
+ DOMPRINTF("Unable to map HVM start info page");
+ return -1;
+ }
+
+ modlist = (void*)(start_info + 1) + dom->cmdline_size;
+
+ if ( !dom->device_model )
+ {
+ if ( dom->cmdline )
+ {
+ char *cmdline = (void*)(start_info + 1);
+
+ strncpy(cmdline, dom->cmdline, dom->cmdline_size);
+ start_info->cmdline_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
+ ((uintptr_t)cmdline - (uintptr_t)start_info);
+ }
+
+ /* ACPI module 0 is the RSDP */
+ start_info->rsdp_paddr = dom->acpi_modules[0].guest_addr_out ? : 0;
+ }
+ else
+ {
+ add_module_to_list(dom, &dom->system_firmware_module, "firmware",
+ modlist, start_info);
+ }
+
+ for ( i = 0; i < dom->num_modules; i++ )
+ {
+ struct xc_hvm_firmware_module mod;
+ uint64_t base = dom->parms.virt_base != UNSET_ADDR ?
+ dom->parms.virt_base : 0;
+
+ mod.guest_addr_out =
+ dom->modules[i].seg.vstart - base;
+ mod.length =
+ dom->modules[i].seg.vend - dom->modules[i].seg.vstart;
+
+ DOMPRINTF("Adding module %u guest_addr %"PRIx64" len %u",
+ i, mod.guest_addr_out, mod.length);
+
+ add_module_to_list(dom, &mod, dom->modules[i].cmdline,
+ modlist, start_info);
+ }
+
+ if ( start_info->nr_modules )
+ {
+ start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
+ ((uintptr_t)modlist - (uintptr_t)start_info);
+ }
+
+ /*
+ * Check a couple of XEN_HVM_MEMMAP_TYPEs to verify consistency with
+ * their corresponding e820 numerical values.
+ */
+ BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_RAM != E820_RAM);
+ BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_ACPI != E820_ACPI);
+
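+    /*
+     * The memory map is placed directly after the fixed-size module
+     * list and its command-line area; see the layout diagram above
+     * add_module_to_list().
+     */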
+ modsize = HVMLOADER_MODULE_MAX_COUNT *
+ (sizeof(*modlist) + HVMLOADER_MODULE_CMDLINE_SIZE);
+ memmap = (void*)modlist + modsize;
+
+ start_info->memmap_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
+ ((uintptr_t)modlist - (uintptr_t)start_info) + modsize;
+ start_info->memmap_entries = dom->e820_entries;
+ for ( i = 0; i < dom->e820_entries; i++ )
+ {
+ memmap[i].addr = dom->e820[i].addr;
+ memmap[i].size = dom->e820[i].size;
+ memmap[i].type = dom->e820[i].type;
+ }
+
+ start_info->magic = XEN_HVM_START_MAGIC_VALUE;
+ start_info->version = 1;
+
+ munmap(start_info, dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom));
+
+ if ( dom->device_model )
+ {
+ void *hvm_info_page;
+
+ if ( (hvm_info_page = xc_map_foreign_range(
+ xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ HVM_INFO_PFN)) == NULL )
+ return -1;
+ build_hvm_info(hvm_info_page, dom);
+ munmap(hvm_info_page, PAGE_SIZE);
+ }
+
+ return 0;
+}
+
+bool xc_dom_translated(const struct xc_dom_image *dom)
+{
+ /* HVM guests are translated. PV guests are not. */
+ return dom->container_type == XC_DOM_HVM_CONTAINER;
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct xc_dom_arch xc_dom_32_pae = {
+ .guest_type = "xen-3.0-x86_32p",
+ .native_protocol = XEN_IO_PROTO_ABI_X86_32,
+ .page_shift = PAGE_SHIFT_X86,
+ .sizeof_pfn = 4,
+ .p2m_base_supported = 0,
+ .arch_private_size = sizeof(struct xc_dom_image_x86),
+ .alloc_magic_pages = alloc_magic_pages_pv,
+ .alloc_pgtables = alloc_pgtables_x86_32_pae,
+ .alloc_p2m_list = alloc_p2m_list_x86_32,
+ .setup_pgtables = setup_pgtables_x86_32_pae,
+ .start_info = start_info_x86_32,
+ .shared_info = shared_info_x86_32,
+ .vcpu = vcpu_x86_32,
+ .meminit = meminit_pv,
+ .bootearly = bootearly,
+ .bootlate = bootlate_pv,
+};
+
+static struct xc_dom_arch xc_dom_64 = {
+ .guest_type = "xen-3.0-x86_64",
+ .native_protocol = XEN_IO_PROTO_ABI_X86_64,
+ .page_shift = PAGE_SHIFT_X86,
+ .sizeof_pfn = 8,
+ .p2m_base_supported = 1,
+ .arch_private_size = sizeof(struct xc_dom_image_x86),
+ .alloc_magic_pages = alloc_magic_pages_pv,
+ .alloc_pgtables = alloc_pgtables_x86_64,
+ .alloc_p2m_list = alloc_p2m_list_x86_64,
+ .setup_pgtables = setup_pgtables_x86_64,
+ .start_info = start_info_x86_64,
+ .shared_info = shared_info_x86_64,
+ .vcpu = vcpu_x86_64,
+ .meminit = meminit_pv,
+ .bootearly = bootearly,
+ .bootlate = bootlate_pv,
+};
+
+static struct xc_dom_arch xc_hvm_32 = {
+ .guest_type = "hvm-3.0-x86_32",
+ .native_protocol = XEN_IO_PROTO_ABI_X86_32,
+ .page_shift = PAGE_SHIFT_X86,
+ .sizeof_pfn = 4,
+ .alloc_magic_pages = alloc_magic_pages_hvm,
+ .vcpu = vcpu_hvm,
+ .meminit = meminit_hvm,
+ .bootearly = bootearly,
+ .bootlate = bootlate_hvm,
+};
+
+static void __init register_arch_hooks(void)
+{
+ xc_dom_register_arch_hooks(&xc_dom_32_pae);
+ xc_dom_register_arch_hooks(&xc_dom_64);
+ xc_dom_register_arch_hooks(&xc_hvm_32);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * xg_domain.c
+ *
+ * API for manipulating and obtaining information on domains.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include "xg_private.h"
+#include "xc_core.h"
+
+int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *minfo)
+{
+ struct domain_info_context _di = { .guest_width = minfo->guest_width,
+ .p2m_size = minfo->p2m_size};
+ struct domain_info_context *dinfo = &_di;
+
+ free(minfo->pfn_type);
+ if ( minfo->p2m_table )
+ munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
+ minfo->p2m_table = NULL;
+
+ return 0;
+}
+
+int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid,
+ struct xc_domain_meminfo *minfo)
+{
+ struct domain_info_context _di;
+ struct domain_info_context *dinfo = &_di;
+
+ xc_dominfo_t info;
+ shared_info_any_t *live_shinfo;
+ xen_capabilities_info_t xen_caps = "";
+ int i;
+
+    /* Must only be initialized once */
+ if ( minfo->pfn_type || minfo->p2m_table )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
+ {
+ PERROR("Could not get domain info");
+ return -1;
+ }
+
+ if ( xc_domain_get_guest_width(xch, domid, &minfo->guest_width) )
+ {
+ PERROR("Could not get domain address size");
+ return -1;
+ }
+ _di.guest_width = minfo->guest_width;
+
+    /* Get page table levels (see get_platform_info() in xg_save_restore.h) */
+ if ( xc_version(xch, XENVER_capabilities, &xen_caps) )
+ {
+ PERROR("Could not get Xen capabilities (for page table levels)");
+ return -1;
+ }
+ if ( strstr(xen_caps, "xen-3.0-x86_64") )
+ /* Depends on whether it's a compat 32-on-64 guest */
+ minfo->pt_levels = ( (minfo->guest_width == 8) ? 4 : 3 );
+ else if ( strstr(xen_caps, "xen-3.0-x86_32p") )
+ minfo->pt_levels = 3;
+ else if ( strstr(xen_caps, "xen-3.0-x86_32") )
+ minfo->pt_levels = 2;
+ else
+ {
+ errno = EFAULT;
+ return -1;
+ }
+
+ /* We need the shared info page for mapping the P2M */
+ live_shinfo = xc_map_foreign_range(xch, domid, PAGE_SIZE, PROT_READ,
+ info.shared_info_frame);
+ if ( !live_shinfo )
+ {
+ PERROR("Could not map the shared info frame (MFN 0x%lx)",
+ info.shared_info_frame);
+ return -1;
+ }
+
+ if ( xc_core_arch_map_p2m_writable(xch, minfo->guest_width, &info,
+ live_shinfo, &minfo->p2m_table,
+ &minfo->p2m_size) )
+ {
+ PERROR("Could not map the P2M table");
+ munmap(live_shinfo, PAGE_SIZE);
+ return -1;
+ }
+ munmap(live_shinfo, PAGE_SIZE);
+ _di.p2m_size = minfo->p2m_size;
+
+ /* Make space and prepare for getting the PFN types */
+ minfo->pfn_type = calloc(sizeof(*minfo->pfn_type), minfo->p2m_size);
+ if ( !minfo->pfn_type )
+ {
+ PERROR("Could not allocate memory for the PFN types");
+ goto failed;
+ }
+ for ( i = 0; i < minfo->p2m_size; i++ )
+ minfo->pfn_type[i] = xc_pfn_to_mfn(i, minfo->p2m_table,
+ minfo->guest_width);
+
+    /* Retrieve PFN types in batches */
+    for ( i = 0; i < minfo->p2m_size; i += 1024 )
+    {
+        int count = ((minfo->p2m_size - i) > 1024) ?
+                    1024 : (minfo->p2m_size - i);
+
+        if ( xc_get_pfn_type_batch(xch, domid, count, minfo->pfn_type + i) )
+        {
+            PERROR("Could not get batch %d of PFN types", (i / 1024) + 1);
+            goto failed;
+        }
+    }
+
+ return 0;
+
+failed:
+ if ( minfo->pfn_type )
+ {
+ free(minfo->pfn_type);
+ minfo->pfn_type = NULL;
+ }
+ if ( minfo->p2m_table )
+ {
+ munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
+ minfo->p2m_table = NULL;
+ }
+
+ return -1;
+}
--- /dev/null
+/******************************************************************************
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (c) 2011, Citrix Systems
+ */
+
+#include <inttypes.h>
+#include <errno.h>
+#include <xenctrl.h>
+#include <xenguest.h>
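+
+/*
+ * Stub implementations (e.g. for builds without migration support):
+ * both entry points simply fail with ENOSYS.
+ */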
+
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t flags,
+ struct save_callbacks *callbacks,
+ xc_stream_type_t stream_type, int recv_fd)
+{
+ errno = ENOSYS;
+ return -1;
+}
+
+int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
+ unsigned int store_evtchn, unsigned long *store_mfn,
+ uint32_t store_domid, unsigned int console_evtchn,
+ unsigned long *console_mfn, uint32_t console_domid,
+ xc_stream_type_t stream_type,
+ struct restore_callbacks *callbacks, int send_back_fd)
+{
+ errno = ENOSYS;
+ return -1;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/******************************************************************************
+ * xc_offline_page.c
+ *
+ * Helper functions to offline/online one page
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <xc_core.h>
+
+#include "xc_private.h"
+#include "xenctrl_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+struct pte_backup_entry
+{
+ xen_pfn_t table_mfn;
+ int offset;
+};
+
+#define DEFAULT_BACKUP_COUNT 1024
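+
+/*
+ * Grow-on-demand log of (table_mfn, offset) pairs recording which PTEs
+ * were invalidated, so update_pte() can later rewrite them against the
+ * new mfn.
+ */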
+struct pte_backup
+{
+ struct pte_backup_entry *entries;
+ int max;
+ int cur;
+};
+
+static struct domain_info_context _dinfo;
+static struct domain_info_context *dinfo = &_dinfo;
+
+int xc_mark_page_online(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status)
+{
+ DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+ int ret = -1;
+
+ if ( !status || (end < start) )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+ if ( xc_hypercall_bounce_pre(xch, status) )
+ {
+ ERROR("Could not bounce memory for xc_mark_page_online\n");
+ return -1;
+ }
+
+ sysctl.cmd = XEN_SYSCTL_page_offline_op;
+ sysctl.u.page_offline.start = start;
+ sysctl.u.page_offline.cmd = sysctl_page_online;
+ sysctl.u.page_offline.end = end;
+ set_xen_guest_handle(sysctl.u.page_offline.status, status);
+ ret = xc_sysctl(xch, &sysctl);
+
+ xc_hypercall_bounce_post(xch, status);
+
+ return ret;
+}
+
+int xc_mark_page_offline(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status)
+{
+ DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+ int ret = -1;
+
+ if ( !status || (end < start) )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+ if ( xc_hypercall_bounce_pre(xch, status) )
+ {
+ ERROR("Could not bounce memory for xc_mark_page_offline");
+ return -1;
+ }
+
+ sysctl.cmd = XEN_SYSCTL_page_offline_op;
+ sysctl.u.page_offline.start = start;
+ sysctl.u.page_offline.cmd = sysctl_page_offline;
+ sysctl.u.page_offline.end = end;
+ set_xen_guest_handle(sysctl.u.page_offline.status, status);
+ ret = xc_sysctl(xch, &sysctl);
+
+ xc_hypercall_bounce_post(xch, status);
+
+ return ret;
+}
+
+int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
+ unsigned long end, uint32_t *status)
+{
+ DECLARE_SYSCTL;
+ DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+ int ret = -1;
+
+ if ( !status || (end < start) )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+ if ( xc_hypercall_bounce_pre(xch, status) )
+ {
+ ERROR("Could not bounce memory for xc_query_page_offline_status\n");
+ return -1;
+ }
+
+ sysctl.cmd = XEN_SYSCTL_page_offline_op;
+ sysctl.u.page_offline.start = start;
+ sysctl.u.page_offline.cmd = sysctl_query_page_offline;
+ sysctl.u.page_offline.end = end;
+ set_xen_guest_handle(sysctl.u.page_offline.status, status);
+ ret = xc_sysctl(xch, &sysctl);
+
+ xc_hypercall_bounce_post(xch, status);
+
+ return ret;
+}
+
+/*
+ * There should be no updates to the grant table while the domain
+ * is paused.
+ */
+static int xc_is_page_granted_v1(xc_interface *xch, xen_pfn_t gpfn,
+ grant_entry_v1_t *gnttab, int gnt_num)
+{
+ int i = 0;
+
+ if (!gnttab)
+ return 0;
+
+ for (i = 0; i < gnt_num; i++)
+ if ( ((gnttab[i].flags & GTF_type_mask) != GTF_invalid) &&
+ (gnttab[i].frame == gpfn) )
+ break;
+
+ return (i != gnt_num);
+}
+
+static int xc_is_page_granted_v2(xc_interface *xch, xen_pfn_t gpfn,
+ grant_entry_v2_t *gnttab, int gnt_num)
+{
+ int i = 0;
+
+ if (!gnttab)
+ return 0;
+
+ for (i = 0; i < gnt_num; i++)
+ if ( ((gnttab[i].hdr.flags & GTF_type_mask) != GTF_invalid) &&
+ (gnttab[i].full_page.frame == gpfn) )
+ break;
+
+ return (i != gnt_num);
+}
+
+static int backup_ptes(xen_pfn_t table_mfn, int offset,
+ struct pte_backup *backup)
+{
+ if (!backup)
+ return -EINVAL;
+
+    if (backup->max == backup->cur)
+    {
+        /* Use a temporary so the old array is not leaked if realloc fails. */
+        struct pte_backup_entry *tmp = realloc(backup->entries,
+                backup->max * 2 * sizeof(struct pte_backup_entry));
+
+        if (tmp == NULL)
+            return -1;
+
+        backup->entries = tmp;
+        backup->max *= 2;
+    }
+
+ backup->entries[backup->cur].table_mfn = table_mfn;
+ backup->entries[backup->cur++].offset = offset;
+
+ return 0;
+}
+
+/*
+ * return:
+ * 1 when MMU update is required
+ * 0 when no changes
+ * <0 when error happen
+ */
+typedef int (*pte_func)(xc_interface *xch,
+ uint64_t pte, uint64_t *new_pte,
+ unsigned long table_mfn, int table_offset,
+ struct pte_backup *backup,
+ unsigned long no_use);
+
+static int __clear_pte(xc_interface *xch,
+ uint64_t pte, uint64_t *new_pte,
+ unsigned long table_mfn, int table_offset,
+ struct pte_backup *backup,
+ unsigned long mfn)
+{
+    /* A missing new_pte or backup pointer is a caller error */
+ if (!new_pte || !backup)
+ return -EINVAL;
+
+ if ( !(pte & _PAGE_PRESENT))
+ return 0;
+
+ /* XXX Check for PSE bit here */
+ /* Hit one entry */
+ if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
+ {
+ *new_pte = pte & ~_PAGE_PRESENT;
+ if (!backup_ptes(table_mfn, table_offset, backup))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __update_pte(xc_interface *xch,
+ uint64_t pte, uint64_t *new_pte,
+ unsigned long table_mfn, int table_offset,
+ struct pte_backup *backup,
+ unsigned long new_mfn)
+{
+ int index;
+
+ if (!new_pte)
+ return 0;
+
+ for (index = 0; index < backup->cur; index ++)
+ if ( (backup->entries[index].table_mfn == table_mfn) &&
+ (backup->entries[index].offset == table_offset) )
+ break;
+
+ if (index != backup->cur)
+ {
+ if (pte & _PAGE_PRESENT)
+            ERROR("PTE still present although it was backed up\n");
+ pte &= ~MFN_MASK_X86;
+ pte |= (new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
+ *new_pte = pte;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int change_pte(xc_interface *xch, uint32_t domid,
+ struct xc_domain_meminfo *minfo,
+ struct pte_backup *backup,
+ struct xc_mmu *mmu,
+ pte_func func,
+ unsigned long data)
+{
+ int pte_num, rc;
+ uint64_t i;
+ void *content = NULL;
+
+ pte_num = PAGE_SIZE / ((minfo->pt_levels == 2) ? 4 : 8);
+
+ for (i = 0; i < minfo->p2m_size; i++)
+ {
+ xen_pfn_t table_mfn = xc_pfn_to_mfn(i, minfo->p2m_table,
+ minfo->guest_width);
+ uint64_t pte, new_pte;
+ int j;
+
+ if ( (table_mfn == INVALID_PFN) ||
+ ((minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
+ XEN_DOMCTL_PFINFO_XTAB) )
+ continue;
+
+ if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+ {
+ content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+ PROT_READ, table_mfn);
+ if (!content)
+ goto failed;
+
+ for (j = 0; j < pte_num; j++)
+ {
+ if ( minfo->pt_levels == 2 )
+ pte = ((const uint32_t*)content)[j];
+ else
+ pte = ((const uint64_t*)content)[j];
+
+ rc = func(xch, pte, &new_pte, table_mfn, j, backup, data);
+
+ switch (rc)
+ {
+ case 1:
+ if ( xc_add_mmu_update(xch, mmu,
+ table_mfn << PAGE_SHIFT |
+ j * ( (minfo->pt_levels == 2) ?
+ sizeof(uint32_t): sizeof(uint64_t)) |
+ MMU_PT_UPDATE_PRESERVE_AD,
+ new_pte) )
+ goto failed;
+ break;
+
+ case 0:
+ break;
+
+ default:
+ goto failed;
+ }
+ }
+
+ munmap(content, PAGE_SIZE);
+ content = NULL;
+ }
+ }
+
+ if ( xc_flush_mmu_updates(xch, mmu) )
+ goto failed;
+
+ return 0;
+failed:
+    /* XXX Shall we take action if we fail to swap? */
+ if (content)
+ munmap(content, PAGE_SIZE);
+
+ return -1;
+}
+
+static int update_pte(xc_interface *xch, uint32_t domid,
+ struct xc_domain_meminfo *minfo,
+ struct pte_backup *backup,
+ struct xc_mmu *mmu,
+ unsigned long new_mfn)
+{
+ return change_pte(xch, domid, minfo, backup, mmu,
+ __update_pte, new_mfn);
+}
+
+static int clear_pte(xc_interface *xch, uint32_t domid,
+ struct xc_domain_meminfo *minfo,
+ struct pte_backup *backup,
+ struct xc_mmu *mmu,
+ xen_pfn_t mfn)
+{
+ return change_pte(xch, domid, minfo, backup, mmu,
+ __clear_pte, mfn);
+}
+
+/*
+ * Check if a page can be exchanged successfully
+ */
+
+static int is_page_exchangable(xc_interface *xch, uint32_t domid, xen_pfn_t mfn,
+ xc_dominfo_t *info)
+{
+ uint32_t status;
+ int rc;
+
+ /* domain checking */
+ if ( !domid || (domid > DOMID_FIRST_RESERVED) )
+ {
+        DPRINTF("Dom0's pages cannot be exchanged\n");
+ return 0;
+ }
+ if (info->hvm)
+ {
+        DPRINTF("Currently only a PV guest's pages can be exchanged\n");
+ return 0;
+ }
+
+ /* Check if pages are offline pending or not */
+ rc = xc_query_page_offline_status(xch, mfn, mfn, &status);
+
+ if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) )
+ {
+        ERROR("Page %lx is not offline pending (status %x)\n",
+              mfn, status);
+ return 0;
+ }
+
+ return 1;
+}
+
+xen_pfn_t *xc_map_m2p(xc_interface *xch,
+ unsigned long max_mfn,
+ int prot,
+ unsigned long *mfn0)
+{
+ privcmd_mmap_entry_t *entries;
+ unsigned long m2p_chunks, m2p_size;
+ xen_pfn_t *m2p;
+ xen_pfn_t *extent_start;
+ int i;
+
+ m2p = NULL;
+ m2p_size = M2P_SIZE(max_mfn);
+ m2p_chunks = M2P_CHUNKS(max_mfn);
+
+ extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
+ if ( !extent_start )
+ {
+ ERROR("failed to allocate space for m2p mfns");
+ goto err0;
+ }
+
+ if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) )
+ {
+ PERROR("xc_get_m2p_mfns");
+ goto err1;
+ }
+
+ entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
+ if (entries == NULL)
+ {
+ ERROR("failed to allocate space for mmap entries");
+ goto err1;
+ }
+
+ for ( i = 0; i < m2p_chunks; i++ )
+ entries[i].mfn = extent_start[i];
+
+ m2p = xc_map_foreign_ranges(xch, DOMID_XEN,
+ m2p_size, prot, M2P_CHUNK_SIZE,
+ entries, m2p_chunks);
+ if (m2p == NULL)
+ {
+ PERROR("xc_mmap_foreign_ranges failed");
+ goto err2;
+ }
+
+ if (mfn0)
+ *mfn0 = entries[0].mfn;
+
+err2:
+ free(entries);
+err1:
+ free(extent_start);
+
+err0:
+ return m2p;
+}
+
+/* The domain should be suspended when called here */
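+/*
+ * Outline (summarising the code below): map the M2P to find the gpfn,
+ * check that the page is neither granted nor a PAE L3 table, back up
+ * its contents, invalidate every PTE referencing the old mfn, exchange
+ * the page via xc_domain_memory_exchange_pages(), then rewrite the
+ * saved PTEs against the new mfn and copy the contents back.
+ */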
+int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn)
+{
+ xc_dominfo_t info;
+ struct xc_domain_meminfo minfo;
+ struct xc_mmu *mmu = NULL;
+ struct pte_backup old_ptes = {NULL, 0, 0};
+ grant_entry_v1_t *gnttab_v1 = NULL;
+ grant_entry_v2_t *gnttab_v2 = NULL;
+ struct mmuext_op mops;
+ int gnt_num, unpined = 0;
+ void *old_p, *backup = NULL;
+ int rc, result = -1;
+ uint32_t status;
+ xen_pfn_t new_mfn, gpfn;
+ xen_pfn_t *m2p_table;
+ unsigned long max_mfn;
+
+ if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
+ {
+ ERROR("Could not get domain info");
+ return -1;
+ }
+
+ if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend)
+ {
+ errno = EINVAL;
+ ERROR("Can't exchange page unless domain is suspended\n");
+ return -1;
+ }
+ if (!is_page_exchangable(xch, domid, mfn, &info))
+ {
+ ERROR("Could not exchange page\n");
+ return -1;
+ }
+
+ /* Map M2P and obtain gpfn */
+ rc = xc_maximum_ram_page(xch, &max_mfn);
+ if ( rc || !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) )
+ {
+ PERROR("Failed to map live M2P table");
+ return -1;
+ }
+ gpfn = m2p_table[mfn];
+
+ /* Map domain's memory information */
+ memset(&minfo, 0, sizeof(minfo));
+ if ( xc_map_domain_meminfo(xch, domid, &minfo) )
+ {
+ PERROR("Could not map domain's memory information\n");
+ goto failed;
+ }
+
+ /* For translation macros */
+ dinfo->guest_width = minfo.guest_width;
+ dinfo->p2m_size = minfo.p2m_size;
+
+ /* Don't exchange CR3 for PAE guest in PAE host environment */
+ if (minfo.guest_width > sizeof(long))
+ {
+ if ( (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
+ XEN_DOMCTL_PFINFO_L3TAB )
+ goto failed;
+ }
+
+ gnttab_v2 = xc_gnttab_map_table_v2(xch, domid, &gnt_num);
+ if (!gnttab_v2)
+ {
+ gnttab_v1 = xc_gnttab_map_table_v1(xch, domid, &gnt_num);
+ if (!gnttab_v1)
+ {
+ ERROR("Failed to map grant table\n");
+ goto failed;
+ }
+ }
+
+ if (gnttab_v1
+ ? xc_is_page_granted_v1(xch, mfn, gnttab_v1, gnt_num)
+ : xc_is_page_granted_v2(xch, mfn, gnttab_v2, gnt_num))
+ {
+ ERROR("Page %lx is granted now\n", mfn);
+ goto failed;
+ }
+
+ /* allocate required data structure */
+ backup = malloc(PAGE_SIZE);
+ if (!backup)
+ {
+        ERROR("Failed to allocate the backup page\n");
+ goto failed;
+ }
+
+ old_ptes.max = DEFAULT_BACKUP_COUNT;
+ old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
+ DEFAULT_BACKUP_COUNT);
+
+ if (!old_ptes.entries)
+ {
+        ERROR("Failed to allocate the PTE backup list\n");
+ goto failed;
+ }
+ old_ptes.cur = 0;
+
+    /* Unpin the page if it is pinned */
+ if (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB)
+ {
+ mops.cmd = MMUEXT_UNPIN_TABLE;
+ mops.arg1.mfn = mfn;
+
+ if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
+ {
+ ERROR("Failed to unpin page %lx", mfn);
+ goto failed;
+ }
+ mops.arg1.mfn = mfn;
+ unpined = 1;
+ }
+
+ /* backup the content */
+ old_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+ PROT_READ, mfn);
+ if (!old_p)
+ {
+ ERROR("Failed to map foreign page %lx\n", mfn);
+ goto failed;
+ }
+
+ memcpy(backup, old_p, PAGE_SIZE);
+ munmap(old_p, PAGE_SIZE);
+
+ mmu = xc_alloc_mmu_updates(xch, domid);
+ if ( mmu == NULL )
+ {
+ ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
+ goto failed;
+ }
+
+    /* First invalidate all PTEs referencing the page to drop the references */
+ rc = clear_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
+
+ if (rc)
+ {
+ ERROR("clear pte failed\n");
+ goto failed;
+ }
+
+ rc = xc_domain_memory_exchange_pages(xch, domid,
+ 1, 0, &mfn,
+ 1, 0, &new_mfn);
+
+ if (rc)
+ {
+        ERROR("Exchanging the page failed\n");
+        /* Exchange failure means there are still references to the page */
+ rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
+ if (rc)
+ result = -2;
+ goto failed;
+ }
+
+ rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, new_mfn);
+
+ if (rc)
+ {
+        ERROR("update_pte failed; the guest may be broken now\n");
+ /* No recover action now for swap fail */
+ result = -2;
+ goto failed;
+ }
+
+    /* Check if the page is offlined already */
+ rc = xc_query_page_offline_status(xch, mfn, mfn,
+ &status);
+
+    if (rc)
+    {
+        ERROR("Failed to query offline status\n");
+    }
+    else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
+    {
+        ERROR("Page is still online or pending\n");
+        goto failed;
+    }
+    else
+    {
+        void *new_p;
+
+        IPRINTF("Page %lx is now offlined\n", mfn);
+ /* Update the p2m table */
+ minfo.p2m_table[gpfn] = new_mfn;
+
+ new_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+ PROT_READ|PROT_WRITE, new_mfn);
+ if ( new_p == NULL )
+ {
+ ERROR("failed to map new_p for copy, guest may be broken?");
+ goto failed;
+ }
+ memcpy(new_p, backup, PAGE_SIZE);
+ munmap(new_p, PAGE_SIZE);
+ mops.arg1.mfn = new_mfn;
+ result = 0;
+ }
+
+failed:
+
+    if (unpined && (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB))
+    {
+        switch ( minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+ {
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ mops.cmd = MMUEXT_PIN_L1_TABLE;
+ break;
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ mops.cmd = MMUEXT_PIN_L2_TABLE;
+ break;
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ mops.cmd = MMUEXT_PIN_L3_TABLE;
+ break;
+
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ mops.cmd = MMUEXT_PIN_L4_TABLE;
+ break;
+
+ default:
+            ERROR("Unpinned a non-page-table page\n");
+ break;
+ }
+
+ if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
+ {
+ ERROR("failed to pin the mfn again\n");
+ result = -2;
+ }
+ }
+
+ free(mmu);
+
+ free(old_ptes.entries);
+
+ free(backup);
+
+    /* munmap() wants a length in bytes, rounded up to whole pages. */
+    if (gnttab_v1)
+        munmap(gnttab_v1, ROUNDUP(gnt_num * sizeof(grant_entry_v1_t),
+                                  PAGE_SHIFT));
+    if (gnttab_v2)
+        munmap(gnttab_v2, ROUNDUP(gnt_num * sizeof(grant_entry_v2_t),
+                                  PAGE_SHIFT));
+
+ xc_unmap_domain_meminfo(xch, &minfo);
+ munmap(m2p_table, M2P_SIZE(max_mfn));
+
+ return result;
+}
--- /dev/null
+/******************************************************************************
+ * xg_private.c
+ *
+ * Helper functions for the rest of the library.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "xg_private.h"
+
+char *xc_read_image(xc_interface *xch,
+ const char *filename, unsigned long *size)
+{
+ int kernel_fd = -1;
+ gzFile kernel_gfd = NULL;
+ char *image = NULL, *tmp;
+    int bytes; /* gzread() returns -1 on error, so this must be signed */
+
+ if ( (filename == NULL) || (size == NULL) )
+ return NULL;
+
+ if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
+ {
+ PERROR("Could not open kernel image '%s'", filename);
+ goto out;
+ }
+
+ if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
+ {
+        PERROR("Could not allocate decompression state for kernel image");
+ goto out;
+ }
+
+ *size = 0;
+
+#define CHUNK 1*1024*1024
+ while(1)
+ {
+ if ( (tmp = realloc(image, *size + CHUNK)) == NULL )
+ {
+ PERROR("Could not allocate memory for kernel image");
+ free(image);
+ image = NULL;
+ goto out;
+ }
+ image = tmp;
+
+ bytes = gzread(kernel_gfd, image + *size, CHUNK);
+ switch (bytes)
+ {
+ case -1:
+ PERROR("Error reading kernel image");
+ free(image);
+ image = NULL;
+ goto out;
+ case 0: /* EOF */
+ if ( *size == 0 )
+ {
+ PERROR("Could not read kernel image");
+ free(image);
+ image = NULL;
+ }
+ goto out;
+ default:
+ *size += bytes;
+ break;
+ }
+ }
+#undef CHUNK
+
+ out:
+ if ( image )
+ {
+ /* Shrink allocation to fit image. */
+ tmp = realloc(image, *size);
+ if ( tmp )
+ image = tmp;
+ }
+
+ if ( kernel_gfd != NULL )
+ gzclose(kernel_gfd);
+ else if ( kernel_fd >= 0 )
+ close(kernel_fd);
+ return image;
+}
+
+char *xc_inflate_buffer(xc_interface *xch,
+ const char *in_buf, unsigned long in_size,
+ unsigned long *out_size)
+{
+ int sts;
+ z_stream zStream;
+ unsigned long out_len;
+ char *out_buf;
+
+ /* Not compressed? Then return the original buffer. */
+ if ( ((unsigned char)in_buf[0] != 0x1F) ||
+ ((unsigned char)in_buf[1] != 0x8B) )
+ {
+ if ( out_size != NULL )
+ *out_size = in_size;
+ return (char *)in_buf;
+ }
+
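+    /*
+     * The last four bytes of a gzip stream hold ISIZE: the uncompressed
+     * length modulo 2^32, stored little-endian.  Reassemble it here to
+     * size the output buffer.
+     */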
+ out_len = (unsigned char)in_buf[in_size-4] +
+ (256 * ((unsigned char)in_buf[in_size-3] +
+ (256 * ((unsigned char)in_buf[in_size-2] +
+ (256 * (unsigned char)in_buf[in_size-1])))));
+
+ memset(&zStream, 0, sizeof(zStream));
+ out_buf = malloc(out_len + 16); /* Leave a little extra space */
+ if ( out_buf == NULL )
+ {
+        ERROR("Failed to allocate output buffer\n");
+ return NULL;
+ }
+
+ zStream.next_in = (unsigned char *)in_buf;
+ zStream.avail_in = in_size;
+ zStream.next_out = (unsigned char *)out_buf;
+ zStream.avail_out = out_len+16;
+ sts = inflateInit2(&zStream, (MAX_WBITS+32)); /* +32 means "handle gzip" */
+ if ( sts != Z_OK )
+ {
+ ERROR("inflateInit failed, sts %d\n", sts);
+ free(out_buf);
+ return NULL;
+ }
+
+ /* Inflate in one pass/call */
+ sts = inflate(&zStream, Z_FINISH);
+ inflateEnd(&zStream);
+ if ( sts != Z_STREAM_END )
+ {
+ ERROR("inflate failed, sts %d\n", sts);
+ free(out_buf);
+ return NULL;
+ }
+
+ if ( out_size != NULL )
+ *out_size = out_len;
+
+ return out_buf;
+}
+
+/*******************/
+
+int pin_table(
+ xc_interface *xch, unsigned int type, unsigned long mfn, uint32_t dom)
+{
+ struct mmuext_op op;
+
+ op.cmd = type;
+ op.arg1.mfn = mfn;
+
+ if ( xc_mmuext_op(xch, &op, 1, dom) < 0 )
+ return 1;
+
+ return 0;
+}
+
+/* This is shared between save and restore, and may generally be useful. */
+unsigned long csum_page(void *page)
+{
+ int i;
+ unsigned long *p = page;
+ unsigned long long sum=0;
+
+ for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ )
+ sum += p[i];
+
+ return sum ^ (sum>>32);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XG_PRIVATE_H
+#define XG_PRIVATE_H
+
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "xc_private.h"
+#include "xenguest.h"
+
+#include <xen/memory.h>
+#include <xen/elfnote.h>
+
+#ifndef ELFSIZE
+#include <limits.h>
+#if UINT_MAX == ULONG_MAX
+#define ELFSIZE 32
+#else
+#define ELFSIZE 64
+#endif
+#endif
+
+char *xc_read_image(xc_interface *xch,
+ const char *filename, unsigned long *size);
+char *xc_inflate_buffer(xc_interface *xch,
+ const char *in_buf,
+ unsigned long in_size,
+ unsigned long *out_size);
+
+unsigned long csum_page (void * page);
+
+#define _PAGE_PRESENT 0x001
+#define _PAGE_RW 0x002
+#define _PAGE_USER 0x004
+#define _PAGE_PWT 0x008
+#define _PAGE_PCD 0x010
+#define _PAGE_ACCESSED 0x020
+#define _PAGE_DIRTY 0x040
+#define _PAGE_PAT 0x080
+#define _PAGE_PSE 0x080
+#define _PAGE_GLOBAL 0x100
+
+#define VIRT_BITS_I386 32
+#define VIRT_BITS_X86_64 48
+
+#define PGTBL_LEVELS_I386 3
+#define PGTBL_LEVELS_X86_64 4
+
+#define PGTBL_LEVEL_SHIFT_X86 9
+
+#define L1_PAGETABLE_SHIFT_PAE 12
+#define L2_PAGETABLE_SHIFT_PAE 21
+#define L3_PAGETABLE_SHIFT_PAE 30
+#define L1_PAGETABLE_ENTRIES_PAE 512
+#define L2_PAGETABLE_ENTRIES_PAE 512
+#define L3_PAGETABLE_ENTRIES_PAE 4
+
+#define L1_PAGETABLE_SHIFT_X86_64 12
+#define L2_PAGETABLE_SHIFT_X86_64 21
+#define L3_PAGETABLE_SHIFT_X86_64 30
+#define L4_PAGETABLE_SHIFT_X86_64 39
+#define L1_PAGETABLE_ENTRIES_X86_64 512
+#define L2_PAGETABLE_ENTRIES_X86_64 512
+#define L3_PAGETABLE_ENTRIES_X86_64 512
+#define L4_PAGETABLE_ENTRIES_X86_64 512
+
+typedef uint64_t x86_pgentry_t;
+
+#define PAGE_SHIFT_ARM 12
+#define PAGE_SIZE_ARM (1UL << PAGE_SHIFT_ARM)
+#define PAGE_MASK_ARM (~(PAGE_SIZE_ARM-1))
+
+#define PAGE_SHIFT_X86 12
+#define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86)
+#define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1))
+
+#define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
+
+static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
+ unsigned gwidth)
+{
+ if ( gwidth == sizeof(uint64_t) )
+ /* 64 bit guest. Need to truncate their pfns for 32 bit toolstacks. */
+ return ((uint64_t *)p2m)[pfn];
+ else
+ {
+ /* 32 bit guest. Need to expand INVALID_MFN for 64 bit toolstacks. */
+ uint32_t mfn = ((uint32_t *)p2m)[pfn];
+
+ return mfn == ~0U ? INVALID_MFN : mfn;
+ }
+}
+
+
+/* Masks for PTE<->PFN conversions */
+#define MADDR_BITS_X86 ((dinfo->guest_width == 8) ? 52 : 44)
+#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1)
+#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86)
+
+int pin_table(xc_interface *xch, unsigned int type, unsigned long mfn,
+ uint32_t dom);
+
+#endif /* XG_PRIVATE_H */
--- /dev/null
+/*
+ * Definitions and utilities for save / restore.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xc_private.h"
+
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+
+/*
+** We process save/restore/migrate in batches of pages; the below
+** determines how many pages we (at maximum) deal with in each batch.
+*/
+#define MAX_BATCH_SIZE 1024 /* up to 1024 pages (4MB) at a time */
+
+/* When pinning page tables at the end of restore, we also use batching. */
+#define MAX_PIN_BATCH 1024
+
+/*
+** Determine various platform information required for save/restore, in
+** particular:
+**
+** - the maximum MFN on this machine, used to compute the size of
+** the M2P table;
+**
+** - the starting virtual address of the hypervisor; we use this
+** to determine which parts of guest address space(s) do and don't
+** require canonicalization during save/restore; and
+**
+** - the number of page-table levels for save/restore. This should
+** be a property of the domain, but for the moment we just read it
+** from the hypervisor.
+**
+** - The width of a guest word (unsigned long), in bytes.
+**
+** Returns 1 on success, 0 on failure.
+*/
+static inline int get_platform_info(xc_interface *xch, uint32_t dom,
+ /* OUT */ unsigned long *max_mfn,
+ /* OUT */ unsigned long *hvirt_start,
+ /* OUT */ unsigned int *pt_levels,
+ /* OUT */ unsigned int *guest_width)
+{
+ xen_capabilities_info_t xen_caps = "";
+ xen_platform_parameters_t xen_params;
+
+ if (xc_version(xch, XENVER_platform_parameters, &xen_params) != 0)
+ return 0;
+
+ if (xc_version(xch, XENVER_capabilities, &xen_caps) != 0)
+ return 0;
+
+ if (xc_maximum_ram_page(xch, max_mfn))
+ return 0;
+
+ *hvirt_start = xen_params.virt_start;
+
+ if ( xc_domain_get_guest_width(xch, dom, guest_width) != 0)
+ return 0;
+
+ /* 64-bit tools will see the 64-bit hvirt_start, but 32-bit guests
+ * will be using the compat one. */
+ if ( *guest_width < sizeof (unsigned long) )
+ /* XXX need to fix up a way of extracting this value from Xen if
+ * XXX it becomes variable for domU */
+ *hvirt_start = 0xf5800000;
+
+ if (strstr(xen_caps, "xen-3.0-x86_64"))
+ /* Depends on whether it's a compat 32-on-64 guest */
+ *pt_levels = ( (*guest_width == 8) ? 4 : 3 );
+ else if (strstr(xen_caps, "xen-3.0-x86_32p"))
+ *pt_levels = 3;
+ else
+ return 0;
+
+ return 1;
+}
+
+
+/*
+** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
+** The M2P simply holds the corresponding PFN, while the top bit of a P2M
+** entry tells us whether or not the PFN is currently mapped.
+*/
+
+#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))
+
+
+/*
+** The M2P is made up of some number of 'chunks' of at least 2MB in size.
+** The below definitions and utility function(s) deal with mapping the M2P
+** regardless of the underlying machine memory size or architecture.
+*/
+#define M2P_SHIFT L2_PAGETABLE_SHIFT_PAE
+#define M2P_CHUNK_SIZE (1 << M2P_SHIFT)
+#define M2P_SIZE(_m) ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
+#define M2P_CHUNKS(_m) (M2P_SIZE((_m)) >> M2P_SHIFT)
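+
+/*
+ * Worked example (illustrative values): with an 8-byte xen_pfn_t and
+ * max_mfn == 0x100000 (4GB of RAM), the raw table is 8MB, already a
+ * multiple of the 2MB chunk size, so M2P_SIZE yields 8MB and
+ * M2P_CHUNKS yields 4.
+ */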
+
+#define UNFOLD_CR3(_c) \
+ ((uint64_t)((dinfo->guest_width == 8) \
+ ? ((_c) >> 12) \
+ : (((uint32_t)(_c) >> 12) | ((uint32_t)(_c) << 20))))
+
+#define FOLD_CR3(_c) \
+ ((uint64_t)((dinfo->guest_width == 8) \
+ ? ((uint64_t)(_c)) << 12 \
+ : (((uint32_t)(_c) << 12) | ((uint32_t)(_c) >> 20))))
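+
+/*
+ * Note (illustrative): for a 64-bit guest FOLD_CR3 is a plain shift
+ * left by 12, while for a 32-bit guest it is a 32-bit rotate left by
+ * 12 matching the compat CR3 encoding; UNFOLD_CR3 is the inverse.
+ */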
+
+#define MEMCPY_FIELD(_d, _s, _f, _w) do { \
+ if ((_w) == 8) \
+ memcpy(&(_d)->x64._f, &(_s)->x64._f,sizeof((_d)->x64._f)); \
+ else \
+ memcpy(&(_d)->x32._f, &(_s)->x32._f,sizeof((_d)->x32._f)); \
+} while (0)
+
+#define MEMSET_ARRAY_FIELD(_p, _f, _v, _w) do { \
+ if ((_w) == 8) \
+ memset(&(_p)->x64._f[0], (_v), sizeof((_p)->x64._f)); \
+ else \
+ memset(&(_p)->x32._f[0], (_v), sizeof((_p)->x32._f)); \
+} while (0)
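+
+/*
+ * Example use (illustrative): MEMCPY_FIELD(dst, src, user_regs, w)
+ * copies the register block from whichever of the x32/x64 union views
+ * matches the guest width w.
+ */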
--- /dev/null
+#include <assert.h>
+
+#include "xg_sr_common.h"
+
+#include <xen-tools/libs.h>
+
+static const char *const dhdr_types[] =
+{
+ [DHDR_TYPE_X86_PV] = "x86 PV",
+ [DHDR_TYPE_X86_HVM] = "x86 HVM",
+};
+
+const char *dhdr_type_to_str(uint32_t type)
+{
+ if ( type < ARRAY_SIZE(dhdr_types) && dhdr_types[type] )
+ return dhdr_types[type];
+
+ return "Reserved";
+}
+
+static const char *const mandatory_rec_types[] =
+{
+ [REC_TYPE_END] = "End",
+ [REC_TYPE_PAGE_DATA] = "Page data",
+ [REC_TYPE_X86_PV_INFO] = "x86 PV info",
+ [REC_TYPE_X86_PV_P2M_FRAMES] = "x86 PV P2M frames",
+ [REC_TYPE_X86_PV_VCPU_BASIC] = "x86 PV vcpu basic",
+ [REC_TYPE_X86_PV_VCPU_EXTENDED] = "x86 PV vcpu extended",
+ [REC_TYPE_X86_PV_VCPU_XSAVE] = "x86 PV vcpu xsave",
+ [REC_TYPE_SHARED_INFO] = "Shared info",
+ [REC_TYPE_X86_TSC_INFO] = "x86 TSC info",
+ [REC_TYPE_HVM_CONTEXT] = "HVM context",
+ [REC_TYPE_HVM_PARAMS] = "HVM params",
+ [REC_TYPE_TOOLSTACK] = "Toolstack",
+ [REC_TYPE_X86_PV_VCPU_MSRS] = "x86 PV vcpu msrs",
+ [REC_TYPE_VERIFY] = "Verify",
+ [REC_TYPE_CHECKPOINT] = "Checkpoint",
+ [REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST] = "Checkpoint dirty pfn list",
+ [REC_TYPE_STATIC_DATA_END] = "Static data end",
+ [REC_TYPE_X86_CPUID_POLICY] = "x86 CPUID policy",
+ [REC_TYPE_X86_MSR_POLICY] = "x86 MSR policy",
+};
+
+const char *rec_type_to_str(uint32_t type)
+{
+ if ( !(type & REC_TYPE_OPTIONAL) )
+ {
+ if ( (type < ARRAY_SIZE(mandatory_rec_types)) &&
+ (mandatory_rec_types[type]) )
+ return mandatory_rec_types[type];
+ }
+
+ return "Reserved";
+}
+
+int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
+ void *buf, size_t sz)
+{
+ static const char zeroes[(1u << REC_ALIGN_ORDER) - 1] = { 0 };
+
+ xc_interface *xch = ctx->xch;
+ typeof(rec->length) combined_length = rec->length + sz;
+ size_t record_length = ROUNDUP(combined_length, REC_ALIGN_ORDER);
+ struct iovec parts[] = {
+ { &rec->type, sizeof(rec->type) },
+ { &combined_length, sizeof(combined_length) },
+ { rec->data, rec->length },
+ { buf, sz },
+ { (void *)zeroes, record_length - combined_length },
+ };
+
+ if ( record_length > REC_LENGTH_MAX )
+ {
+ ERROR("Record (0x%08x, %s) length %#zx exceeds max (%#x)", rec->type,
+ rec_type_to_str(rec->type), record_length, REC_LENGTH_MAX);
+ return -1;
+ }
+
+ if ( rec->length )
+ assert(rec->data);
+ if ( sz )
+ assert(buf);
+
+ if ( writev_exact(ctx->fd, parts, ARRAY_SIZE(parts)) )
+ goto err;
+
+ return 0;
+
+ err:
+ PERROR("Unable to write record to stream");
+ return -1;
+}
+
+int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rhdr rhdr;
+ size_t datasz;
+
+ if ( read_exact(fd, &rhdr, sizeof(rhdr)) )
+ {
+ PERROR("Failed to read Record Header from stream");
+ return -1;
+ }
+
+ if ( rhdr.length > REC_LENGTH_MAX )
+ {
+ ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rhdr.type,
+ rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX);
+ return -1;
+ }
+
+ datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER);
+
+ if ( datasz )
+ {
+ rec->data = malloc(datasz);
+
+ if ( !rec->data )
+ {
+ ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)",
+ datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ return -1;
+ }
+
+ if ( read_exact(fd, rec->data, datasz) )
+ {
+ free(rec->data);
+ rec->data = NULL;
+ PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)",
+ datasz, rhdr.type, rec_type_to_str(rhdr.type));
+ return -1;
+ }
+ }
+ else
+ rec->data = NULL;
+
+ rec->type = rhdr.type;
+ rec->length = rhdr.length;
+
+ return 0;
+}
+
+static void __attribute__((unused)) build_assertions(void)
+{
+ BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24);
+ BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8);
+
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header) != 8);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info) != 8);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr) != 8);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_tsc_info) != 24);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry) != 16);
+ BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#ifndef __COMMON__H
+#define __COMMON__H
+
+#include <stdbool.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+#include "xenctrl_dom.h"
+#include "xc_bitops.h"
+
+#include "xg_sr_stream_format.h"
+
+/* String representation of Domain Header types. */
+const char *dhdr_type_to_str(uint32_t type);
+
+/* String representation of Record types. */
+const char *rec_type_to_str(uint32_t type);
+
+struct xc_sr_context;
+struct xc_sr_record;
+
+/**
+ * Save operations. To be implemented for each type of guest, for use by the
+ * common save algorithm.
+ *
+ * Every function must be implemented, even if only with a no-op stub.
+ */
+struct xc_sr_save_ops
+{
+ /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
+ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /**
+ * Optionally transform the contents of a page from being specific to the
+ * sending environment, to being generic for the stream.
+ *
+ * The page of data at the end of 'page' may be a read-only mapping of a
+ * running guest; it must not be modified. If no transformation is
+     * required, the callee should leave '*page' untouched.
+ *
+ * If a transformation is required, the callee should allocate themselves
+ * a local page using malloc() and return it via '*page'.
+ *
+ * The caller shall free() '*page' in all cases. In the case that the
+ * callee encounters an error, it should *NOT* free() the memory it
+ * allocated for '*page'.
+ *
+     * It is valid to fail with EAGAIN if the transformation cannot be
+     * completed at this point.  The page shall be retried later.
+ *
+ * @returns 0 for success, -1 for failure, with errno appropriately set.
+ */
+ int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
+ void **page);
+
+ /**
+ * Set up local environment to save a domain. (Typically querying
+ * running domain state, setting up mappings etc.)
+ *
+ * This is called once before any common setup has occurred, allowing for
+ * guest-specific adjustments to be made to common state.
+ */
+ int (*setup)(struct xc_sr_context *ctx);
+
+ /**
+ * Send static records at the head of the stream. This is called once,
+ * after the Image and Domain headers are written.
+ */
+ int (*static_data)(struct xc_sr_context *ctx);
+
+ /**
+ * Send dynamic records which need to be at the start of the stream. This
+ * is called after the STATIC_DATA_END record is written.
+ */
+ int (*start_of_stream)(struct xc_sr_context *ctx);
+
+ /**
+ * Send records which need to be at the start of a checkpoint. This is
+ * called once, or once per checkpoint in a checkpointed stream, and is
+ * ahead of memory data.
+ */
+ int (*start_of_checkpoint)(struct xc_sr_context *ctx);
+
+ /**
+ * Send records which need to be at the end of the checkpoint. This is
+ * called once, or once per checkpoint in a checkpointed stream, and is
+ * after the memory data.
+ */
+ int (*end_of_checkpoint)(struct xc_sr_context *ctx);
+
+ /**
+ * Check state of guest to decide whether it makes sense to continue
+ * migration. This is called in each iteration or checkpoint to check
+ * whether all criteria for the migration are still met. If that's not
+ * the case either migration is cancelled via a bad rc or the situation
+ * is handled, e.g. by sending appropriate records.
+ */
+ int (*check_vm_state)(struct xc_sr_context *ctx);
+
+ /**
+ * Clean up the local environment. Will be called exactly once, either
+ * after a successful save, or upon encountering an error.
+ */
+ int (*cleanup)(struct xc_sr_context *ctx);
+};
+
+
+/**
+ * Restore operations. To be implemented for each type of guest, for use by
+ * the common restore algorithm.
+ *
+ * Every function must be implemented, even if only with a no-op stub.
+ */
+struct xc_sr_restore_ops
+{
+ /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
+ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /* Check to see whether a PFN is valid. */
+ bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
+
+ /* Set the GFN of a PFN. */
+ void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
+
+ /* Set the type of a PFN. */
+ void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t type);
+
+ /**
+ * Optionally transform the contents of a page from being generic in the
+ * stream, to being specific to the restoring environment.
+ *
+ * 'page' is expected to be modified in-place if a transformation is
+ * required.
+ *
+ * @returns 0 for success, -1 for failure, with errno appropriately set.
+ */
+ int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);
+
+ /**
+ * Set up local environment to restore a domain.
+ *
+ * This is called once before any common setup has occurred, allowing for
+ * guest-specific adjustments to be made to common state.
+ */
+ int (*setup)(struct xc_sr_context *ctx);
+
+ /**
+ * Process an individual record from the stream. The caller shall take
+ * care of processing common records (e.g. END, PAGE_DATA).
+ *
+ * @return 0 for success, -1 for failure, or the following sentinels:
+ * - RECORD_NOT_PROCESSED
+ * - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
+ * a failover is needed.
+ */
+#define RECORD_NOT_PROCESSED 1
+#define BROKEN_CHANNEL 2
+ int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+ /**
+ * Perform any actions required after the static data has arrived. Called
+     * when the STATIC_DATA_END record has been received/inferred.
+     * 'missing' should be filled in for any data item the higher level
+     * toolstack needs to provide compatibility for.
+ */
+ int (*static_data_complete)(struct xc_sr_context *ctx,
+ unsigned int *missing);
+
+ /**
+ * Perform any actions required after the stream has been finished. Called
+ * after the END record has been received.
+ */
+ int (*stream_complete)(struct xc_sr_context *ctx);
+
+ /**
+ * Clean up the local environment. Will be called exactly once, either
+ * after a successful restore, or upon encountering an error.
+ */
+ int (*cleanup)(struct xc_sr_context *ctx);
+};
+
+/* Wrapper for blobs of data heading Xen-wards. */
+struct xc_sr_blob
+{
+ void *ptr;
+ size_t size;
+};
+
+/*
+ * Update a blob. Duplicate src/size, freeing the old blob if necessary. May
+ * fail due to memory allocation.
+ */
+static inline int update_blob(struct xc_sr_blob *blob,
+ const void *src, size_t size)
+{
+ void *ptr;
+
+ if ( !src || !size )
+ {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if ( (ptr = malloc(size)) == NULL )
+ return -1;
+
+ free(blob->ptr);
+ blob->ptr = memcpy(ptr, src, size);
+ blob->size = size;
+
+ return 0;
+}
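+
+/*
+ * Example use (illustrative): duplicating a received record's payload
+ * into longer-lived storage, e.g.
+ *     update_blob(&ctx->x86.restore.cpuid, rec->data, rec->length);
+ */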
+
+struct xc_sr_context
+{
+ xc_interface *xch;
+ uint32_t domid;
+ int fd;
+
+ /* Plain VM, or checkpoints over time. */
+ xc_stream_type_t stream_type;
+
+ xc_dominfo_t dominfo;
+
+ union /* Common save or restore data. */
+ {
+ struct /* Save data. */
+ {
+ int recv_fd;
+
+ struct xc_sr_save_ops ops;
+ struct save_callbacks *callbacks;
+
+ /* Live migrate vs non live suspend. */
+ bool live;
+
+ /* Further debugging information in the stream. */
+ bool debug;
+
+ unsigned long p2m_size;
+
+ struct precopy_stats stats;
+
+ xen_pfn_t *batch_pfns;
+ unsigned int nr_batch_pfns;
+ unsigned long *deferred_pages;
+ unsigned long nr_deferred_pages;
+ xc_hypercall_buffer_t dirty_bitmap_hbuf;
+ } save;
+
+ struct /* Restore data. */
+ {
+ struct xc_sr_restore_ops ops;
+ struct restore_callbacks *callbacks;
+
+ int send_back_fd;
+ unsigned long p2m_size;
+ xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
+ /* From Image Header. */
+ uint32_t format_version;
+
+ /* From Domain Header. */
+ uint32_t guest_type;
+ uint32_t guest_page_size;
+
+ /* Currently buffering records between a checkpoint */
+ bool buffer_all_records;
+
+ /* Whether a STATIC_DATA_END record has been seen/inferred. */
+ bool seen_static_data_end;
+
+/*
+ * With Remus/COLO, we buffer the records sent by the primary at each
+ * checkpoint, so that if the primary fails we can recover from the
+ * last checkpoint state.
+ * This should be enough for most cases because the primary only sends
+ * dirty pages at each checkpoint.
+ */
+#define DEFAULT_BUF_RECORDS 1024
+ struct xc_sr_record *buffered_records;
+ unsigned int allocated_rec_num;
+ unsigned int buffered_rec_num;
+
+ /*
+ * Xenstore and Console parameters.
+ * INPUT: evtchn & domid
+ * OUTPUT: gfn
+ */
+ xen_pfn_t xenstore_gfn, console_gfn;
+ unsigned int xenstore_evtchn, console_evtchn;
+ uint32_t xenstore_domid, console_domid;
+
+ /* Bitmap of currently populated PFNs during restore. */
+ unsigned long *populated_pfns;
+ xen_pfn_t max_populated_pfn;
+
+ /* Sender has invoked verify mode on the stream. */
+ bool verify;
+ } restore;
+ };
+
+ union /* Guest-arch specific data. */
+ {
+ struct /* x86 */
+ {
+ /* Common save/restore data. */
+ union
+ {
+ struct
+ {
+ /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */
+ struct xc_sr_blob cpuid, msr;
+ } restore;
+ };
+
+ struct /* x86 PV guest. */
+ {
+ /* 4 or 8; 32 or 64 bit domain */
+ unsigned int width;
+ /* 3 or 4 pagetable levels */
+ unsigned int levels;
+
+ /* Maximum Xen frame */
+ xen_pfn_t max_mfn;
+ /* Read-only machine to phys map */
+ xen_pfn_t *m2p;
+ /* first mfn of the compat m2p (Only needed for 32bit PV guests) */
+ xen_pfn_t compat_m2p_mfn0;
+ /* Number of m2p frames mapped */
+ unsigned long nr_m2p_frames;
+
+ /* Maximum guest frame */
+ xen_pfn_t max_pfn;
+
+ /* Number of frames making up the p2m */
+ unsigned int p2m_frames;
+ /* Guest's phys to machine map. Mapped read-only (save) or
+ * allocated locally (restore). Uses guest unsigned longs. */
+ void *p2m;
+ /* The guest pfns containing the p2m leaves */
+ xen_pfn_t *p2m_pfns;
+
+ /* Read-only mapping of the guest's shared info page */
+ shared_info_any_t *shinfo;
+
+ /* p2m generation count for verifying validity of local p2m. */
+ uint64_t p2m_generation;
+
+ union
+ {
+ struct
+ {
+ /* State machine for the order of received records. */
+ bool seen_pv_info;
+
+ /* Types for each page (bounded by max_pfn). */
+ uint32_t *pfn_types;
+
+ /* x86 PV per-vcpu storage structure for blobs. */
+ struct xc_sr_x86_pv_restore_vcpu
+ {
+ struct xc_sr_blob basic, extd, xsave, msr;
+ } *vcpus;
+ unsigned int nr_vcpus;
+ } restore;
+ };
+ } pv;
+
+ struct /* x86 HVM guest. */
+ {
+ union
+ {
+ struct
+ {
+ /* Whether qemu enabled logdirty mode, and we should
+ * disable on cleanup. */
+ bool qemu_enabled_logdirty;
+ } save;
+
+ struct
+ {
+ /* HVM context blob. */
+ struct xc_sr_blob context;
+ } restore;
+ };
+ } hvm;
+
+ } x86;
+ };
+};
+
+extern struct xc_sr_save_ops save_ops_x86_pv;
+extern struct xc_sr_save_ops save_ops_x86_hvm;
+
+extern struct xc_sr_restore_ops restore_ops_x86_pv;
+extern struct xc_sr_restore_ops restore_ops_x86_hvm;
+
+struct xc_sr_record
+{
+ uint32_t type;
+ uint32_t length;
+ void *data;
+};
+
+/*
+ * Writes a split record to the stream, applying correct padding where
+ * appropriate. It is common when sending records containing blobs from Xen
+ * that the header and blob data are separate. This function accepts a second
+ * buffer and length, and will merge it with the main record when sending.
+ *
+ * Records with a non-zero length must provide a valid data field; records
+ * with a 0 length shall have their data field ignored.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
+ void *buf, size_t sz);
+
+/*
+ * Writes a record to the stream, applying correct padding where appropriate.
+ * Records with a non-zero length must provide a valid data field; records
+ * with a 0 length shall have their data field ignored.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+static inline int write_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ return write_split_record(ctx, rec, NULL, 0);
+}
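+
+/*
+ * Illustrative sketch only, mirroring write_x86_tsc_info(): a typical caller
+ * wraps a fixed-size payload in a record and hands it to write_record():
+ *
+ *     struct xc_sr_rec_x86_tsc_info tsc = {};
+ *     struct xc_sr_record rec = {
+ *         .type = REC_TYPE_X86_TSC_INFO,
+ *         .length = sizeof(tsc),
+ *         .data = &tsc,
+ *     };
+ *
+ *     return write_record(ctx, &rec);
+ */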
+
+/*
+ * Reads a record from the stream, and fills in the record structure.
+ *
+ * Returns 0 on success and non-0 on failure.
+ *
+ * On success, the record's type and size shall be valid.
+ * - If size is 0, data shall be NULL.
+ * - If size is non-0, data shall be a buffer allocated by malloc() which must
+ * be passed to free() by the caller.
+ *
+ * On failure, the contents of the record structure are undefined.
+ */
+int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
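+
+/*
+ * Illustrative sketch of the canonical consumer loop (see restore() in
+ * xg_sr_restore.c); the dispatcher is responsible for freeing rec.data:
+ *
+ *     struct xc_sr_record rec;
+ *
+ *     do {
+ *         if ( read_record(ctx, ctx->fd, &rec) )
+ *             return -1;
+ *         if ( process_record(ctx, &rec) )
+ *             return -1;
+ *     } while ( rec.type != REC_TYPE_END );
+ */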
+
+/*
+ * This would ideally be private in restore.c, but is needed by
+ * x86_pv_localise_page() if we receive pagetable frames ahead of the
+ * contents of the frames they point at.
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
+ const xen_pfn_t *original_pfns, const uint32_t *types);
+
+/* Handle a STATIC_DATA_END record. */
+int handle_static_data_end(struct xc_sr_context *ctx);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include "xg_sr_common_x86.h"
+
+int write_x86_tsc_info(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_tsc_info tsc = {};
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_TSC_INFO,
+ .length = sizeof(tsc),
+ .data = &tsc,
+ };
+
+ if ( xc_domain_get_tsc_info(xch, ctx->domid, &tsc.mode,
+ &tsc.nsec, &tsc.khz, &tsc.incarnation) < 0 )
+ {
+ PERROR("Unable to obtain TSC information");
+ return -1;
+ }
+
+ return write_record(ctx, &rec);
+}
+
+int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_tsc_info *tsc = rec->data;
+
+ if ( rec->length != sizeof(*tsc) )
+ {
+ ERROR("X86_TSC_INFO record wrong size: length %u, expected %zu",
+ rec->length, sizeof(*tsc));
+ return -1;
+ }
+
+ if ( xc_domain_set_tsc_info(xch, ctx->domid, tsc->mode,
+ tsc->nsec, tsc->khz, tsc->incarnation) )
+ {
+ PERROR("Unable to set TSC information");
+ return -1;
+ }
+
+ return 0;
+}
+
+int write_x86_cpu_policy_records(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_record cpuid = { .type = REC_TYPE_X86_CPUID_POLICY, };
+ struct xc_sr_record msrs = { .type = REC_TYPE_X86_MSR_POLICY, };
+ uint32_t nr_leaves = 0, nr_msrs = 0;
+ int rc = 0;
+
+ if ( xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs) < 0 )
+ {
+ PERROR("Unable to get CPU Policy size");
+ return -1;
+ }
+
+ cpuid.data = malloc(nr_leaves * sizeof(xen_cpuid_leaf_t));
+ msrs.data = malloc(nr_msrs * sizeof(xen_msr_entry_t));
+ if ( !cpuid.data || !msrs.data )
+ {
+ ERROR("Cannot allocate memory for CPU Policy");
+ rc = -1;
+ goto out;
+ }
+
+ if ( xc_get_domain_cpu_policy(xch, ctx->domid, &nr_leaves, cpuid.data,
+ &nr_msrs, msrs.data) )
+ {
+ PERROR("Unable to get d%d CPU Policy", ctx->domid);
+ rc = -1;
+ goto out;
+ }
+
+ cpuid.length = nr_leaves * sizeof(xen_cpuid_leaf_t);
+ if ( cpuid.length )
+ {
+ rc = write_record(ctx, &cpuid);
+ if ( rc )
+ goto out;
+ }
+
+ msrs.length = nr_msrs * sizeof(xen_msr_entry_t);
+ if ( msrs.length )
+ rc = write_record(ctx, &msrs);
+
+ out:
+ free(cpuid.data);
+ free(msrs.data);
+
+ return rc;
+}
+
+int handle_x86_cpuid_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ if ( rec->length == 0 ||
+ rec->length % sizeof(xen_cpuid_leaf_t) != 0 )
+ {
+ ERROR("X86_CPUID_POLICY size %u should be multiple of %zu",
+ rec->length, sizeof(xen_cpuid_leaf_t));
+ return -1;
+ }
+
+ rc = update_blob(&ctx->x86.restore.cpuid, rec->data, rec->length);
+ if ( rc )
+ ERROR("Unable to allocate %u bytes for X86_CPUID_POLICY", rec->length);
+
+ return rc;
+}
+
+int handle_x86_msr_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ if ( rec->length == 0 ||
+ rec->length % sizeof(xen_msr_entry_t) != 0 )
+ {
+ ERROR("X86_MSR_POLICY size %u should be multiple of %zu",
+ rec->length, sizeof(xen_msr_entry_t));
+ return -1;
+ }
+
+ rc = update_blob(&ctx->x86.restore.msr, rec->data, rec->length);
+ if ( rc )
+ ERROR("Unable to allocate %u bytes for X86_MSR_POLICY", rec->length);
+
+ return rc;
+}
+
+int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing)
+{
+ xc_interface *xch = ctx->xch;
+ uint32_t nr_leaves = 0, nr_msrs = 0;
+ uint32_t err_l = ~0, err_s = ~0, err_m = ~0;
+
+ if ( ctx->x86.restore.cpuid.ptr )
+ nr_leaves = ctx->x86.restore.cpuid.size / sizeof(xen_cpuid_leaf_t);
+ else
+ *missing |= XGR_SDD_MISSING_CPUID;
+
+ if ( ctx->x86.restore.msr.ptr )
+ nr_msrs = ctx->x86.restore.msr.size / sizeof(xen_msr_entry_t);
+ else
+ *missing |= XGR_SDD_MISSING_MSR;
+
+ if ( (nr_leaves || nr_msrs) &&
+ xc_set_domain_cpu_policy(xch, ctx->domid,
+ nr_leaves, ctx->x86.restore.cpuid.ptr,
+ nr_msrs, ctx->x86.restore.msr.ptr,
+ &err_l, &err_s, &err_m) )
+ {
+ PERROR("Failed to set CPUID policy: leaf %08x, subleaf %08x, msr %08x",
+ err_l, err_s, err_m);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#ifndef __COMMON_X86__H
+#define __COMMON_X86__H
+
+#include "xg_sr_common.h"
+
+/*
+ * Obtains a domain's TSC information from Xen and writes an X86_TSC_INFO record
+ * into the stream.
+ */
+int write_x86_tsc_info(struct xc_sr_context *ctx);
+
+/*
+ * Parses a X86_TSC_INFO record and applies the result to the domain.
+ */
+int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+
+/*
+ * Obtains a domain's CPU Policy from Xen, and writes X86_{CPUID,MSR}_POLICY
+ * records into the stream.
+ */
+int write_x86_cpu_policy_records(struct xc_sr_context *ctx);
+
+/*
+ * Parses an X86_CPUID_POLICY record and stashes the content for application
+ * when a STATIC_DATA_END record is encountered.
+ */
+int handle_x86_cpuid_policy(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec);
+
+/*
+ * Parses an X86_MSR_POLICY record and stashes the content for application
+ * when a STATIC_DATA_END record is encountered.
+ */
+int handle_x86_msr_policy(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec);
+
+/*
+ * Perform common x86 actions required after the static data has arrived.
+ */
+int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+
+#include "xg_sr_common_x86_pv.h"
+
+xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ assert(mfn <= ctx->x86.pv.max_mfn);
+ return ctx->x86.pv.m2p[mfn];
+}
+
+bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ return ((mfn <= ctx->x86.pv.max_mfn) &&
+ (mfn_to_pfn(ctx, mfn) <= ctx->x86.pv.max_pfn) &&
+ (xc_pfn_to_mfn(mfn_to_pfn(ctx, mfn), ctx->x86.pv.p2m,
+ ctx->x86.pv.width) == mfn));
+}
+
+void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t pfn = ~0UL;
+
+ ERROR("mfn %#lx, max %#lx", mfn, ctx->x86.pv.max_mfn);
+
+ if ( (mfn != ~0UL) && (mfn <= ctx->x86.pv.max_mfn) )
+ {
+ pfn = ctx->x86.pv.m2p[mfn];
+ ERROR(" m2p[%#lx] = %#lx, max_pfn %#lx",
+ mfn, pfn, ctx->x86.pv.max_pfn);
+ }
+
+ if ( (pfn != ~0UL) && (pfn <= ctx->x86.pv.max_pfn) )
+ ERROR(" p2m[%#lx] = %#lx",
+ pfn, xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width));
+}
+
+xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3)
+{
+ if ( ctx->x86.pv.width == 8 )
+ return cr3 >> 12;
+ else
+ {
+ /* 32bit guests can't represent mfns wider than 32 bits */
+ if ( cr3 & 0xffffffff00000000UL )
+ return ~0UL;
+ else
+ return (uint32_t)((cr3 >> 12) | (cr3 << 20));
+ }
+}
+
+uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t _mfn)
+{
+ uint64_t mfn = _mfn;
+
+ if ( ctx->x86.pv.width == 8 )
+ return mfn << 12;
+ else
+ {
+ /* 32bit guests can't represent mfns wider than 32 bits */
+ if ( mfn & 0xffffffff00000000UL )
+ return ~0UL;
+ else
+ return (uint32_t)((mfn << 12) | (mfn >> 20));
+ }
+}
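+
+/*
+ * Worked example of the extended-cr3 packing above, for a 32bit guest
+ * (width 4) with mfn 0x12345678:
+ *
+ *     mfn_to_cr3(): (uint32_t)((0x12345678 << 12) | (0x12345678 >> 20))
+ *                 = 0x45678000 | 0x123 = 0x45678123
+ *
+ *     cr3_to_mfn(): (uint32_t)((0x45678123 >> 12) | (0x45678123 << 20))
+ *                 = 0x00045678 | 0x12300000 = 0x12345678
+ */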
+
+int x86_pv_domain_info(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int guest_width, guest_levels;
+
+ /* Get the domain width */
+ if ( xc_domain_get_guest_width(xch, ctx->domid, &guest_width) )
+ {
+ PERROR("Unable to determine dom%d's width", ctx->domid);
+ return -1;
+ }
+
+ if ( guest_width == 4 )
+ guest_levels = 3;
+ else if ( guest_width == 8 )
+ guest_levels = 4;
+ else
+ {
+ ERROR("Invalid guest width %d. Expected 32 or 64", guest_width * 8);
+ return -1;
+ }
+ ctx->x86.pv.width = guest_width;
+ ctx->x86.pv.levels = guest_levels;
+
+ DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels);
+
+ return 0;
+}
+
+int x86_pv_map_m2p(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t m2p_chunks, m2p_size, max_page;
+ privcmd_mmap_entry_t *entries = NULL;
+ xen_pfn_t *extents_start = NULL;
+ int rc = -1, i;
+
+ if ( xc_maximum_ram_page(xch, &max_page) < 0 )
+ {
+ PERROR("Failed to get maximum ram page");
+ goto err;
+ }
+
+ ctx->x86.pv.max_mfn = max_page;
+ m2p_size = M2P_SIZE(ctx->x86.pv.max_mfn);
+ m2p_chunks = M2P_CHUNKS(ctx->x86.pv.max_mfn);
+
+ extents_start = malloc(m2p_chunks * sizeof(xen_pfn_t));
+ if ( !extents_start )
+ {
+ ERROR("Unable to allocate %lu bytes for m2p mfns",
+ m2p_chunks * sizeof(xen_pfn_t));
+ goto err;
+ }
+
+ if ( xc_machphys_mfn_list(xch, m2p_chunks, extents_start) )
+ {
+ PERROR("Failed to get m2p mfn list");
+ goto err;
+ }
+
+ entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t));
+ if ( !entries )
+ {
+ ERROR("Unable to allocate %lu bytes for m2p mapping mfns",
+ m2p_chunks * sizeof(privcmd_mmap_entry_t));
+ goto err;
+ }
+
+ for ( i = 0; i < m2p_chunks; ++i )
+ entries[i].mfn = extents_start[i];
+
+ ctx->x86.pv.m2p = xc_map_foreign_ranges(
+ xch, DOMID_XEN, m2p_size, PROT_READ,
+ M2P_CHUNK_SIZE, entries, m2p_chunks);
+
+ if ( !ctx->x86.pv.m2p )
+ {
+ PERROR("Failed to mmap() m2p ranges");
+ goto err;
+ }
+
+ ctx->x86.pv.nr_m2p_frames = (M2P_CHUNK_SIZE >> PAGE_SHIFT) * m2p_chunks;
+
+#ifdef __i386__
+ /* 32 bit toolstacks automatically get the compat m2p */
+ ctx->x86.pv.compat_m2p_mfn0 = entries[0].mfn;
+#else
+ /* 64 bit toolstacks need to ask Xen specially for it */
+ {
+ struct xen_machphys_mfn_list xmml = {
+ .max_extents = 1,
+ .extent_start = { &ctx->x86.pv.compat_m2p_mfn0 },
+ };
+
+ rc = do_memory_op(xch, XENMEM_machphys_compat_mfn_list,
+ &xmml, sizeof(xmml));
+ if ( rc || xmml.nr_extents != 1 )
+ {
+ PERROR("Failed to get compat mfn list from Xen");
+ rc = -1;
+ goto err;
+ }
+ }
+#endif
+
+ /* All Done */
+ rc = 0;
+ DPRINTF("max_mfn %#lx", ctx->x86.pv.max_mfn);
+
+ err:
+ free(entries);
+ free(extents_start);
+
+ return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#ifndef __COMMON_X86_PV_H
+#define __COMMON_X86_PV_H
+
+#include "xg_sr_common_x86.h"
+
+/* Virtual address ranges reserved for hypervisor. */
+#define HYPERVISOR_VIRT_START_X86_64 0xFFFF800000000000ULL
+#define HYPERVISOR_VIRT_END_X86_64 0xFFFF87FFFFFFFFFFULL
+
+#define HYPERVISOR_VIRT_START_X86_32 0x00000000F5800000ULL
+#define HYPERVISOR_VIRT_END_X86_32 0x00000000FFFFFFFFULL
+
+/*
+ * Convert an mfn to a pfn, given Xen's m2p table.
+ *
+ * Caller must ensure that the requested mfn is in range.
+ */
+xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Query whether a particular mfn is valid in the physmap of a guest.
+ */
+bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Debug a particular mfn by walking the p2m and m2p.
+ */
+void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/*
+ * Convert a PV cr3 field to an mfn.
+ *
+ * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
+ * a 32bit architectural cr3.
+ */
+xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3);
+
+/*
+ * Convert an mfn to a PV cr3 field.
+ *
+ * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
+ * a 32bit architectural cr3.
+ */
+uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t mfn);
+
+/* Bits 12 through 51 of a PTE point at the frame */
+#define PTE_FRAME_MASK 0x000ffffffffff000ULL
+
+/*
+ * Extract an mfn from a Pagetable Entry. May return INVALID_MFN if the pte
+ * would overflow a 32bit xen_pfn_t.
+ */
+static inline xen_pfn_t pte_to_frame(uint64_t pte)
+{
+ uint64_t frame = (pte & PTE_FRAME_MASK) >> PAGE_SHIFT;
+
+#ifdef __i386__
+ if ( frame >= INVALID_MFN )
+ return INVALID_MFN;
+#endif
+
+ return frame;
+}
+
+/*
+ * Change the frame in a Pagetable Entry while leaving the flags alone.
+ */
+static inline uint64_t merge_pte(uint64_t pte, xen_pfn_t mfn)
+{
+ return (pte & ~PTE_FRAME_MASK) | ((uint64_t)mfn << PAGE_SHIFT);
+}
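+
+/*
+ * Illustrative sketch only: when localising a pagetable page on restore,
+ * each present entry has its frame translated from pfn space to mfn space
+ * while preserving the flag bits, along the lines of:
+ *
+ *     xen_pfn_t pfn = pte_to_frame(pte);
+ *
+ *     pte = merge_pte(pte, pfn_to_mfn(ctx, pfn));
+ *
+ * (pfn_to_mfn() here being the restore side's p2m lookup; the save side
+ * performs the inverse using mfn_to_pfn().)
+ */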
+
+/*
+ * Get current domain information.
+ *
+ * Fills ctx->x86.pv
+ * - .width
+ * - .levels
+ * - .fpp
+ * - .p2m_frames
+ *
+ * Used by the save side to create the X86_PV_INFO record, and by the restore
+ * side to verify the incoming stream.
+ *
+ * Returns 0 on success and non-zero on error.
+ */
+int x86_pv_domain_info(struct xc_sr_context *ctx);
+
+/*
+ * Maps the Xen M2P.
+ *
+ * Fills ctx->x86.pv.
+ * - .max_mfn
+ * - .m2p
+ *
+ * Returns 0 on success and non-zero on error.
+ */
+int x86_pv_map_m2p(struct xc_sr_context *ctx);
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <arpa/inet.h>
+
+#include <assert.h>
+
+#include "xg_sr_common.h"
+
+/*
+ * Read and validate the Image and Domain headers.
+ */
+static int read_headers(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_ihdr ihdr;
+ struct xc_sr_dhdr dhdr;
+
+ if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+ {
+ PERROR("Failed to read Image Header from stream");
+ return -1;
+ }
+
+ ihdr.id = ntohl(ihdr.id);
+ ihdr.version = ntohl(ihdr.version);
+ ihdr.options = ntohs(ihdr.options);
+
+ if ( ihdr.marker != IHDR_MARKER )
+ {
+ ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
+ return -1;
+ }
+
+ if ( ihdr.id != IHDR_ID )
+ {
+ ERROR("Invalid ID: Expected 0x%08x, Got 0x%08x", IHDR_ID, ihdr.id);
+ return -1;
+ }
+
+ if ( ihdr.version < 2 || ihdr.version > 3 )
+ {
+ ERROR("Invalid Version: Expected 2 <= ver <= 3, Got %d",
+ ihdr.version);
+ return -1;
+ }
+
+ if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
+ {
+ ERROR("Unable to handle big endian streams");
+ return -1;
+ }
+
+ ctx->restore.format_version = ihdr.version;
+
+ if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+ {
+ PERROR("Failed to read Domain Header from stream");
+ return -1;
+ }
+
+ ctx->restore.guest_type = dhdr.type;
+ ctx->restore.guest_page_size = (1U << dhdr.page_shift);
+
+ if ( dhdr.xen_major == 0 )
+ {
+ IPRINTF("Found %s domain, converted from legacy stream format",
+ dhdr_type_to_str(dhdr.type));
+ DPRINTF(" Legacy conversion script version %u", dhdr.xen_minor);
+ }
+ else
+ IPRINTF("Found %s domain from Xen %u.%u",
+ dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
+ return 0;
+}
+
+/*
+ * Is a pfn populated?
+ */
+static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ if ( pfn > ctx->restore.max_populated_pfn )
+ return false;
+ return test_bit(pfn, ctx->restore.populated_pfns);
+}
+
+/*
+ * Set a pfn as populated, expanding the tracking structures if needed. To
+ * avoid realloc()ing too excessively, the size is increased to the nearest power
+ * of two large enough to contain the required pfn.
+ */
+static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( pfn > ctx->restore.max_populated_pfn )
+ {
+ xen_pfn_t new_max;
+ size_t old_sz, new_sz;
+ unsigned long *p;
+
+ /* Round up to one below the next power of two above pfn. */
+ new_max = pfn;
+ new_max |= new_max >> 1;
+ new_max |= new_max >> 2;
+ new_max |= new_max >> 4;
+ new_max |= new_max >> 8;
+ new_max |= new_max >> 16;
+#ifdef __x86_64__
+ new_max |= new_max >> 32;
+#endif
+
+ old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
+ new_sz = bitmap_size(new_max + 1);
+ p = realloc(ctx->restore.populated_pfns, new_sz);
+ if ( !p )
+ {
+ ERROR("Failed to realloc populated bitmap");
+ errno = ENOMEM;
+ return -1;
+ }
+
+ memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+
+ ctx->restore.populated_pfns = p;
+ ctx->restore.max_populated_pfn = new_max;
+ }
+
+ assert(!test_bit(pfn, ctx->restore.populated_pfns));
+ set_bit(pfn, ctx->restore.populated_pfns);
+
+ return 0;
+}
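+
+/*
+ * Worked example of the rounding above: marking pfn 0x1234 populated while
+ * max_populated_pfn is smaller smears the set bits downwards, giving
+ * new_max = 0x1fff, so the bitmap grows to cover 0x2000 pfns at once rather
+ * than being realloc()ed for every new pfn.
+ */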
+
+/*
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset. If types is NULL, no page type checking is performed
+ * and all unpopulated pfns are populated.
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
+ const xen_pfn_t *original_pfns, const uint32_t *types)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
+ *pfns = malloc(count * sizeof(*pfns));
+ unsigned int i, nr_pfns = 0;
+ int rc = -1;
+
+ if ( !mfns || !pfns )
+ {
+ ERROR("Failed to allocate %zu bytes for populating the physmap",
+ 2 * count * sizeof(*mfns));
+ goto err;
+ }
+
+ for ( i = 0; i < count; ++i )
+ {
+ if ( (!types ||
+ (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
+ types[i] != XEN_DOMCTL_PFINFO_BROKEN)) &&
+ !pfn_is_populated(ctx, original_pfns[i]) )
+ {
+ rc = pfn_set_populated(ctx, original_pfns[i]);
+ if ( rc )
+ goto err;
+ pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+ ++nr_pfns;
+ }
+ }
+
+ if ( nr_pfns )
+ {
+ rc = xc_domain_populate_physmap_exact(
+ xch, ctx->domid, nr_pfns, 0, 0, mfns);
+ if ( rc )
+ {
+ PERROR("Failed to populate physmap");
+ goto err;
+ }
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ if ( mfns[i] == INVALID_MFN )
+ {
+ ERROR("Populate physmap failed for pfn %u", i);
+ rc = -1;
+ goto err;
+ }
+
+ ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+ }
+ }
+
+ rc = 0;
+
+ err:
+ free(pfns);
+ free(mfns);
+
+ return rc;
+}
+
+/*
+ * Given a list of pfns, their types, and a block of page data from the
+ * stream, populate and record their types, map the relevant subset and copy
+ * the data into the guest.
+ */
+static int process_page_data(struct xc_sr_context *ctx, unsigned int count,
+ xen_pfn_t *pfns, uint32_t *types, void *page_data)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
+ int *map_errs = malloc(count * sizeof(*map_errs));
+ int rc;
+ void *mapping = NULL, *guest_page = NULL;
+ unsigned int i, /* i indexes the pfns from the record. */
+ j, /* j indexes the subset of pfns we decide to map. */
+ nr_pages = 0;
+
+ if ( !mfns || !map_errs )
+ {
+ rc = -1;
+ ERROR("Failed to allocate %zu bytes to process page data",
+ count * (sizeof(*mfns) + sizeof(*map_errs)));
+ goto err;
+ }
+
+ rc = populate_pfns(ctx, count, pfns, types);
+ if ( rc )
+ {
+ ERROR("Failed to populate pfns for batch of %u pages", count);
+ goto err;
+ }
+
+ for ( i = 0; i < count; ++i )
+ {
+ ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
+
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_NOTAB:
+
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+ mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
+ break;
+ }
+ }
+
+ /* Nothing to do? */
+ if ( nr_pages == 0 )
+ goto done;
+
+ mapping = guest_page = xenforeignmemory_map(
+ xch->fmem, ctx->domid, PROT_READ | PROT_WRITE,
+ nr_pages, mfns, map_errs);
+ if ( !mapping )
+ {
+ rc = -1;
+ PERROR("Unable to map %u mfns for %u pages of data",
+ nr_pages, count);
+ goto err;
+ }
+
+ for ( i = 0, j = 0; i < count; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_XTAB:
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ /* No page data to deal with. */
+ continue;
+ }
+
+ if ( map_errs[j] )
+ {
+ rc = -1;
+ ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d",
+ pfns[i], mfns[j], types[i], map_errs[j]);
+ goto err;
+ }
+
+ /* Undo page normalisation done by the saver. */
+ rc = ctx->restore.ops.localise_page(ctx, types[i], page_data);
+ if ( rc )
+ {
+ ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")",
+ pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ goto err;
+ }
+
+ if ( ctx->restore.verify )
+ {
+ /* Verify mode - compare incoming data to what we already have. */
+ if ( memcmp(guest_page, page_data, PAGE_SIZE) )
+ ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")",
+ pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+ }
+ else
+ {
+ /* Regular mode - copy incoming data into place. */
+ memcpy(guest_page, page_data, PAGE_SIZE);
+ }
+
+ ++j;
+ guest_page += PAGE_SIZE;
+ page_data += PAGE_SIZE;
+ }
+
+ done:
+ rc = 0;
+
+ err:
+ if ( mapping )
+ xenforeignmemory_unmap(xch->fmem, mapping, nr_pages);
+
+ free(map_errs);
+ free(mfns);
+
+ return rc;
+}
+
+/*
+ * Validate a PAGE_DATA record from the stream, and pass the results to
+ * process_page_data() to actually perform the legwork.
+ */
+static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_page_data_header *pages = rec->data;
+ unsigned int i, pages_of_data = 0;
+ int rc = -1;
+
+ xen_pfn_t *pfns = NULL, pfn;
+ uint32_t *types = NULL, type;
+
+ /*
+ * v2 compatibility only exists for x86 streams. This is a bit of a
+ * bodge, but it is less bad than duplicating handle_page_data() between
+ * different architectures.
+ */
+#if defined(__i386__) || defined(__x86_64__)
+ /* v2 compat. Infer the position of STATIC_DATA_END. */
+ if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
+ {
+ rc = handle_static_data_end(ctx);
+ if ( rc )
+ {
+ ERROR("Inferred STATIC_DATA_END record failed");
+ goto err;
+ }
+ rc = -1;
+ }
+
+ if ( !ctx->restore.seen_static_data_end )
+ {
+ ERROR("No STATIC_DATA_END seen");
+ goto err;
+ }
+#endif
+
+ if ( rec->length < sizeof(*pages) )
+ {
+ ERROR("PAGE_DATA record truncated: length %u, min %zu",
+ rec->length, sizeof(*pages));
+ goto err;
+ }
+
+ if ( pages->count < 1 )
+ {
+ ERROR("Expected at least 1 pfn in PAGE_DATA record");
+ goto err;
+ }
+
+ if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) )
+ {
+ ERROR("PAGE_DATA record (length %u) too short to contain %u"
+ " pfns worth of information", rec->length, pages->count);
+ goto err;
+ }
+
+ pfns = malloc(pages->count * sizeof(*pfns));
+ types = malloc(pages->count * sizeof(*types));
+ if ( !pfns || !types )
+ {
+ ERROR("Unable to allocate enough memory for %u pfns",
+ pages->count);
+ goto err;
+ }
+
+ for ( i = 0; i < pages->count; ++i )
+ {
+ pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
+ if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) )
+ {
+ ERROR("pfn %#"PRIpfn" (index %u) outside domain maximum", pfn, i);
+ goto err;
+ }
+
+ type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32;
+ if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
+ ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )
+ {
+ ERROR("Invalid type %#"PRIx32" for pfn %#"PRIpfn" (index %u)",
+ type, pfn, i);
+ goto err;
+ }
+
+ if ( type < XEN_DOMCTL_PFINFO_BROKEN )
+ /* NOTAB and all L1 through L4 tables (including pinned) should
+ * have a page worth of data in the record. */
+ pages_of_data++;
+
+ pfns[i] = pfn;
+ types[i] = type;
+ }
+
+ if ( rec->length != (sizeof(*pages) +
+ (sizeof(uint64_t) * pages->count) +
+ (PAGE_SIZE * pages_of_data)) )
+ {
+ ERROR("PAGE_DATA record wrong size: length %u, expected "
+ "%zu + %zu + %lu", rec->length, sizeof(*pages),
+ (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
+ goto err;
+ }
+
+ rc = process_page_data(ctx, pages->count, pfns, types,
+ &pages->pfn[pages->count]);
+ err:
+ free(types);
+ free(pfns);
+
+ return rc;
+}
+
+/*
+ * Send checkpoint dirty pfn list to primary.
+ */
+static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ unsigned int count, written;
+ uint64_t i, *pfns = NULL;
+ struct iovec *iov = NULL;
+ xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
+ };
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->restore.dirty_bitmap_hbuf);
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
+ NULL, 0, &stats) != ctx->restore.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ goto err;
+ }
+
+ for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
+ {
+ if ( test_bit(i, dirty_bitmap) )
+ count++;
+ }
+
+ pfns = malloc(count * sizeof(*pfns));
+ if ( !pfns )
+ {
+ ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
+ count * sizeof(*pfns));
+ goto err;
+ }
+
+ for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
+ {
+ if ( !test_bit(i, dirty_bitmap) )
+ continue;
+
+ if ( written >= count )
+ {
+ ERROR("Dirty pfn list overflowed");
+ goto err;
+ }
+
+ pfns[written++] = i;
+ }
+
+ /* iovec[] for writev(). */
+ iov = malloc(3 * sizeof(*iov));
+ if ( !iov )
+ {
+ ERROR("Unable to allocate memory for sending dirty bitmap");
+ goto err;
+ }
+
+ rec.length = count * sizeof(*pfns);
+
+ iov[0].iov_base = &rec.type;
+ iov[0].iov_len = sizeof(rec.type);
+
+ iov[1].iov_base = &rec.length;
+ iov[1].iov_len = sizeof(rec.length);
+
+ iov[2].iov_base = pfns;
+ iov[2].iov_len = count * sizeof(*pfns);
+
+ if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
+ {
+ PERROR("Failed to write dirty bitmap to stream");
+ goto err;
+ }
+
+ rc = 0;
+ err:
+ free(pfns);
+ free(iov);
+ return rc;
+}
+
+static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
+static int handle_checkpoint(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = 0, ret;
+ unsigned int i;
+
+ if ( ctx->stream_type == XC_STREAM_PLAIN )
+ {
+ ERROR("Found checkpoint in non-checkpointed stream");
+ rc = -1;
+ goto err;
+ }
+
+ ret = ctx->restore.callbacks->checkpoint(ctx->restore.callbacks->data);
+ switch ( ret )
+ {
+ case XGR_CHECKPOINT_SUCCESS:
+ break;
+
+ case XGR_CHECKPOINT_FAILOVER:
+ if ( ctx->restore.buffer_all_records )
+ rc = BROKEN_CHANNEL;
+ else
+ /* We don't have a consistent state */
+ rc = -1;
+ goto err;
+
+ default: /* Other fatal error */
+ rc = -1;
+ goto err;
+ }
+
+ if ( ctx->restore.buffer_all_records )
+ {
+ IPRINTF("All records buffered");
+
+ for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
+ {
+ rc = process_record(ctx, &ctx->restore.buffered_records[i]);
+ if ( rc )
+ goto err;
+ }
+ ctx->restore.buffered_rec_num = 0;
+ IPRINTF("All records processed");
+ }
+ else
+ ctx->restore.buffer_all_records = true;
+
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ {
+#define HANDLE_CALLBACK_RETURN_VALUE(ret) \
+ do { \
+ if ( ret == 1 ) \
+ rc = 0; /* Success */ \
+ else \
+ { \
+ if ( ret == 2 ) \
+ rc = BROKEN_CHANNEL; \
+ else \
+ rc = -1; /* Some unspecified error */ \
+ goto err; \
+ } \
+ } while (0)
+
+ /* COLO */
+
+ /* We need to resume the guest. */
+ rc = ctx->restore.ops.stream_complete(ctx);
+ if ( rc )
+ goto err;
+
+ ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+ ctx->restore.console_gfn,
+ ctx->restore.callbacks->data);
+
+ /* Resume the secondary vm. */
+ ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+ /* Wait for a new checkpoint */
+ ret = ctx->restore.callbacks->wait_checkpoint(
+ ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+ /* Suspend the secondary vm. */
+ ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+#undef HANDLE_CALLBACK_RETURN_VALUE
+
+ rc = send_checkpoint_dirty_pfn_list(ctx);
+ if ( rc )
+ goto err;
+ }
+
+ err:
+ return rc;
+}
+
+static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int new_alloc_num;
+ struct xc_sr_record *p;
+
+ if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num )
+ {
+ new_alloc_num = ctx->restore.allocated_rec_num + DEFAULT_BUF_RECORDS;
+ p = realloc(ctx->restore.buffered_records,
+ new_alloc_num * sizeof(struct xc_sr_record));
+ if ( !p )
+ {
+ ERROR("Failed to realloc memory for buffered records");
+ return -1;
+ }
+
+ ctx->restore.buffered_records = p;
+ ctx->restore.allocated_rec_num = new_alloc_num;
+ }
+
+ memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++],
+ rec, sizeof(*rec));
+
+ return 0;
+}
+
+int handle_static_data_end(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int missing = 0;
+ int rc = 0;
+
+ if ( ctx->restore.seen_static_data_end )
+ {
+ ERROR("Multiple STATIC_DATA_END records found");
+ return -1;
+ }
+
+ ctx->restore.seen_static_data_end = true;
+
+ rc = ctx->restore.ops.static_data_complete(ctx, &missing);
+ if ( rc )
+ return rc;
+
+ if ( ctx->restore.callbacks->static_data_done &&
+ (rc = ctx->restore.callbacks->static_data_done(
+ missing, ctx->restore.callbacks->data)) != 0 )
+ ERROR("static_data_done() callback failed: %d", rc);
+
+ return rc;
+}
+
+static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = 0;
+
+ switch ( rec->type )
+ {
+ case REC_TYPE_END:
+ break;
+
+ case REC_TYPE_PAGE_DATA:
+ rc = handle_page_data(ctx, rec);
+ break;
+
+ case REC_TYPE_VERIFY:
+ DPRINTF("Verify mode enabled");
+ ctx->restore.verify = true;
+ break;
+
+ case REC_TYPE_CHECKPOINT:
+ rc = handle_checkpoint(ctx);
+ break;
+
+ case REC_TYPE_STATIC_DATA_END:
+ rc = handle_static_data_end(ctx);
+ break;
+
+ default:
+ rc = ctx->restore.ops.process_record(ctx, rec);
+ break;
+ }
+
+ free(rec->data);
+ rec->data = NULL;
+
+ return rc;
+}
+
+static int setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->restore.dirty_bitmap_hbuf);
+
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ {
+ dirty_bitmap = xc_hypercall_buffer_alloc_pages(
+ xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+ if ( !dirty_bitmap )
+ {
+ ERROR("Unable to allocate memory for dirty bitmap");
+ rc = -1;
+ goto err;
+ }
+ }
+
+ rc = ctx->restore.ops.setup(ctx);
+ if ( rc )
+ goto err;
+
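+ /* Initial window of 8192 pfns (32MB of 4k pages); expanded on demand. */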
+ ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
+ ctx->restore.populated_pfns = bitmap_alloc(
+ ctx->restore.max_populated_pfn + 1);
+ if ( !ctx->restore.populated_pfns )
+ {
+ ERROR("Unable to allocate memory for populated_pfns bitmap");
+ rc = -1;
+ goto err;
+ }
+
+ ctx->restore.buffered_records = malloc(
+ DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record));
+ if ( !ctx->restore.buffered_records )
+ {
+ ERROR("Unable to allocate memory for buffered records");
+ rc = -1;
+ goto err;
+ }
+ ctx->restore.allocated_rec_num = DEFAULT_BUF_RECORDS;
+
+ err:
+ return rc;
+}
+
+static void cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int i;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->restore.dirty_bitmap_hbuf);
+
+ for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
+ free(ctx->restore.buffered_records[i].data);
+
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ xc_hypercall_buffer_free_pages(
+ xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+ free(ctx->restore.buffered_records);
+ free(ctx->restore.populated_pfns);
+
+ if ( ctx->restore.ops.cleanup(ctx) )
+ PERROR("Failed to clean up");
+}
+
+/*
+ * Restore a domain.
+ */
+static int restore(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_record rec;
+ int rc, saved_rc = 0, saved_errno = 0;
+
+ IPRINTF("Restoring domain");
+
+ rc = setup(ctx);
+ if ( rc )
+ goto err;
+
+ do
+ {
+ rc = read_record(ctx, ctx->fd, &rec);
+ if ( rc )
+ {
+ if ( ctx->restore.buffer_all_records )
+ goto remus_failover;
+ else
+ goto err;
+ }
+
+ if ( ctx->restore.buffer_all_records &&
+ rec.type != REC_TYPE_END &&
+ rec.type != REC_TYPE_CHECKPOINT )
+ {
+ rc = buffer_record(ctx, &rec);
+ if ( rc )
+ goto err;
+ }
+ else
+ {
+ rc = process_record(ctx, &rec);
+ if ( rc == RECORD_NOT_PROCESSED )
+ {
+ if ( rec.type & REC_TYPE_OPTIONAL )
+ DPRINTF("Ignoring optional record %#x (%s)",
+ rec.type, rec_type_to_str(rec.type));
+ else
+ {
+ ERROR("Mandatory record %#x (%s) not handled",
+ rec.type, rec_type_to_str(rec.type));
+ rc = -1;
+ goto err;
+ }
+ }
+ else if ( rc == BROKEN_CHANNEL )
+ goto remus_failover;
+ else if ( rc )
+ goto err;
+ }
+
+ } while ( rec.type != REC_TYPE_END );
+
+ remus_failover:
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ {
+ /* With COLO, we have already called stream_complete */
+ rc = 0;
+ IPRINTF("COLO Failover");
+ goto done;
+ }
+
+ /*
+ * With Remus, if we reach here, there must be some error on primary,
+ * failover from the last checkpoint state.
+ */
+ rc = ctx->restore.ops.stream_complete(ctx);
+ if ( rc )
+ goto err;
+
+ IPRINTF("Restore successful");
+ goto done;
+
+ err:
+ saved_errno = errno;
+ saved_rc = rc;
+ PERROR("Restore failed");
+
+ done:
+ cleanup(ctx);
+
+ if ( saved_rc )
+ {
+ rc = saved_rc;
+ errno = saved_errno;
+ }
+
+ return rc;
+}
+
+int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
+ unsigned int store_evtchn, unsigned long *store_mfn,
+ uint32_t store_domid, unsigned int console_evtchn,
+ unsigned long *console_gfn, uint32_t console_domid,
+ xc_stream_type_t stream_type,
+ struct restore_callbacks *callbacks, int send_back_fd)
+{
+ xen_pfn_t nr_pfns;
+ struct xc_sr_context ctx = {
+ .xch = xch,
+ .fd = io_fd,
+ .stream_type = stream_type,
+ };
+
+ /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
+ ctx.restore.console_evtchn = console_evtchn;
+ ctx.restore.console_domid = console_domid;
+ ctx.restore.xenstore_evtchn = store_evtchn;
+ ctx.restore.xenstore_domid = store_domid;
+ ctx.restore.callbacks = callbacks;
+ ctx.restore.send_back_fd = send_back_fd;
+
+ /* Sanity check stream_type-related parameters */
+ switch ( stream_type )
+ {
+ case XC_STREAM_COLO:
+ assert(callbacks->suspend &&
+ callbacks->postcopy &&
+ callbacks->wait_checkpoint &&
+ callbacks->restore_results);
+ /* Fallthrough */
+ case XC_STREAM_REMUS:
+ assert(callbacks->checkpoint);
+ /* Fallthrough */
+ case XC_STREAM_PLAIN:
+ break;
+
+ default:
+ assert(!"Bad stream_type");
+ break;
+ }
+
+ if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+ {
+ PERROR("Failed to get domain info");
+ return -1;
+ }
+
+ if ( ctx.dominfo.domid != dom )
+ {
+ ERROR("Domain %u does not exist", dom);
+ return -1;
+ }
+
+ DPRINTF("fd %d, dom %u, hvm %u, stream_type %d",
+ io_fd, dom, ctx.dominfo.hvm, stream_type);
+
+ ctx.domid = dom;
+
+ if ( read_headers(&ctx) )
+ return -1;
+
+ if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+ {
+ PERROR("Unable to obtain the guest p2m size");
+ return -1;
+ }
+
+ ctx.restore.p2m_size = nr_pfns;
+ ctx.restore.ops = ctx.dominfo.hvm
+ ? restore_ops_x86_hvm : restore_ops_x86_pv;
+
+ if ( restore(&ctx) )
+ return -1;
+
+ IPRINTF("XenStore: mfn %#"PRIpfn", dom %d, evt %u",
+ ctx.restore.xenstore_gfn,
+ ctx.restore.xenstore_domid,
+ ctx.restore.xenstore_evtchn);
+
+ IPRINTF("Console: mfn %#"PRIpfn", dom %d, evt %u",
+ ctx.restore.console_gfn,
+ ctx.restore.console_domid,
+ ctx.restore.console_evtchn);
+
+ *console_gfn = ctx.restore.console_gfn;
+ *store_mfn = ctx.restore.xenstore_gfn;
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "xg_sr_common_x86.h"
+
+/*
+ * Process an HVM_CONTEXT record from the stream.
+ */
+static int handle_hvm_context(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = update_blob(&ctx->x86.hvm.restore.context, rec->data, rec->length);
+
+ if ( rc )
+ ERROR("Unable to allocate %u bytes for hvm context", rec->length);
+
+ return rc;
+}
+
+/*
+ * Process an HVM_PARAMS record from the stream.
+ */
+static int handle_hvm_params(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_hvm_params *hdr = rec->data;
+ struct xc_sr_rec_hvm_params_entry *entry = hdr->param;
+ unsigned int i;
+ int rc;
+
+ if ( rec->length < sizeof(*hdr) )
+ {
+ ERROR("HVM_PARAMS record truncated: length %u, header size %zu",
+ rec->length, sizeof(*hdr));
+ return -1;
+ }
+
+ if ( rec->length != (sizeof(*hdr) + hdr->count * sizeof(*entry)) )
+ {
+ ERROR("HVM_PARAMS record truncated: header %zu, count %u, "
+ "expected len %zu, got %u",
+ sizeof(*hdr), hdr->count, hdr->count * sizeof(*entry),
+ rec->length);
+ return -1;
+ }
+
+ /*
+ * Tolerate empty records. Older sending sides used to accidentally
+ * generate them.
+ */
+ if ( hdr->count == 0 )
+ {
+ DBGPRINTF("Skipping empty HVM_PARAMS record\n");
+ return 0;
+ }
+
+ for ( i = 0; i < hdr->count; i++, entry++ )
+ {
+ switch ( entry->index )
+ {
+ case HVM_PARAM_CONSOLE_PFN:
+ ctx->restore.console_gfn = entry->value;
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+ case HVM_PARAM_STORE_PFN:
+ ctx->restore.xenstore_gfn = entry->value;
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+ case HVM_PARAM_IOREQ_PFN:
+ case HVM_PARAM_BUFIOREQ_PFN:
+ xc_clear_domain_page(xch, ctx->domid, entry->value);
+ break;
+
+ case HVM_PARAM_PAE_ENABLED:
+ /*
+ * This HVM_PARAM only ever existed to pass data into
+ * xc_cpuid_apply_policy(). The function has now been updated to
+ * use a normal calling convention, making the param obsolete.
+ *
+ * Discard if we find it in an old migration stream.
+ */
+ continue;
+ }
+
+ rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value);
+ if ( rc < 0 )
+ {
+ PERROR("set HVM param %"PRId64" = 0x%016"PRIx64,
+ entry->index, entry->value);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+/* restore_ops function. */
+static bool x86_hvm_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ return true;
+}
+
+/* restore_ops function. */
+static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ return pfn;
+}
+
+/* restore_ops function. */
+static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t gfn)
+{
+ /* no-op */
+}
+
+/* restore_ops function. */
+static void x86_hvm_set_page_type(struct xc_sr_context *ctx,
+ xen_pfn_t pfn, xen_pfn_t type)
+{
+ /* no-op */
+}
+
+/* restore_ops function. */
+static int x86_hvm_localise_page(struct xc_sr_context *ctx,
+ uint32_t type, void *page)
+{
+ /* no-op */
+ return 0;
+}
+
+/*
+ * restore_ops function. Confirms the stream matches the domain.
+ */
+static int x86_hvm_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
+ {
+ ERROR("Unable to restore %s domain into an x86 HVM domain",
+ dhdr_type_to_str(ctx->restore.guest_type));
+ return -1;
+ }
+
+ if ( ctx->restore.guest_page_size != PAGE_SIZE )
+ {
+ ERROR("Invalid page size %u for x86 HVM domains",
+ ctx->restore.guest_page_size);
+ return -1;
+ }
+
+#ifdef __i386__
+ /* Very large domains (> 1TB) will exhaust virtual address space. */
+ if ( ctx->restore.p2m_size > 0x0fffffff )
+ {
+ errno = E2BIG;
+ PERROR("Cannot restore this big a guest");
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_hvm_process_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_TSC_INFO:
+ return handle_x86_tsc_info(ctx, rec);
+
+ case REC_TYPE_HVM_CONTEXT:
+ return handle_hvm_context(ctx, rec);
+
+ case REC_TYPE_HVM_PARAMS:
+ return handle_hvm_params(ctx, rec);
+
+ case REC_TYPE_X86_CPUID_POLICY:
+ return handle_x86_cpuid_policy(ctx, rec);
+
+ case REC_TYPE_X86_MSR_POLICY:
+ return handle_x86_msr_policy(ctx, rec);
+
+ default:
+ return RECORD_NOT_PROCESSED;
+ }
+}
+
+/*
+ * restore_ops function. Sets extra hvm parameters and seeds the grant table.
+ */
+static int x86_hvm_stream_complete(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_STORE_EVTCHN,
+ ctx->restore.xenstore_evtchn);
+ if ( rc )
+ {
+ PERROR("Failed to set HVM_PARAM_STORE_EVTCHN");
+ return rc;
+ }
+
+ rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_CONSOLE_EVTCHN,
+ ctx->restore.console_evtchn);
+ if ( rc )
+ {
+ PERROR("Failed to set HVM_PARAM_CONSOLE_EVTCHN");
+ return rc;
+ }
+
+ rc = xc_domain_hvm_setcontext(xch, ctx->domid,
+ ctx->x86.hvm.restore.context.ptr,
+ ctx->x86.hvm.restore.context.size);
+ if ( rc < 0 )
+ {
+ PERROR("Unable to restore HVM context");
+ return rc;
+ }
+
+ rc = xc_dom_gnttab_seed(xch, ctx->domid, true,
+ ctx->restore.console_gfn,
+ ctx->restore.xenstore_gfn,
+ ctx->restore.console_domid,
+ ctx->restore.xenstore_domid);
+ if ( rc )
+ {
+ PERROR("Failed to seed grant table");
+ return rc;
+ }
+
+ return rc;
+}
+
+static int x86_hvm_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86.hvm.restore.context.ptr);
+
+ free(ctx->x86.restore.cpuid.ptr);
+ free(ctx->x86.restore.msr.ptr);
+
+ return 0;
+}
+
+struct xc_sr_restore_ops restore_ops_x86_hvm =
+{
+ .pfn_is_valid = x86_hvm_pfn_is_valid,
+ .pfn_to_gfn = x86_hvm_pfn_to_gfn,
+ .set_gfn = x86_hvm_set_gfn,
+ .set_page_type = x86_hvm_set_page_type,
+ .localise_page = x86_hvm_localise_page,
+ .setup = x86_hvm_setup,
+ .process_record = x86_hvm_process_record,
+ .static_data_complete = x86_static_data_complete,
+ .stream_complete = x86_hvm_stream_complete,
+ .cleanup = x86_hvm_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+
+#include "xg_sr_common_x86_pv.h"
+
+static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ assert(pfn <= ctx->x86.pv.max_pfn);
+
+ return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
+}
+
+/*
+ * Expand our local tracking information for the p2m table and the domain's maximum
+ * size. Normally this will be called once to expand from 0 to max_pfn, but
+ * is liable to expand multiple times if the domain grows on the sending side
+ * after migration has started.
+ */
+static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned long old_max = ctx->x86.pv.max_pfn, i;
+ unsigned int fpp = PAGE_SIZE / ctx->x86.pv.width;
+ unsigned long end_frame = (max_pfn / fpp) + 1;
+ unsigned long old_end_frame = (old_max / fpp) + 1;
+ xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
+ uint32_t *pfn_types = NULL;
+ size_t p2msz, p2m_pfnsz, pfn_typesz;
+
+ assert(max_pfn > old_max);
+
+ p2msz = (max_pfn + 1) * ctx->x86.pv.width;
+ p2m = realloc(ctx->x86.pv.p2m, p2msz);
+ if ( !p2m )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
+ return -1;
+ }
+ ctx->x86.pv.p2m = p2m;
+
+ pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
+ pfn_types = realloc(ctx->x86.pv.restore.pfn_types, pfn_typesz);
+ if ( !pfn_types )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
+ return -1;
+ }
+ ctx->x86.pv.restore.pfn_types = pfn_types;
+
+ p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
+ p2m_pfns = realloc(ctx->x86.pv.p2m_pfns, p2m_pfnsz);
+ if ( !p2m_pfns )
+ {
+ ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
+ return -1;
+ }
+ ctx->x86.pv.p2m_frames = end_frame;
+ ctx->x86.pv.p2m_pfns = p2m_pfns;
+
+ ctx->x86.pv.max_pfn = max_pfn;
+ for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
+ {
+ ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
+ ctx->restore.ops.set_page_type(ctx, i, 0);
+ }
+
+ for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i )
+ ctx->x86.pv.p2m_pfns[i] = INVALID_MFN;
+
+ DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn);
+ return 0;
+}
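+
+/*
+ * Worked example of the frame arithmetic above: for a 64bit guest (width 8),
+ * fpp = PAGE_SIZE / 8 = 512 pfns per p2m frame, so expanding to
+ * max_pfn = 0x100000 yields end_frame = (0x100000 / 512) + 1 = 0x801 frames
+ * to track.
+ */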
+
+/*
+ * Pin all of the pagetables.
+ */
+static int pin_pagetables(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned long i, nr_pins;
+ struct mmuext_op pin[MAX_PIN_BATCH];
+
+ for ( i = nr_pins = 0; i <= ctx->x86.pv.max_pfn; ++i )
+ {
+ if ( (ctx->x86.pv.restore.pfn_types[i] &
+ XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+ continue;
+
+ switch ( (ctx->x86.pv.restore.pfn_types[i] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) )
+ {
+ case XEN_DOMCTL_PFINFO_L1TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L2TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
+ break;
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
+ break;
+ default:
+ continue;
+ }
+
+ pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i);
+ nr_pins++;
+
+ if ( nr_pins == MAX_PIN_BATCH )
+ {
+ if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 )
+ {
+ PERROR("Failed to pin batch of pagetables");
+ return -1;
+ }
+ nr_pins = 0;
+ }
+ }
+
+ if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) )
+ {
+ PERROR("Failed to pin batch of pagetables");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Update details in a guests start_info structure.
+ */
+static int process_start_info(struct xc_sr_context *ctx,
+ vcpu_guest_context_any_t *vcpu)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t pfn, mfn;
+ start_info_any_t *guest_start_info = NULL;
+ int rc = -1;
+
+ pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86.pv.width);
+
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("Start Info pfn %#lx out of range", pfn);
+ goto err;
+ }
+
+ if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+ {
+ ERROR("Start Info pfn %#lx has bad type %u", pfn,
+ (ctx->x86.pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Start Info has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86.pv.width);
+ guest_start_info = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
+ if ( !guest_start_info )
+ {
+ PERROR("Failed to map Start Info at mfn %#lx", mfn);
+ goto err;
+ }
+
+ /* Deal with xenstore stuff */
+ pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86.pv.width);
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("XenStore pfn %#lx out of range", pfn);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("XenStore pfn has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->restore.xenstore_gfn = mfn;
+ SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86.pv.width);
+ SET_FIELD(guest_start_info, store_evtchn,
+ ctx->restore.xenstore_evtchn, ctx->x86.pv.width);
+
+ /* Deal with console stuff */
+ pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86.pv.width);
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("Console pfn %#lx out of range", pfn);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Console pfn has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->restore.console_gfn = mfn;
+ SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86.pv.width);
+ SET_FIELD(guest_start_info, console.domU.evtchn,
+ ctx->restore.console_evtchn, ctx->x86.pv.width);
+
+ /* Set other information */
+ SET_FIELD(guest_start_info, nr_pages,
+ ctx->x86.pv.max_pfn + 1, ctx->x86.pv.width);
+ SET_FIELD(guest_start_info, shared_info,
+ ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86.pv.width);
+ SET_FIELD(guest_start_info, flags, 0, ctx->x86.pv.width);
+
+ rc = 0;
+
+ err:
+ if ( guest_start_info )
+ munmap(guest_start_info, PAGE_SIZE);
+
+ return rc;
+}
+
+/*
+ * Process one stashed vcpu worth of basic state and send to Xen.
+ */
+static int process_vcpu_basic(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ vcpu_guest_context_any_t *vcpu = ctx->x86.pv.restore.vcpus[vcpuid].basic.ptr;
+ xen_pfn_t pfn, mfn;
+ unsigned int i, gdt_count;
+ int rc = -1;
+
+ /* Vcpu 0 is special: Convert the suspend record to an mfn. */
+ if ( vcpuid == 0 )
+ {
+ rc = process_start_info(ctx, vcpu);
+ if ( rc )
+ return rc;
+ rc = -1;
+ }
+
+ SET_FIELD(vcpu, flags,
+ GET_FIELD(vcpu, flags, ctx->x86.pv.width) | VGCF_online,
+ ctx->x86.pv.width);
+
+ gdt_count = GET_FIELD(vcpu, gdt_ents, ctx->x86.pv.width);
+ if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
+ {
+ ERROR("GDT entry count (%u) out of range (max %u)",
+ gdt_count, FIRST_RESERVED_GDT_ENTRY);
+ errno = ERANGE;
+ goto err;
+ }
+ gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
+
+ /* Convert GDT frames to mfns. */
+ for ( i = 0; i < gdt_count; ++i )
+ {
+ pfn = GET_FIELD(vcpu, gdt_frames[i], ctx->x86.pv.width);
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
+ goto err;
+ }
+
+ if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
+ {
+ ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn,
+ (ctx->x86.pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("GDT frame %u has bad mfn", i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(vcpu, gdt_frames[i], mfn, ctx->x86.pv.width);
+ }
+
+ /* Convert CR3 to an mfn. */
+ pfn = cr3_to_mfn(ctx, GET_FIELD(vcpu, ctrlreg[3], ctx->x86.pv.width));
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("cr3 (pfn %#lx) out of range", pfn);
+ goto err;
+ }
+
+ if ( (ctx->x86.pv.restore.pfn_types[pfn] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+ (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+ {
+ ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn,
+ (ctx->x86.pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+ ctx->x86.pv.levels);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("cr3 has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ SET_FIELD(vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86.pv.width);
+
+ /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */
+ if ( ctx->x86.pv.levels == 4 && (vcpu->x64.ctrlreg[1] & 1) )
+ {
+ pfn = vcpu->x64.ctrlreg[1] >> PAGE_SHIFT;
+
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("cr1 (pfn %#lx) out of range", pfn);
+ goto err;
+ }
+
+ if ( (ctx->x86.pv.restore.pfn_types[pfn] &
+ XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+ (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+ {
+ ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn,
+ (ctx->x86.pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+ ctx->x86.pv.levels);
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("cr1 has bad mfn");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ vcpu->x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
+ }
+
+ if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, vcpu) )
+ {
+ PERROR("Failed to set vcpu%u's basic info", vcpuid);
+ goto err;
+ }
+
+ rc = 0;
+
+ err:
+ return rc;
+}
+
+/*
+ * Process one stashed vcpu worth of extended state and send to Xen.
+ */
+static int process_vcpu_extended(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86.pv.restore.vcpus[vcpuid];
+ DECLARE_DOMCTL;
+
+ domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
+ domctl.domain = ctx->domid;
+ memcpy(&domctl.u.ext_vcpucontext, vcpu->extd.ptr, vcpu->extd.size);
+
+ if ( xc_domctl(xch, &domctl) != 0 )
+ {
+ PERROR("Failed to set vcpu%u's extended info", vcpuid);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Process one stashed vcpu worth of xsave state and send to Xen.
+ */
+static int process_vcpu_xsave(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86.pv.restore.vcpus[vcpuid];
+ int rc;
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsave.size);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
+ vcpu->xsave.size);
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_setvcpuextstate;
+ domctl.domain = ctx->domid;
+ domctl.u.vcpuextstate.vcpu = vcpuid;
+ domctl.u.vcpuextstate.size = vcpu->xsave.size;
+ set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+
+ memcpy(buffer, vcpu->xsave.ptr, vcpu->xsave.size);
+
+ rc = xc_domctl(xch, &domctl);
+ if ( rc )
+ PERROR("Failed to set vcpu%u's xsave info", vcpuid);
+
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * Process one stashed vcpu worth of msr state and send to Xen.
+ */
+static int process_vcpu_msrs(struct xc_sr_context *ctx,
+ unsigned int vcpuid)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86.pv.restore.vcpus[vcpuid];
+ int rc;
+ DECLARE_DOMCTL;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msr.size);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
+ vcpu->msr.size);
+ return -1;
+ }
+
+ domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
+ domctl.domain = ctx->domid;
+ domctl.u.vcpu_msrs.vcpu = vcpuid;
+ domctl.u.vcpu_msrs.msr_count = vcpu->msr.size / sizeof(xen_domctl_vcpu_msr_t);
+ set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+
+ memcpy(buffer, vcpu->msr.ptr, vcpu->msr.size);
+
+ rc = xc_domctl(xch, &domctl);
+ if ( rc )
+ PERROR("Failed to set vcpu%u's msrs", vcpuid);
+
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * Process all stashed vcpu context and send to Xen.
+ */
+static int update_vcpu_context(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu;
+ unsigned int i;
+ int rc = 0;
+
+ for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
+ {
+ vcpu = &ctx->x86.pv.restore.vcpus[i];
+
+ if ( vcpu->basic.ptr )
+ {
+ rc = process_vcpu_basic(ctx, i);
+ if ( rc )
+ return rc;
+ }
+ else if ( i == 0 )
+ {
+ ERROR("Sender didn't send vcpu0's basic state");
+ return -1;
+ }
+
+ if ( vcpu->extd.ptr )
+ {
+ rc = process_vcpu_extended(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ if ( vcpu->xsave.ptr )
+ {
+ rc = process_vcpu_xsave(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ if ( vcpu->msr.ptr )
+ {
+ rc = process_vcpu_msrs(ctx, i);
+ if ( rc )
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * Copy the p2m, which has been constructed locally as memory was allocated,
+ * over the guest's p2m, so the guest can find its memory again on resume.
+ */
+static int update_guest_p2m(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t mfn, pfn, *guest_p2m = NULL;
+ unsigned int i;
+ int rc = -1;
+
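+ /* First pass: validate each p2m frame's pfn and convert it to an mfn so the frames can be mapped below. */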
+ for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
+ {
+ pfn = ctx->x86.pv.p2m_pfns[i];
+
+ if ( pfn > ctx->x86.pv.max_pfn )
+ {
+ ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
+ pfn, i);
+ goto err;
+ }
+
+ if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+ {
+ ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
+ (ctx->x86.pv.restore.pfn_types[pfn] >>
+ XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+ goto err;
+ }
+
+ mfn = pfn_to_mfn(ctx, pfn);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("p2m_frame_list[%u] has bad mfn", i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ goto err;
+ }
+
+ ctx->x86.pv.p2m_pfns[i] = mfn;
+ }
+
+ guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
+ ctx->x86.pv.p2m_pfns,
+ ctx->x86.pv.p2m_frames);
+ if ( !guest_p2m )
+ {
+ PERROR("Failed to map p2m frames");
+ goto err;
+ }
+
+ memcpy(guest_p2m, ctx->x86.pv.p2m,
+ (ctx->x86.pv.max_pfn + 1) * ctx->x86.pv.width);
+ rc = 0;
+
+ err:
+ if ( guest_p2m )
+ munmap(guest_p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);
+
+ return rc;
+}
+
+/*
+ * The valid width/pt_levels values in X86_PV_INFO are inextricably linked.
+ * Cross-check the legitimate combinations.
+ */
+static bool valid_x86_pv_info_combination(
+ const struct xc_sr_rec_x86_pv_info *info)
+{
+ switch ( info->guest_width )
+ {
+ case 4: return info->pt_levels == 3;
+ case 8: return info->pt_levels == 4;
+ default: return false;
+ }
+}
+
+/*
+ * Process an X86_PV_INFO record.
+ */
+static int handle_x86_pv_info(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_info *info = rec->data;
+
+ if ( ctx->x86.pv.restore.seen_pv_info )
+ {
+ ERROR("Already received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length < sizeof(*info) )
+ {
+ ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
+ rec->length, sizeof(*info));
+ return -1;
+ }
+
+ if ( !valid_x86_pv_info_combination(info) )
+ {
+ ERROR("Invalid X86_PV_INFO combination: width %u, pt_levels %u",
+ info->guest_width, info->pt_levels);
+ return -1;
+ }
+
+ /*
+ * PV domains default to native width. For an incoming compat domain, we
+ * will typically be the first entity to inform Xen.
+ */
+ if ( info->guest_width != ctx->x86.pv.width )
+ {
+ struct xen_domctl domctl = {
+ .domain = ctx->domid,
+ .cmd = XEN_DOMCTL_set_address_size,
+ .u.address_size.size = info->guest_width * 8,
+ };
+ int rc = do_domctl(xch, &domctl);
+
+ if ( rc != 0 )
+ {
+ ERROR("Failed to update d%d address size to %u",
+ ctx->domid, info->guest_width * 8);
+ return -1;
+ }
+
+ /* Domain's information changed, better to refresh. */
+ rc = x86_pv_domain_info(ctx);
+ if ( rc != 0 )
+ {
+ ERROR("Unable to refresh guest information");
+ return -1;
+ }
+ }
+
+ /* Sanity check (possibly new) domain settings. */
+ if ( (info->guest_width != ctx->x86.pv.width) ||
+ (info->pt_levels != ctx->x86.pv.levels) )
+ {
+ ERROR("X86_PV_INFO width/pt_levels settings %u/%u mismatch with d%d %u/%u",
+ info->guest_width, info->pt_levels, ctx->domid,
+ ctx->x86.pv.width, ctx->x86.pv.levels);
+ return -1;
+ }
+
+ ctx->x86.pv.restore.seen_pv_info = true;
+ return 0;
+}
+
+/*
+ * Process an X86_PV_P2M_FRAMES record. Takes care of expanding the local p2m
+ * state if needed.
+ */
+static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
+ unsigned int start, end, x, fpp = PAGE_SIZE / ctx->x86.pv.width;
+ int rc;
+
+ /* v2 compat. Infer the position of STATIC_DATA_END. */
+ if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
+ {
+ rc = handle_static_data_end(ctx);
+ if ( rc )
+ {
+ ERROR("Inferred STATIC_DATA_END record failed");
+ return rc;
+ }
+ }
+
+ if ( !ctx->restore.seen_static_data_end )
+ {
+ ERROR("No STATIC_DATA_END seen");
+ return -1;
+ }
+
+ if ( !ctx->x86.pv.restore.seen_pv_info )
+ {
+ ERROR("Not yet received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length < sizeof(*data) + sizeof(uint64_t) )
+ {
+ ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
+ rec->length, sizeof(*data) + sizeof(uint64_t));
+ return -1;
+ }
+
+ if ( data->start_pfn > data->end_pfn )
+ {
+ ERROR("Start pfn in stream (%#x) exceeds end pfn (%#x)",
+ data->start_pfn, data->end_pfn);
+ return -1;
+ }
+
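+ /* fpp is the number of p2m entries per frame; convert the stream's pfn range into indices into the p2m frame list. */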
+ start = data->start_pfn / fpp;
+ end = data->end_pfn / fpp + 1;
+
+ if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
+ {
+ ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
+ ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
+ data->start_pfn, data->end_pfn, rec->length,
+ sizeof(*data), end, start, sizeof(uint64_t));
+ return -1;
+ }
+
+ if ( data->end_pfn > ctx->x86.pv.max_pfn )
+ {
+ rc = expand_p2m(ctx, data->end_pfn);
+ if ( rc )
+ return rc;
+ }
+
+ for ( x = 0; x < (end - start); ++x )
+ ctx->x86.pv.p2m_pfns[start + x] = data->p2m_pfns[x];
+
+ return 0;
+}
+
+/*
+ * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
+ * The blobs are all stashed to one side as they need to be deferred until the
+ * very end of the stream, rather than being sent to Xen at the point they
+ * arrive in the stream. It performs all pre-hypercall size validation.
+ */
+static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
+ struct xc_sr_x86_pv_restore_vcpu *vcpu;
+ const char *rec_name;
+ size_t blobsz;
+ struct xc_sr_blob *blob = NULL;
+ int rc = -1;
+
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ rec_name = "X86_PV_VCPU_BASIC";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ rec_name = "X86_PV_VCPU_EXTENDED";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_XSAVE:
+ rec_name = "X86_PV_VCPU_XSAVE";
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ rec_name = "X86_PV_VCPU_MSRS";
+ break;
+
+ default:
+ ERROR("Unrecognised vcpu blob record %s (%u)",
+ rec_type_to_str(rec->type), rec->type);
+ goto out;
+ }
+
+ /* Confirm that there is a complete header. */
+ if ( rec->length < sizeof(*vhdr) )
+ {
+ ERROR("%s record truncated: length %u, header size %zu",
+ rec_name, rec->length, sizeof(*vhdr));
+ goto out;
+ }
+
+ blobsz = rec->length - sizeof(*vhdr);
+
+ /*
+ * Tolerate empty records. Older sending sides used to accidentally
+ * generate them.
+ */
+ if ( blobsz == 0 )
+ {
+ DBGPRINTF("Skipping empty %s record for vcpu %u",
+ rec_type_to_str(rec->type), vhdr->vcpu_id);
+ rc = 0;
+ goto out;
+ }
+
+ /* Check that the vcpu id is within range. */
+ if ( vhdr->vcpu_id >= ctx->x86.pv.restore.nr_vcpus )
+ {
+ ERROR("%s record vcpu_id (%u) exceeds domain max (%u)",
+ rec_name, vhdr->vcpu_id, ctx->x86.pv.restore.nr_vcpus - 1);
+ goto out;
+ }
+
+ vcpu = &ctx->x86.pv.restore.vcpus[vhdr->vcpu_id];
+
+ /* Further per-record checks, where possible. */
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ {
+ size_t vcpusz = ctx->x86.pv.width == 8 ?
+ sizeof(vcpu_guest_context_x86_64_t) :
+ sizeof(vcpu_guest_context_x86_32_t);
+
+ if ( blobsz != vcpusz )
+ {
+ ERROR("%s record wrong size: expected %zu, got %u",
+ rec_name, sizeof(*vhdr) + vcpusz, rec->length);
+ goto out;
+ }
+ blob = &vcpu->basic;
+ break;
+ }
+
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ if ( blobsz > 128 )
+ {
+ ERROR("%s record too long: max %zu, got %u",
+ rec_name, sizeof(*vhdr) + 128, rec->length);
+ goto out;
+ }
+ blob = &vcpu->extd;
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_XSAVE:
+ if ( blobsz < 16 )
+ {
+ ERROR("%s record too short: min %zu, got %u",
+ rec_name, sizeof(*vhdr) + 16, rec->length);
+ goto out;
+ }
+ blob = &vcpu->xsave;
+ break;
+
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
+ {
+ ERROR("%s record payload size %zu expected to be a multiple of %zu",
+ rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t));
+ goto out;
+ }
+ blob = &vcpu->msr;
+ break;
+ }
+
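+ /* update_blob() stores a private copy of the data, replacing any earlier instance of the same record. */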
+ rc = update_blob(blob, vhdr->context, blobsz);
+ if ( rc )
+ ERROR("Unable to allocate %zu bytes for vcpu%u %s blob",
+ blobsz, vhdr->vcpu_id, rec_name);
+
+ out:
+ return rc;
+}
+
+/*
+ * Process a SHARED_INFO record from the stream.
+ */
+static int handle_shared_info(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int i;
+ int rc = -1;
+ shared_info_any_t *guest_shinfo = NULL;
+ const shared_info_any_t *old_shinfo = rec->data;
+
+ if ( !ctx->x86.pv.restore.seen_pv_info )
+ {
+ ERROR("Not yet received X86_PV_INFO record");
+ return -1;
+ }
+
+ if ( rec->length != PAGE_SIZE )
+ {
+ ERROR("X86_PV_SHARED_INFO record wrong size: length %u"
+ ", expected 4096", rec->length);
+ goto err;
+ }
+
+ guest_shinfo = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ ctx->dominfo.shared_info_frame);
+ if ( !guest_shinfo )
+ {
+ PERROR("Failed to map Shared Info at mfn %#lx",
+ ctx->dominfo.shared_info_frame);
+ goto err;
+ }
+
+ MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86.pv.width);
+ MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86.pv.width);
+
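+ /* The frame-list-list pointer refers to sender-side mfns; zap it so the guest rebuilds it on resume. */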
+ SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list,
+ 0, ctx->x86.pv.width);
+
+ MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86.pv.width);
+ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+ SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel,
+ 0, ctx->x86.pv.width);
+
+ MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86.pv.width);
+
+ rc = 0;
+
+ err:
+ if ( guest_shinfo )
+ munmap(guest_shinfo, PAGE_SIZE);
+
+ return rc;
+}
+
+/* restore_ops function. */
+static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ return pfn <= ctx->x86.pv.max_pfn;
+}
+
+/* restore_ops function. */
+static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ unsigned long type)
+{
+ assert(pfn <= ctx->x86.pv.max_pfn);
+
+ ctx->x86.pv.restore.pfn_types[pfn] = type;
+}
+
+/* restore_ops function. */
+static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
+ xen_pfn_t mfn)
+{
+ assert(pfn <= ctx->x86.pv.max_pfn);
+
+ if ( ctx->x86.pv.width == sizeof(uint64_t) )
+ /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */
+ ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
+ else
+ /* 32 bit guest. Can truncate INVALID_MFN for 64 bit toolstacks. */
+ ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn;
+}
+
+/*
+ * restore_ops function. Convert pfns back to mfns in pagetables. Possibly
+ * needs to populate new frames if a PTE is found referring to a frame which
+ * hasn't yet been seen from PAGE_DATA records.
+ */
+static int x86_pv_localise_page(struct xc_sr_context *ctx,
+ uint32_t type, void *page)
+{
+ xc_interface *xch = ctx->xch;
+ uint64_t *table = page;
+ uint64_t pte;
+ unsigned int i, to_populate;
+ xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))];
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ /* Only page tables need localisation. */
+ if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+ return 0;
+
+ /* Check to see whether we need to populate any new frames. */
+ for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ pte = table[i];
+
+ if ( pte & _PAGE_PRESENT )
+ {
+ xen_pfn_t pfn = pte_to_frame(pte);
+
+#ifdef __i386__
+ if ( pfn == INVALID_MFN )
+ {
+ ERROR("PTE truncation detected. L%u[%u] = %016"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ return -1;
+ }
+#endif
+
+ if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
+ pfns[to_populate++] = pfn;
+ }
+ }
+
+ if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
+ return -1;
+
+ for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ pte = table[i];
+
+ if ( pte & _PAGE_PRESENT )
+ {
+ xen_pfn_t mfn, pfn;
+
+ pfn = pte_to_frame(pte);
+ mfn = pfn_to_mfn(ctx, pfn);
+
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for L%u[%u] - pte %"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ return -1;
+ }
+
+ table[i] = merge_pte(pte, mfn);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * restore_ops function. Confirm that the incoming stream matches the type of
+ * domain we are attempting to restore into.
+ */
+static int x86_pv_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
+ {
+ ERROR("Unable to restore %s domain into an x86_pv domain",
+ dhdr_type_to_str(ctx->restore.guest_type));
+ return -1;
+ }
+
+ if ( ctx->restore.guest_page_size != PAGE_SIZE )
+ {
+ ERROR("Invalid page size %d for x86_pv domains",
+ ctx->restore.guest_page_size);
+ return -1;
+ }
+
+ rc = x86_pv_domain_info(ctx);
+ if ( rc )
+ return rc;
+
+ ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
+ ctx->x86.pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu),
+ ctx->x86.pv.restore.nr_vcpus);
+ if ( !ctx->x86.pv.restore.vcpus )
+ {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ rc = x86_pv_map_m2p(ctx);
+ if ( rc )
+ return rc;
+
+ return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_process_record(struct xc_sr_context *ctx,
+ struct xc_sr_record *rec)
+{
+ switch ( rec->type )
+ {
+ case REC_TYPE_X86_PV_INFO:
+ return handle_x86_pv_info(ctx, rec);
+
+ case REC_TYPE_X86_PV_P2M_FRAMES:
+ return handle_x86_pv_p2m_frames(ctx, rec);
+
+ case REC_TYPE_X86_PV_VCPU_BASIC:
+ case REC_TYPE_X86_PV_VCPU_EXTENDED:
+ case REC_TYPE_X86_PV_VCPU_XSAVE:
+ case REC_TYPE_X86_PV_VCPU_MSRS:
+ return handle_x86_pv_vcpu_blob(ctx, rec);
+
+ case REC_TYPE_SHARED_INFO:
+ return handle_shared_info(ctx, rec);
+
+ case REC_TYPE_X86_TSC_INFO:
+ return handle_x86_tsc_info(ctx, rec);
+
+ case REC_TYPE_X86_CPUID_POLICY:
+ return handle_x86_cpuid_policy(ctx, rec);
+
+ case REC_TYPE_X86_MSR_POLICY:
+ return handle_x86_msr_policy(ctx, rec);
+
+ default:
+ return RECORD_NOT_PROCESSED;
+ }
+}
+
+/*
+ * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
+ * rewrite the p2m and seed the grant table.
+ */
+static int x86_pv_stream_complete(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = update_vcpu_context(ctx);
+ if ( rc )
+ return rc;
+
+ rc = pin_pagetables(ctx);
+ if ( rc )
+ return rc;
+
+ rc = update_guest_p2m(ctx);
+ if ( rc )
+ return rc;
+
+ rc = xc_dom_gnttab_seed(xch, ctx->domid, false,
+ ctx->restore.console_gfn,
+ ctx->restore.xenstore_gfn,
+ ctx->restore.console_domid,
+ ctx->restore.xenstore_domid);
+ if ( rc )
+ {
+ PERROR("Failed to seed grant table");
+ return rc;
+ }
+
+ return rc;
+}
+
+/*
+ * restore_ops function.
+ */
+static int x86_pv_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86.pv.p2m);
+ free(ctx->x86.pv.p2m_pfns);
+
+ if ( ctx->x86.pv.restore.vcpus )
+ {
+ unsigned int i;
+
+ for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
+ {
+ struct xc_sr_x86_pv_restore_vcpu *vcpu =
+ &ctx->x86.pv.restore.vcpus[i];
+
+ free(vcpu->basic.ptr);
+ free(vcpu->extd.ptr);
+ free(vcpu->xsave.ptr);
+ free(vcpu->msr.ptr);
+ }
+
+ free(ctx->x86.pv.restore.vcpus);
+ }
+
+ free(ctx->x86.pv.restore.pfn_types);
+
+ if ( ctx->x86.pv.m2p )
+ munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
+
+ free(ctx->x86.restore.cpuid.ptr);
+ free(ctx->x86.restore.msr.ptr);
+
+ return 0;
+}
+
+struct xc_sr_restore_ops restore_ops_x86_pv =
+{
+ .pfn_is_valid = x86_pv_pfn_is_valid,
+ .pfn_to_gfn = pfn_to_mfn,
+ .set_page_type = x86_pv_set_page_type,
+ .set_gfn = x86_pv_set_gfn,
+ .localise_page = x86_pv_localise_page,
+ .setup = x86_pv_setup,
+ .process_record = x86_pv_process_record,
+ .static_data_complete = x86_static_data_complete,
+ .stream_complete = x86_pv_stream_complete,
+ .cleanup = x86_pv_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "xg_sr_common.h"
+
+/*
+ * Writes an Image header and Domain header into the stream.
+ */
+static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+ xc_interface *xch = ctx->xch;
+ int32_t xen_version = xc_version(xch, XENVER_version, NULL);
+ struct xc_sr_ihdr ihdr = {
+ .marker = IHDR_MARKER,
+ .id = htonl(IHDR_ID),
+ .version = htonl(3),
+ .options = htons(IHDR_OPT_LITTLE_ENDIAN),
+ };
+ struct xc_sr_dhdr dhdr = {
+ .type = guest_type,
+ .page_shift = XC_PAGE_SHIFT,
+ .xen_major = (xen_version >> 16) & 0xffff,
+ .xen_minor = (xen_version) & 0xffff,
+ };
+
+ if ( xen_version < 0 )
+ {
+ PERROR("Unable to obtain Xen Version");
+ return -1;
+ }
+
+ if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+ {
+ PERROR("Unable to write Image Header to stream");
+ return -1;
+ }
+
+ if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+ {
+ PERROR("Unable to write Domain Header to stream");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes an END record into the stream.
+ */
+static int write_end_record(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record end = { .type = REC_TYPE_END };
+
+ return write_record(ctx, &end);
+}
+
+/*
+ * Writes a STATIC_DATA_END record into the stream.
+ */
+static int write_static_data_end_record(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record end = { .type = REC_TYPE_STATIC_DATA_END };
+
+ return write_record(ctx, &end);
+}
+
+/*
+ * Writes a CHECKPOINT record into the stream.
+ */
+static int write_checkpoint_record(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record checkpoint = { .type = REC_TYPE_CHECKPOINT };
+
+ return write_record(ctx, &checkpoint);
+}
+
+/*
+ * Writes a batch of memory as a PAGE_DATA record into the stream. The batch
+ * is constructed in ctx->save.batch_pfns.
+ *
+ * This function:
+ * - gets the types for each pfn in the batch.
+ * - for each pfn with real data:
+ * - maps and attempts to normalise the pages.
+ * - constructs and writes a PAGE_DATA record into the stream.
+ */
+static int write_batch(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t *mfns = NULL, *types = NULL;
+ void *guest_mapping = NULL;
+ void **guest_data = NULL;
+ void **local_pages = NULL;
+ int *errors = NULL, rc = -1;
+ unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
+ unsigned int nr_pfns = ctx->save.nr_batch_pfns;
+ void *page, *orig_page;
+ uint64_t *rec_pfns = NULL;
+ struct iovec *iov = NULL;
+ int iovcnt = 0;
+ struct xc_sr_rec_page_data_header hdr = { 0 };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_PAGE_DATA,
+ };
+
+ assert(nr_pfns != 0);
+
+ /* Mfns of the batch pfns. */
+ mfns = malloc(nr_pfns * sizeof(*mfns));
+ /* Types of the batch pfns. */
+ types = malloc(nr_pfns * sizeof(*types));
+ /* Errors from attempting to map the gfns. */
+ errors = malloc(nr_pfns * sizeof(*errors));
+ /* Pointers to page data to send. Mapped gfns or local allocations. */
+ guest_data = calloc(nr_pfns, sizeof(*guest_data));
+ /* Pointers to locally allocated pages. Need freeing. */
+ local_pages = calloc(nr_pfns, sizeof(*local_pages));
+ /* iovec[] for writev(). */
+ iov = malloc((nr_pfns + 4) * sizeof(*iov));
+
+ if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
+ {
+ ERROR("Unable to allocate arrays for a batch of %u pages",
+ nr_pfns);
+ goto err;
+ }
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
+ ctx->save.batch_pfns[i]);
+
+ /* Likely a ballooned page. */
+ if ( mfns[i] == INVALID_MFN )
+ {
+ set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+ ++ctx->save.nr_deferred_pages;
+ }
+ }
+
+ rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
+ if ( rc )
+ {
+ PERROR("Failed to get types for pfn batch");
+ goto err;
+ }
+ rc = -1;
+
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ case XEN_DOMCTL_PFINFO_XTAB:
+ continue;
+ }
+
+ mfns[nr_pages++] = mfns[i];
+ }
+
+ if ( nr_pages > 0 )
+ {
+ guest_mapping = xenforeignmemory_map(
+ xch->fmem, ctx->domid, PROT_READ, nr_pages, mfns, errors);
+ if ( !guest_mapping )
+ {
+ PERROR("Failed to map guest pages");
+ goto err;
+ }
+ nr_pages_mapped = nr_pages;
+
+ for ( i = 0, p = 0; i < nr_pfns; ++i )
+ {
+ switch ( types[i] )
+ {
+ case XEN_DOMCTL_PFINFO_BROKEN:
+ case XEN_DOMCTL_PFINFO_XALLOC:
+ case XEN_DOMCTL_PFINFO_XTAB:
+ continue;
+ }
+
+ if ( errors[p] )
+ {
+ ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d",
+ ctx->save.batch_pfns[i], mfns[p], errors[p]);
+ goto err;
+ }
+
+ orig_page = page = guest_mapping + (p * PAGE_SIZE);
+ rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
+
+ if ( orig_page != page )
+ local_pages[i] = page;
+
+ if ( rc )
+ {
+ if ( rc == -1 && errno == EAGAIN )
+ {
+ set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+ ++ctx->save.nr_deferred_pages;
+ types[i] = XEN_DOMCTL_PFINFO_XTAB;
+ --nr_pages;
+ }
+ else
+ goto err;
+ }
+ else
+ guest_data[i] = page;
+
+ rc = -1;
+ ++p;
+ }
+ }
+
+ rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
+ if ( !rec_pfns )
+ {
+ ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
+ nr_pfns * sizeof(*rec_pfns));
+ goto err;
+ }
+
+ hdr.count = nr_pfns;
+
+ rec.length = sizeof(hdr);
+ rec.length += nr_pfns * sizeof(*rec_pfns);
+ rec.length += nr_pages * PAGE_SIZE;
+
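+ /* Each entry encodes the page type in the upper 32 bits and the pfn in the lower 32. */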
+ for ( i = 0; i < nr_pfns; ++i )
+ rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
+
+ iov[0].iov_base = &rec.type;
+ iov[0].iov_len = sizeof(rec.type);
+
+ iov[1].iov_base = &rec.length;
+ iov[1].iov_len = sizeof(rec.length);
+
+ iov[2].iov_base = &hdr;
+ iov[2].iov_len = sizeof(hdr);
+
+ iov[3].iov_base = rec_pfns;
+ iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+
+ iovcnt = 4;
+
+ if ( nr_pages )
+ {
+ for ( i = 0; i < nr_pfns; ++i )
+ {
+ if ( guest_data[i] )
+ {
+ iov[iovcnt].iov_base = guest_data[i];
+ iov[iovcnt].iov_len = PAGE_SIZE;
+ iovcnt++;
+ --nr_pages;
+ }
+ }
+ }
+
+ if ( writev_exact(ctx->fd, iov, iovcnt) )
+ {
+ PERROR("Failed to write page data to stream");
+ goto err;
+ }
+
+ /* Sanity check we have sent all the pages we expected to. */
+ assert(nr_pages == 0);
+ rc = ctx->save.nr_batch_pfns = 0;
+
+ err:
+ free(rec_pfns);
+ if ( guest_mapping )
+ xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped);
+ for ( i = 0; local_pages && i < nr_pfns; ++i )
+ free(local_pages[i]);
+ free(iov);
+ free(local_pages);
+ free(guest_data);
+ free(errors);
+ free(types);
+ free(mfns);
+
+ return rc;
+}
+
+/*
+ * Flush a batch of pfns into the stream.
+ */
+static int flush_batch(struct xc_sr_context *ctx)
+{
+ int rc = 0;
+
+ if ( ctx->save.nr_batch_pfns == 0 )
+ return rc;
+
+ rc = write_batch(ctx);
+
+ if ( !rc )
+ {
+ VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
+ MAX_BATCH_SIZE *
+ sizeof(*ctx->save.batch_pfns));
+ }
+
+ return rc;
+}
+
+/*
+ * Add a single pfn to the batch, flushing the batch if full.
+ */
+static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+ int rc = 0;
+
+ if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
+ rc = flush_batch(ctx);
+
+ if ( rc == 0 )
+ ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
+
+ return rc;
+}
+
+/*
+ * Pause/suspend the domain, and refresh ctx->dominfo if required.
+ */
+static int suspend_domain(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ /* TODO: Properly specify the return value from this callback. All
+ * implementations currently appear to return 1 for success, whereas
+ * the legacy code checks for != 0. */
+ int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data);
+
+ if ( cb_rc == 0 )
+ {
+ ERROR("save callback suspend() failed: %d", cb_rc);
+ return -1;
+ }
+
+ /* Refresh domain information. */
+ if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
+ (ctx->dominfo.domid != ctx->domid) )
+ {
+ PERROR("Unable to refresh domain information");
+ return -1;
+ }
+
+ /* Confirm the domain has actually been paused. */
+ if ( !ctx->dominfo.shutdown ||
+ (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+ {
+ ERROR("Domain has not been suspended: shutdown %d, reason %d",
+ ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason);
+ return -1;
+ }
+
+ xc_report_progress_single(xch, "Domain now suspended");
+
+ return 0;
+}
+
+/*
+ * Send a subset of pages in the guest's p2m, according to the dirty bitmap.
+ * Used for each subsequent iteration of the live migration loop.
+ *
+ * Bitmap is bounded by p2m_size.
+ */
+static int send_dirty_pages(struct xc_sr_context *ctx,
+ unsigned long entries)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t p;
+ unsigned long written;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
+ {
+ if ( !test_bit(p, dirty_bitmap) )
+ continue;
+
+ rc = add_to_batch(ctx, p);
+ if ( rc )
+ return rc;
+
+ /* Update progress every 4MB worth of memory sent. */
+ if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
+ xc_report_progress_step(xch, written, entries);
+
+ ++written;
+ }
+
+ rc = flush_batch(ctx);
+ if ( rc )
+ return rc;
+
+ if ( written > entries )
+ DPRINTF("Bitmap contained more entries than expected...");
+
+ xc_report_progress_step(xch, entries, entries);
+
+ return ctx->save.ops.check_vm_state(ctx);
+}
+
+/*
+ * Send all pages in the guest's p2m. Used as the first iteration of the live
+ * migration loop, and for a non-live save.
+ */
+static int send_all_pages(struct xc_sr_context *ctx)
+{
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ bitmap_set(dirty_bitmap, ctx->save.p2m_size);
+
+ return send_dirty_pages(ctx, ctx->save.p2m_size);
+}
+
+static int enable_logdirty(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int on1 = 0, off = 0, on2 = 0;
+ int rc;
+
+ /* This juggling is required if logdirty is already enabled for VRAM tracking: disable it and re-enable for migration use. */
+ rc = xc_shadow_control(xch, ctx->domid,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ {
+ on1 = errno;
+ rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ off = errno;
+ else
+ {
+ rc = xc_shadow_control(xch, ctx->domid,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL);
+ if ( rc < 0 )
+ on2 = errno;
+ }
+ if ( rc < 0 )
+ {
+ PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int update_progress_string(struct xc_sr_context *ctx, char **str)
+{
+ xc_interface *xch = ctx->xch;
+ char *new_str = NULL;
+ unsigned int iter = ctx->save.stats.iteration;
+
+ if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 )
+ {
+ PERROR("Unable to allocate new progress string");
+ return -1;
+ }
+
+ free(*str);
+ *str = new_str;
+
+ xc_set_progress_prefix(xch, *str);
+ return 0;
+}
+
+/*
+ * This is the live migration precopy policy - it's called periodically during
+ * the precopy phase of live migrations, and is responsible for deciding when
+ * the precopy phase should terminate and what should be done next.
+ *
+ * The policy implemented here behaves identically to the policy previously
+ * hard-coded into xc_domain_save() - it proceeds to the stop-and-copy phase of
+ * the live migration when there are either fewer than 50 dirty pages, or at
+ * least 5 precopy rounds have completed.
+ */
+#define SPP_MAX_ITERATIONS 5
+#define SPP_TARGET_DIRTY_COUNT 50
+
+static int simple_precopy_policy(struct precopy_stats stats, void *user)
+{
+ return ((stats.dirty_count >= 0 &&
+ stats.dirty_count < SPP_TARGET_DIRTY_COUNT) ||
+ stats.iteration >= SPP_MAX_ITERATIONS)
+ ? XGS_POLICY_STOP_AND_COPY
+ : XGS_POLICY_CONTINUE_PRECOPY;
+}
+
+/*
+ * Send memory while guest is running.
+ */
+static int send_memory_live(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ char *progress_str = NULL;
+ unsigned int x = 0;
+ int rc;
+ int policy_decision;
+
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ precopy_policy_t precopy_policy = ctx->save.callbacks->precopy_policy;
+ void *data = ctx->save.callbacks->data;
+
+ struct precopy_stats *policy_stats;
+
+ rc = update_progress_string(ctx, &progress_str);
+ if ( rc )
+ goto out;
+
+ ctx->save.stats = (struct precopy_stats){
+ .dirty_count = ctx->save.p2m_size,
+ };
+ policy_stats = &ctx->save.stats;
+
+ if ( precopy_policy == NULL )
+ precopy_policy = simple_precopy_policy;
+
+ bitmap_set(dirty_bitmap, ctx->save.p2m_size);
+
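+ /*
+ * The policy is consulted twice per iteration: before sending (so an
+ * ABORT is honoured immediately) and after the stats update (to decide
+ * whether another round of dirty-bitmap harvesting is worthwhile).
+ */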
+ for ( ; ; )
+ {
+ policy_decision = precopy_policy(*policy_stats, data);
+ x++;
+
+ if ( stats.dirty_count > 0 && policy_decision != XGS_POLICY_ABORT )
+ {
+ rc = update_progress_string(ctx, &progress_str);
+ if ( rc )
+ goto out;
+
+ rc = send_dirty_pages(ctx, stats.dirty_count);
+ if ( rc )
+ goto out;
+ }
+
+ if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
+ break;
+
+ policy_stats->iteration = x;
+ policy_stats->total_written += policy_stats->dirty_count;
+ policy_stats->dirty_count = -1;
+
+ policy_decision = precopy_policy(*policy_stats, data);
+
+ if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
+ break;
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
+ NULL, 0, &stats) != ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ policy_stats->dirty_count = stats.dirty_count;
+ }
+
+ if ( policy_decision == XGS_POLICY_ABORT )
+ {
+ PERROR("Abort precopy loop");
+ rc = -1;
+ goto out;
+ }
+
+ out:
+ xc_set_progress_prefix(xch, NULL);
+ free(progress_str);
+ return rc;
+}
+
+static int colo_merge_secondary_dirty_bitmap(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_record rec = { 0, 0, NULL }; /* rec.data must be NULL if read_record() fails. */
+ uint64_t *pfns = NULL;
+ uint64_t pfn;
+ unsigned int count, i;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ rc = read_record(ctx, ctx->save.recv_fd, &rec);
+ if ( rc )
+ goto err;
+
+ if ( rec.type != REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST )
+ {
+ PERROR("Expect dirty bitmap record, but received %u", rec.type);
+ rc = -1;
+ goto err;
+ }
+
+ if ( rec.length % sizeof(*pfns) )
+ {
+ PERROR("Invalid dirty pfn list record length %u", rec.length);
+ rc = -1;
+ goto err;
+ }
+
+ count = rec.length / sizeof(*pfns);
+ pfns = rec.data;
+
+ for ( i = 0; i < count; i++ )
+ {
+ pfn = pfns[i];
+ if ( pfn > ctx->save.p2m_size )
+ {
+ PERROR("Invalid pfn 0x%" PRIx64, pfn);
+ rc = -1;
+ goto err;
+ }
+
+ set_bit(pfn, dirty_bitmap);
+ }
+
+ rc = 0;
+
+ err:
+ free(rec.data);
+ return rc;
+}
+
+/*
+ * Suspend the domain and send dirty memory.
+ * This is the last iteration of the live migration and the
+ * heart of the checkpointed stream.
+ */
+static int suspend_and_send_dirty(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ char *progress_str = NULL;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ rc = suspend_domain(ctx);
+ if ( rc )
+ goto out;
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+ HYPERCALL_BUFFER(dirty_bitmap), ctx->save.p2m_size,
+ NULL, XEN_DOMCTL_SHADOW_LOGDIRTY_FINAL, &stats) !=
+ ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ if ( ctx->save.live )
+ {
+ rc = update_progress_string(ctx, &progress_str);
+ if ( rc )
+ goto out;
+ }
+ else
+ xc_set_progress_prefix(xch, "Checkpointed save");
+
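+ /* Pages deferred in earlier passes (ballooned or transiently unmappable) are folded into this final send. */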
+ bitmap_or(dirty_bitmap, ctx->save.deferred_pages, ctx->save.p2m_size);
+
+ if ( !ctx->save.live && ctx->stream_type == XC_STREAM_COLO )
+ {
+ rc = colo_merge_secondary_dirty_bitmap(ctx);
+ if ( rc )
+ {
+ PERROR("Failed to get secondary vm's dirty pages");
+ goto out;
+ }
+ }
+
+ rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages);
+ if ( rc )
+ goto out;
+
+ bitmap_clear(ctx->save.deferred_pages, ctx->save.p2m_size);
+ ctx->save.nr_deferred_pages = 0;
+
+ out:
+ xc_set_progress_prefix(xch, NULL);
+ free(progress_str);
+ return rc;
+}
+
+static int verify_frames(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+ int rc;
+ struct xc_sr_record rec = { .type = REC_TYPE_VERIFY };
+
+ DPRINTF("Enabling verify mode");
+
+ rc = write_record(ctx, &rec);
+ if ( rc )
+ goto out;
+
+ xc_set_progress_prefix(xch, "Frames verify");
+ rc = send_all_pages(ctx);
+ if ( rc )
+ goto out;
+
+ if ( xc_shadow_control(
+ xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+ &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
+ NULL, 0, &stats) != ctx->save.p2m_size )
+ {
+ PERROR("Failed to retrieve logdirty bitmap");
+ rc = -1;
+ goto out;
+ }
+
+ DPRINTF(" Further stats: faults %u, dirty %u",
+ stats.fault_count, stats.dirty_count);
+
+ out:
+ return rc;
+}
+
+/*
+ * Send all domain memory. This is the heart of the live migration loop.
+ */
+static int send_domain_memory_live(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = enable_logdirty(ctx);
+ if ( rc )
+ goto out;
+
+ rc = send_memory_live(ctx);
+ if ( rc )
+ goto out;
+
+ rc = suspend_and_send_dirty(ctx);
+ if ( rc )
+ goto out;
+
+ if ( ctx->save.debug && ctx->stream_type != XC_STREAM_PLAIN )
+ {
+ rc = verify_frames(ctx);
+ if ( rc )
+ goto out;
+ }
+
+ out:
+ return rc;
+}
+
+/*
+ * Checkpointed save.
+ */
+static int send_domain_memory_checkpointed(struct xc_sr_context *ctx)
+{
+ return suspend_and_send_dirty(ctx);
+}
+
+/*
+ * Send all domain memory, pausing the domain first. Generally used for
+ * suspend-to-file.
+ */
+static int send_domain_memory_nonlive(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ rc = suspend_domain(ctx);
+ if ( rc )
+ goto err;
+
+ xc_set_progress_prefix(xch, "Frames");
+
+ rc = send_all_pages(ctx);
+ if ( rc )
+ goto err;
+
+ err:
+ return rc;
+}
+
+static int setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ rc = ctx->save.ops.setup(ctx);
+ if ( rc )
+ goto err;
+
+ dirty_bitmap = xc_hypercall_buffer_alloc_pages(
+ xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size)));
+ ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
+ sizeof(*ctx->save.batch_pfns));
+ ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+ if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages )
+ {
+ ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
+ " deferred pages");
+ rc = -1;
+ errno = ENOMEM;
+ goto err;
+ }
+
+ rc = 0;
+
+ err:
+ return rc;
+}
+
+static void cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+ &ctx->save.dirty_bitmap_hbuf);
+
+ xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL);
+
+ if ( ctx->save.ops.cleanup(ctx) )
+ PERROR("Failed to clean up");
+
+ xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+ NRPAGES(bitmap_size(ctx->save.p2m_size)));
+ free(ctx->save.deferred_pages);
+ free(ctx->save.batch_pfns);
+}
+
+/*
+ * Save a domain.
+ */
+static int save(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+ xc_interface *xch = ctx->xch;
+ int rc, saved_rc = 0, saved_errno = 0;
+
+ IPRINTF("Saving domain %d, type %s",
+ ctx->domid, dhdr_type_to_str(guest_type));
+
+ rc = setup(ctx);
+ if ( rc )
+ goto err;
+
+ xc_report_progress_single(xch, "Start of stream");
+
+ rc = write_headers(ctx, guest_type);
+ if ( rc )
+ goto err;
+
+ rc = ctx->save.ops.static_data(ctx);
+ if ( rc )
+ goto err;
+
+ rc = write_static_data_end_record(ctx);
+ if ( rc )
+ goto err;
+
+ rc = ctx->save.ops.start_of_stream(ctx);
+ if ( rc )
+ goto err;
+
+ do {
+ rc = ctx->save.ops.start_of_checkpoint(ctx);
+ if ( rc )
+ goto err;
+
+ rc = ctx->save.ops.check_vm_state(ctx);
+ if ( rc )
+ goto err;
+
+ if ( ctx->save.live )
+ rc = send_domain_memory_live(ctx);
+ else if ( ctx->stream_type != XC_STREAM_PLAIN )
+ rc = send_domain_memory_checkpointed(ctx);
+ else
+ rc = send_domain_memory_nonlive(ctx);
+
+ if ( rc )
+ goto err;
+
+ if ( !ctx->dominfo.shutdown ||
+ (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+ {
+ ERROR("Domain has not been suspended");
+ rc = -1;
+ goto err;
+ }
+
+ rc = ctx->save.ops.end_of_checkpoint(ctx);
+ if ( rc )
+ goto err;
+
+ if ( ctx->stream_type != XC_STREAM_PLAIN )
+ {
+ /*
+ * We have now completed the initial live portion of the checkpoint
+ * process. Therefore switch into periodically sending synchronous
+ * batches of pages.
+ */
+ ctx->save.live = false;
+
+ rc = write_checkpoint_record(ctx);
+ if ( rc )
+ goto err;
+
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ {
+ rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
+ if ( !rc )
+ {
+ rc = -1;
+ goto err;
+ }
+ }
+
+ rc = ctx->save.callbacks->postcopy(ctx->save.callbacks->data);
+ if ( rc <= 0 )
+ goto err;
+
+ if ( ctx->stream_type == XC_STREAM_COLO )
+ {
+ rc = ctx->save.callbacks->wait_checkpoint(
+ ctx->save.callbacks->data);
+ if ( rc <= 0 )
+ goto err;
+ }
+ else if ( ctx->stream_type == XC_STREAM_REMUS )
+ {
+ rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
+ if ( rc <= 0 )
+ goto err;
+ }
+ else
+ {
+ ERROR("Unknown checkpointed stream");
+ rc = -1;
+ goto err;
+ }
+ }
+ } while ( ctx->stream_type != XC_STREAM_PLAIN );
+
+ xc_report_progress_single(xch, "End of stream");
+
+ rc = write_end_record(ctx);
+ if ( rc )
+ goto err;
+
+ xc_report_progress_single(xch, "Complete");
+ goto done;
+
+ err:
+ saved_errno = errno;
+ saved_rc = rc;
+ PERROR("Save failed");
+
+ done:
+ cleanup(ctx);
+
+ if ( saved_rc )
+ {
+ rc = saved_rc;
+ errno = saved_errno;
+ }
+
+ return rc;
+}
+
+int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
+ uint32_t flags, struct save_callbacks *callbacks,
+ xc_stream_type_t stream_type, int recv_fd)
+{
+ struct xc_sr_context ctx = {
+ .xch = xch,
+ .fd = io_fd,
+ .stream_type = stream_type,
+ };
+
+ /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
+ ctx.save.callbacks = callbacks;
+ ctx.save.live = !!(flags & XCFLAGS_LIVE);
+ ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
+ ctx.save.recv_fd = recv_fd;
+
+ if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+ {
+ PERROR("Failed to get domain info");
+ return -1;
+ }
+
+ if ( ctx.dominfo.domid != dom )
+ {
+ ERROR("Domain %u does not exist", dom);
+ return -1;
+ }
+
+ /* Sanity check stream_type-related parameters */
+ switch ( stream_type )
+ {
+ case XC_STREAM_COLO:
+ assert(callbacks->wait_checkpoint);
+ /* Fallthrough */
+ case XC_STREAM_REMUS:
+ assert(callbacks->checkpoint && callbacks->postcopy);
+ /* Fallthrough */
+ case XC_STREAM_PLAIN:
+ if ( ctx.dominfo.hvm )
+ assert(callbacks->switch_qemu_logdirty);
+ break;
+
+ default:
+ assert(!"Bad stream_type");
+ break;
+ }
+
+ DPRINTF("fd %d, dom %u, flags %u, hvm %d",
+ io_fd, dom, flags, ctx.dominfo.hvm);
+
+ ctx.domid = dom;
+
+ if ( ctx.dominfo.hvm )
+ {
+ ctx.save.ops = save_ops_x86_hvm;
+ return save(&ctx, DHDR_TYPE_X86_HVM);
+ }
+ else
+ {
+ ctx.save.ops = save_ops_x86_pv;
+ return save(&ctx, DHDR_TYPE_X86_PV);
+ }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+
+#include "xg_sr_common_x86.h"
+
+#include <xen/hvm/params.h>
+
+/*
+ * Query for the HVM context and write an HVM_CONTEXT record into the stream.
+ */
+static int write_hvm_context(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc, hvm_buf_size;
+ struct xc_sr_record hvm_rec = {
+ .type = REC_TYPE_HVM_CONTEXT,
+ };
+
+ hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, 0, 0);
+ if ( hvm_buf_size < 0 )
+ {
+ PERROR("Couldn't get HVM context size from Xen");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_rec.data = malloc(hvm_buf_size);
+ if ( !hvm_rec.data )
+ {
+ PERROR("Couldn't allocate memory");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid,
+ hvm_rec.data, hvm_buf_size);
+ if ( hvm_buf_size < 0 )
+ {
+ PERROR("Couldn't get HVM context from Xen");
+ rc = -1;
+ goto out;
+ }
+
+ hvm_rec.length = hvm_buf_size;
+ rc = write_record(ctx, &hvm_rec);
+ if ( rc < 0 )
+ {
+ PERROR("error write HVM_CONTEXT record");
+ goto out;
+ }
+
+ out:
+ free(hvm_rec.data);
+ return rc;
+}
+
+/*
+ * Query for a range of HVM parameters and write an HVM_PARAMS record into the
+ * stream.
+ */
+static int write_hvm_params(struct xc_sr_context *ctx)
+{
+ static const unsigned int params[] = {
+ HVM_PARAM_STORE_PFN,
+ HVM_PARAM_IOREQ_PFN,
+ HVM_PARAM_BUFIOREQ_PFN,
+ HVM_PARAM_PAGING_RING_PFN,
+ HVM_PARAM_MONITOR_RING_PFN,
+ HVM_PARAM_SHARING_RING_PFN,
+ HVM_PARAM_VM86_TSS_SIZED,
+ HVM_PARAM_CONSOLE_PFN,
+ HVM_PARAM_ACPI_IOPORTS_LOCATION,
+ HVM_PARAM_VIRIDIAN,
+ HVM_PARAM_IDENT_PT,
+ HVM_PARAM_VM_GENERATION_ID_ADDR,
+ HVM_PARAM_IOREQ_SERVER_PFN,
+ HVM_PARAM_NR_IOREQ_SERVER_PAGES,
+ HVM_PARAM_X87_FIP_WIDTH,
+ HVM_PARAM_MCA_CAP,
+ };
+
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_hvm_params_entry entries[ARRAY_SIZE(params)];
+ struct xc_sr_rec_hvm_params hdr = {
+ .count = 0,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_HVM_PARAMS,
+ .length = sizeof(hdr),
+ .data = &hdr,
+ };
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i < ARRAY_SIZE(params); i++ )
+ {
+ uint32_t index = params[i];
+ uint64_t value;
+
+ rc = xc_hvm_param_get(xch, ctx->domid, index, &value);
+ if ( rc )
+ {
+ PERROR("Failed to get HVMPARAM at index %u", index);
+ return rc;
+ }
+
+ if ( value != 0 )
+ {
+ entries[hdr.count].index = index;
+ entries[hdr.count].value = value;
+ hdr.count++;
+ }
+ }
+
+ /* No params? Skip this record. */
+ if ( hdr.count == 0 )
+ return 0;
+
+ rc = write_split_record(ctx, &rec, entries, hdr.count * sizeof(*entries));
+ if ( rc )
+ PERROR("Failed to write HVM_PARAMS record");
+
+ return rc;
+}
+
+static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ /* identity map */
+ return pfn;
+}
+
+static int x86_hvm_normalise_page(struct xc_sr_context *ctx,
+ xen_pfn_t type, void **page)
+{
+ return 0;
+}
+
+static int x86_hvm_setup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t nr_pfns;
+
+ if ( xc_domain_nr_gpfns(xch, ctx->domid, &nr_pfns) < 0 )
+ {
+ PERROR("Unable to obtain the guest p2m size");
+ return -1;
+ }
+#ifdef __i386__
+ /* Very large domains (> 1TB) will exhaust virtual address space. */
+ if ( nr_pfns > 0x0fffffff )
+ {
+ errno = E2BIG;
+ PERROR("Cannot save this big a guest");
+ return -1;
+ }
+#endif
+
+ ctx->save.p2m_size = nr_pfns;
+
+ if ( ctx->save.callbacks->switch_qemu_logdirty(
+ ctx->domid, 1, ctx->save.callbacks->data) )
+ {
+ PERROR("Couldn't enable qemu log-dirty mode");
+ return -1;
+ }
+
+ ctx->x86.hvm.save.qemu_enabled_logdirty = true;
+
+ return 0;
+}
+
+static int x86_hvm_static_data(struct xc_sr_context *ctx)
+{
+ return write_x86_cpu_policy_records(ctx);
+}
+
+static int x86_hvm_start_of_stream(struct xc_sr_context *ctx)
+{
+ return 0;
+}
+
+static int x86_hvm_start_of_checkpoint(struct xc_sr_context *ctx)
+{
+ return 0;
+}
+
+static int x86_hvm_check_vm_state(struct xc_sr_context *ctx)
+{
+ return 0;
+}
+
+static int x86_hvm_end_of_checkpoint(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ /* Write the TSC record. */
+ rc = write_x86_tsc_info(ctx);
+ if ( rc )
+ return rc;
+
+ /* Write the HVM_CONTEXT record. */
+ rc = write_hvm_context(ctx);
+ if ( rc )
+ return rc;
+
+ /* Write the HVM_PARAMS record containing applicable HVM params. */
+ rc = write_hvm_params(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_hvm_cleanup(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ /* If qemu successfully enabled logdirty mode, attempt to disable. */
+ if ( ctx->x86.hvm.save.qemu_enabled_logdirty &&
+ ctx->save.callbacks->switch_qemu_logdirty(
+ ctx->domid, 0, ctx->save.callbacks->data) )
+ {
+ PERROR("Couldn't disable qemu log-dirty mode");
+ return -1;
+ }
+
+ return 0;
+}
+
+struct xc_sr_save_ops save_ops_x86_hvm =
+{
+ .pfn_to_gfn = x86_hvm_pfn_to_gfn,
+ .normalise_page = x86_hvm_normalise_page,
+ .setup = x86_hvm_setup,
+ .static_data = x86_hvm_static_data,
+ .start_of_stream = x86_hvm_start_of_stream,
+ .start_of_checkpoint = x86_hvm_start_of_checkpoint,
+ .end_of_checkpoint = x86_hvm_end_of_checkpoint,
+ .check_vm_state = x86_hvm_check_vm_state,
+ .cleanup = x86_hvm_cleanup,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#include <assert.h>
+#include <limits.h>
+
+#include "xg_sr_common_x86_pv.h"
+
+/* Check a 64 bit virtual address for being canonical. */
+static inline bool is_canonical_address(xen_vaddr_t vaddr)
+{
+ return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
+}
+
+/*
+ * Maps the guest's shared info page.
+ */
+static int map_shinfo(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+
+ ctx->x86.pv.shinfo = xc_map_foreign_range(
+ xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
+ if ( !ctx->x86.pv.shinfo )
+ {
+ PERROR("Failed to map shared info frame at mfn %#lx",
+ ctx->dominfo.shared_info_frame);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy a list of mfns from a guest, accounting for differences between guest
+ * and toolstack width. Can fail if truncation would occur.
+ */
+static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
+ xen_pfn_t *dst, const void *src, size_t count)
+{
+ size_t x;
+
+ if ( ctx->x86.pv.width == sizeof(unsigned long) )
+ memcpy(dst, src, count * sizeof(*dst));
+ else
+ {
+ for ( x = 0; x < count; ++x )
+ {
+#ifdef __x86_64__
+ /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
+ uint32_t s = ((uint32_t *)src)[x];
+
+ dst[x] = s == ~0U ? INVALID_MFN : s;
+#else
+ /*
+ * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
+ * if any other truncation would occur.
+ *
+ * This will only occur on hosts where a PV guest has ram above
+ * the 16TB boundary. A 32bit dom0 is unlikely to have
+ * successfully booted on a system this large.
+ */
+ uint64_t s = ((uint64_t *)src)[x];
+
+ if ( (s != ~0ULL) && ((s >> 32) != 0) )
+ {
+ errno = E2BIG;
+ return -1;
+ }
+
+ dst[x] = s;
+#endif
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Map the p2m leaf pages and build an array of their pfns.
+ */
+static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
+ size_t n_mfns)
+{
+ xc_interface *xch = ctx->xch;
+ unsigned int x;
+
+ ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
+ mfns, n_mfns);
+ if ( !ctx->x86.pv.p2m )
+ {
+ PERROR("Failed to map p2m frames");
+ return -1;
+ }
+
+ ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
+ ctx->x86.pv.p2m_frames = n_mfns;
+ ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
+ if ( !ctx->x86.pv.p2m_pfns )
+ {
+ ERROR("Cannot allocate %zu bytes for p2m pfns list",
+ n_mfns * sizeof(*mfns));
+ return -1;
+ }
+
+ /* Convert leaf frames from mfns to pfns. */
+ for ( x = 0; x < n_mfns; ++x )
+ {
+ if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
+ {
+ ERROR("Bad mfn in p2m_frame_list[%u]", x);
+ dump_bad_pseudophysmap_entry(ctx, mfns[x]);
+ errno = ERANGE;
+ return -1;
+ }
+
+ ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the guest's frame list list and frame list to identify and map the
+ * frames making up the guest's p2m table. Construct a list of pfns making up
+ * the table.
+ */
+static int map_p2m_tree(struct xc_sr_context *ctx)
+{
+ /* Terminology:
+ *
+ * fll - frame list list, top level p2m, list of fl mfns
+ * fl - frame list, mid level p2m, list of leaf mfns
+ * local - own allocated buffers, adjusted for bitness
+ * guest - mappings into the domain
+ */
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ unsigned int x, saved_x, fpp, fll_entries, fl_entries;
+ xen_pfn_t fll_mfn, saved_mfn, max_pfn;
+
+ xen_pfn_t *local_fll = NULL;
+ void *guest_fll = NULL;
+ size_t local_fll_size;
+
+ xen_pfn_t *local_fl = NULL;
+ void *guest_fl = NULL;
+ size_t local_fl_size;
+
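+ /* One frame holds fpp p2m entries, so a single frame list list entry covers fpp * fpp pfns. */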
+ fpp = PAGE_SIZE / ctx->x86.pv.width;
+ fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
+ if ( fll_entries > fpp )
+ {
+ ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
+ goto err;
+ }
+
+ fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
+ ctx->x86.pv.width);
+ if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
+ goto err;
+ }
+
+ /* Map the guest top p2m. */
+ guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
+ PROT_READ, fll_mfn);
+ if ( !guest_fll )
+ {
+ PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
+ goto err;
+ }
+
+ local_fll_size = fll_entries * sizeof(*local_fll);
+ local_fll = malloc(local_fll_size);
+ if ( !local_fll )
+ {
+ ERROR("Cannot allocate %zu bytes for local p2m frame list list",
+ local_fll_size);
+ goto err;
+ }
+
+ if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
+ {
+ ERROR("Truncation detected copying p2m frame list list");
+ goto err;
+ }
+
+ /* Check for bad mfns in frame list list. */
+ saved_mfn = 0;
+ saved_x = 0;
+ for ( x = 0; x < fll_entries; ++x )
+ {
+ if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list",
+ local_fll[x], x, fll_entries);
+ goto err;
+ }
+ if ( local_fll[x] != saved_mfn )
+ {
+ saved_mfn = local_fll[x];
+ saved_x = x;
+ }
+ }
+
+ /*
+ * Check for an effectively lower max_pfn:
+ * If the trailing entries of the frame list list are all identical, we can
+ * assume they all reference mid pages which in turn reference p2m pages
+ * containing only invalid entries. Otherwise multiple pfns would reference
+ * the same mfn, which can't work across migration, as such sharing would
+ * be broken by the migration process.
+ * Adjust max_pfn where possible, to avoid allocating much larger areas
+ * than needed for the p2m and the logdirty map.
+ */
+ max_pfn = (saved_x + 1) * fpp * fpp - 1;
+ if ( max_pfn < ctx->x86.pv.max_pfn )
+ {
+ ctx->x86.pv.max_pfn = max_pfn;
+ fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
+ }
+ ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
+ DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
+ ctx->x86.pv.p2m_frames);
+ fl_entries = (ctx->x86.pv.max_pfn / fpp) + 1;
+
+ /* Map the guest mid p2m frames. */
+ guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
+ local_fll, fll_entries);
+ if ( !guest_fl )
+ {
+ PERROR("Failed to map p2m frame list");
+ goto err;
+ }
+
+ local_fl_size = fl_entries * sizeof(*local_fl);
+ local_fl = malloc(local_fl_size);
+ if ( !local_fl )
+ {
+ ERROR("Cannot allocate %zu bytes for local p2m frame list",
+ local_fl_size);
+ goto err;
+ }
+
+ if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) )
+ {
+ ERROR("Truncation detected copying p2m frame list");
+ goto err;
+ }
+
+ for ( x = 0; x < fl_entries; ++x )
+ {
+ if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list",
+ local_fl[x], x, fl_entries);
+ goto err;
+ }
+ }
+
+ /* Map the p2m leaves themselves. */
+ rc = map_p2m_leaves(ctx, local_fl, fl_entries);
+
+ err:
+ free(local_fl);
+ if ( guest_fl )
+ munmap(guest_fl, fll_entries * PAGE_SIZE);
+
+ free(local_fll);
+ if ( guest_fll )
+ munmap(guest_fll, PAGE_SIZE);
+
+ return rc;
+}
+
+/*
+ * Get p2m_generation count.
+ * Returns an error if the generation count has changed since the last call.
+ */
+static int get_p2m_generation(struct xc_sr_context *ctx)
+{
+ uint64_t p2m_generation;
+ int rc;
+
+ p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation,
+ ctx->x86.pv.width);
+
+ rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1;
+ ctx->x86.pv.p2m_generation = p2m_generation;
+
+ return rc;
+}
+
+static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ int rc;
+
+ if ( !ctx->save.live )
+ return 0;
+
+ rc = get_p2m_generation(ctx);
+ if ( rc )
+ ERROR("p2m generation count changed. Migration aborted.");
+
+ return rc;
+}
+
+/*
+ * Map the guest p2m frames specified via a cr3 value, a virtual address, and
+ * the maximum pfn. PTE entries are 64 bits for both 32 and 64 bit guests,
+ * as in the 32 bit case only PAE guests are supported.
+ */
+static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3)
+{
+ xc_interface *xch = ctx->xch;
+ xen_vaddr_t p2m_vaddr, p2m_end, mask, off;
+ xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn;
+ uint64_t *ptes = NULL;
+ xen_pfn_t *mfns = NULL;
+ unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx;
+ int rc = -1;
+
+ p2m_mfn = cr3_to_mfn(ctx, p2m_cr3);
+ assert(p2m_mfn != 0);
+ if ( p2m_mfn > ctx->x86.pv.max_mfn )
+ {
+ ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3);
+ errno = ERANGE;
+ goto err;
+ }
+
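+ /* Seed ctx->x86.pv.p2m_generation; the return value of this first call is deliberately ignored. */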
+ get_p2m_generation(ctx);
+
+ p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr,
+ ctx->x86.pv.width);
+ fpp = PAGE_SIZE / ctx->x86.pv.width;
+ ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1;
+ p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
+
+ if ( ctx->x86.pv.width == 8 )
+ {
+ mask = 0x0000ffffffffffffULL;
+ if ( !is_canonical_address(p2m_vaddr) ||
+ !is_canonical_address(p2m_end) ||
+ p2m_end < p2m_vaddr ||
+ (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 &&
+ p2m_end > HYPERVISOR_VIRT_START_X86_64) )
+ {
+ ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
+ p2m_vaddr, p2m_end);
+ errno = ERANGE;
+ goto err;
+ }
+ }
+ else
+ {
+ mask = 0x00000000ffffffffULL;
+ if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr ||
+ (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 &&
+ p2m_end > HYPERVISOR_VIRT_START_X86_32) )
+ {
+ ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
+ p2m_vaddr, p2m_end);
+ errno = ERANGE;
+ goto err;
+ }
+ }
+
+ DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx",
+ p2m_vaddr, p2m_end, p2m_mfn);
+ DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
+ ctx->x86.pv.p2m_frames);
+
+ mfns = malloc(sizeof(*mfns));
+ if ( !mfns )
+ {
+ ERROR("Cannot allocate memory for array of %u mfns", 1);
+ goto err;
+ }
+ mfns[0] = p2m_mfn;
+ off = 0;
+ saved_mfn = 0;
+ idx_start = idx_end = saved_idx = 0;
+
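+ /*
+ * Walk down the pagetable levels, at each level mapping only the PTE
+ * pages covering the p2m's virtual address range and collecting the
+ * mfns referenced for the next level down.
+ */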
+ for ( level = ctx->x86.pv.levels; level > 0; level-- )
+ {
+ n_pages = idx_end - idx_start + 1;
+ ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages);
+ if ( !ptes )
+ {
+ PERROR("Failed to map %u page table pages for p2m list", n_pages);
+ goto err;
+ }
+ free(mfns);
+
+ shift = level * 9 + 3;
+ idx_start = ((p2m_vaddr - off) & mask) >> shift;
+ idx_end = ((p2m_end - off) & mask) >> shift;
+ idx = idx_end - idx_start + 1;
+ mfns = malloc(sizeof(*mfns) * idx);
+ if ( !mfns )
+ {
+ ERROR("Cannot allocate memory for array of %u mfns", idx);
+ goto err;
+ }
+
+ for ( idx = idx_start; idx <= idx_end; idx++ )
+ {
+ mfn = pte_to_frame(ptes[idx]);
+ if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn )
+ {
+ ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list",
+ mfn, off + ((xen_vaddr_t)idx << shift), level);
+ errno = ERANGE;
+ goto err;
+ }
+ mfns[idx - idx_start] = mfn;
+
+ /* Maximum pfn check at level 2. Same reasoning as for p2m tree. */
+ if ( level == 2 )
+ {
+ if ( mfn != saved_mfn )
+ {
+ saved_mfn = mfn;
+ saved_idx = idx - idx_start;
+ }
+ }
+ }
+
+ if ( level == 2 )
+ {
+ if ( saved_idx == idx_end )
+ saved_idx++;
+ max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1;
+ if ( max_pfn < ctx->x86.pv.max_pfn )
+ {
+ ctx->x86.pv.max_pfn = max_pfn;
+ ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
+ p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
+ idx_end = idx_start + saved_idx;
+ }
+ }
+
+ munmap(ptes, n_pages * PAGE_SIZE);
+ ptes = NULL;
+ off = p2m_vaddr & ((mask >> shift) << shift);
+ }
+
+ /* Map the p2m leaves themselves. */
+ rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1);
+
+ err:
+ free(mfns);
+ if ( ptes )
+ munmap(ptes, n_pages * PAGE_SIZE);
+
+ return rc;
+}
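+
+/*
+ * Editorial sketch, not part of the original patch: the index arithmetic
+ * used in the walk above.  The page offset is 12 bits and every level
+ * decodes 9 bits, so the shift for level N is 12 + 9 * (N - 1) = N * 9 + 3.
+ */
+#if 0 /* illustration only */
+static unsigned int vaddr_to_pt_index(xen_vaddr_t va, xen_vaddr_t off,
+                                      xen_vaddr_t mask, unsigned int level)
+{
+    unsigned int shift = level * 9 + 3;
+
+    /* E.g. level 4: bits 39-47 of the address, giving an index 0..511. */
+    return ((va - off) & mask) >> shift;
+}
+#endif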
+
+/*
+ * Map the guest p2m frames.
+ * Depending on guest support this might either be a virtual mapped linear
+ * list (preferred format) or a 3 level tree linked via mfns.
+ */
+static int map_p2m(struct xc_sr_context *ctx)
+{
+ uint64_t p2m_cr3;
+
+ ctx->x86.pv.p2m_generation = ~0ULL;
+ ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn,
+ ctx->x86.pv.width) - 1;
+ p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width);
+
+ return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx);
+}
+
+/*
+ * Obtain a specific vcpu's basic state and write an X86_PV_VCPU_BASIC record
+ * into the stream. Performs mfn->pfn conversion on architectural state.
+ */
+static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ xen_pfn_t mfn, pfn;
+ unsigned int i, gdt_count;
+ int rc = -1;
+ vcpu_guest_context_any_t vcpu;
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_VCPU_BASIC,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+
+ if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
+ {
+ PERROR("Failed to get vcpu%u context", id);
+ goto err;
+ }
+
+ /* Vcpu0 is special: Convert the suspend record to a pfn. */
+ if ( id == 0 )
+ {
+ mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for suspend record");
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn),
+ ctx->x86.pv.width);
+ }
+
+ gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width);
+ if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
+ {
+ ERROR("GDT entry count (%u) out of range (max %u)",
+ gdt_count, FIRST_RESERVED_GDT_ENTRY);
+ errno = ERANGE;
+ goto err;
+ }
+ gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
+
+ /* Convert GDT frames to pfns. */
+ for ( i = 0; i < gdt_count; ++i )
+ {
+ mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width);
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn),
+ ctx->x86.pv.width);
+ }
+
+ /* Convert CR3 to a pfn. */
+ mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width));
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for vcpu%u's cr3", id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ pfn = mfn_to_pfn(ctx, mfn);
+ SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width);
+
+ /* 64bit guests: Convert CR1 (guest pagetables) to pfn. */
+ if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] )
+ {
+ mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ ERROR("Bad mfn for vcpu%u's cr1", id);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ goto err;
+ }
+ pfn = mfn_to_pfn(ctx, mfn);
+ vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
+ }
+
+ if ( ctx->x86.pv.width == 8 )
+ rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64));
+ else
+ rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32));
+
+ err:
+ return rc;
+}
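+
+/*
+ * Editorial note, not part of the original patch: one GDT frame holds
+ * PAGE_SIZE / 8 = 512 eight-byte descriptors, so the rounding above is a
+ * ceiling division from descriptor entries to frames.
+ */
+#if 0 /* illustration only */
+static unsigned int gdt_entries_to_frames(unsigned int ents)
+{
+    return (ents + 511) / 512;    /* 1..512 -> 1 frame, 513 -> 2 frames */
+}
+#endif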
+
+/*
+ * Obtain a specific vcpu's extended state and write an X86_PV_VCPU_EXTENDED
+ * record into the stream.
+ */
+static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_VCPU_EXTENDED,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl = {
+ .cmd = XEN_DOMCTL_get_ext_vcpucontext,
+ .domain = ctx->domid,
+ .u.ext_vcpucontext.vcpu = id,
+ };
+
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u extended context", id);
+ return -1;
+ }
+
+ /* No content? Skip the record. */
+ if ( domctl.u.ext_vcpucontext.size == 0 )
+ return 0;
+
+ return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext,
+ domctl.u.ext_vcpucontext.size);
+}
+
+/*
+ * Query to see whether a specific vcpu has xsave state and if so, write an
+ * X86_PV_VCPU_XSAVE record into the stream.
+ */
+static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_VCPU_XSAVE,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl = {
+ .cmd = XEN_DOMCTL_getvcpuextstate,
+ .domain = ctx->domid,
+ .u.vcpuextstate.vcpu = id,
+ };
+
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's xsave context", id);
+ goto err;
+ }
+
+    /* No xsave state? Skip this record. */
+ if ( !domctl.u.vcpuextstate.xfeature_mask )
+ goto out;
+
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
+ if ( !buffer )
+ {
+        ERROR("Unable to allocate %"PRIu64" bytes for vcpu%u's xsave context",
+ domctl.u.vcpuextstate.size, id);
+ goto err;
+ }
+
+ set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's xsave context", id);
+ goto err;
+ }
+
+ /* No xsave state? Skip this record. */
+ if ( domctl.u.vcpuextstate.size == 0 )
+ goto out;
+
+ rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size);
+ if ( rc )
+ goto err;
+
+ out:
+ rc = 0;
+
+ err:
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
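+
+/*
+ * Editorial sketch, not part of the original patch: xsave (above) and MSRs
+ * (below) share a two-pass domctl idiom, distilled here with error handling
+ * elided:
+ */
+#if 0 /* illustration only */
+static void fetch_xsave(xc_interface *xch, uint32_t domid, uint32_t vcpu)
+{
+    DECLARE_HYPERCALL_BUFFER(void, buffer);
+    struct xen_domctl domctl = {
+        .cmd = XEN_DOMCTL_getvcpuextstate,
+        .domain = domid,
+        .u.vcpuextstate.vcpu = vcpu,
+    };
+
+    xc_domctl(xch, &domctl);                /* Pass 1: sizes the state. */
+    buffer = xc_hypercall_buffer_alloc(xch, buffer,
+                                       domctl.u.vcpuextstate.size);
+    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+    xc_domctl(xch, &domctl);                /* Pass 2: fetches the state. */
+
+    /* ... consume buffer ... */
+    xc_hypercall_buffer_free(xch, buffer);
+}
+#endif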
+
+/*
+ * Query to see whether a specific vcpu has msr state and if so, write an
+ * X86_PV_VCPU_MSRS record into the stream.
+ */
+static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id)
+{
+ xc_interface *xch = ctx->xch;
+ int rc = -1;
+ size_t buffersz;
+ DECLARE_HYPERCALL_BUFFER(void, buffer);
+ struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
+ .vcpu_id = id,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_VCPU_MSRS,
+ .length = sizeof(vhdr),
+ .data = &vhdr,
+ };
+ struct xen_domctl domctl = {
+ .cmd = XEN_DOMCTL_get_vcpu_msrs,
+ .domain = ctx->domid,
+ .u.vcpu_msrs.vcpu = id,
+ };
+
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's msrs", id);
+ goto err;
+ }
+
+    /* No MSRs? Skip this record. */
+ if ( !domctl.u.vcpu_msrs.msr_count )
+ goto out;
+
+ buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t);
+ buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+ if ( !buffer )
+ {
+ ERROR("Unable to allocate %zu bytes for vcpu%u's msrs",
+ buffersz, id);
+ goto err;
+ }
+
+ set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
+ if ( xc_domctl(xch, &domctl) < 0 )
+ {
+ PERROR("Unable to get vcpu%u's msrs", id);
+ goto err;
+ }
+
+ /* No MSRs? Skip this record. */
+ if ( domctl.u.vcpu_msrs.msr_count == 0 )
+ goto out;
+
+ rc = write_split_record(ctx, &rec, buffer,
+ domctl.u.vcpu_msrs.msr_count *
+ sizeof(xen_domctl_vcpu_msr_t));
+ if ( rc )
+ goto err;
+
+ out:
+ rc = 0;
+
+ err:
+ xc_hypercall_buffer_free(xch, buffer);
+
+ return rc;
+}
+
+/*
+ * For each vcpu, if it is online, write its state into the stream.
+ */
+static int write_all_vcpu_information(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+ xc_vcpuinfo_t vinfo;
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
+ {
+ rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
+ if ( rc )
+ {
+ PERROR("Failed to get vcpu%u information", i);
+ return rc;
+ }
+
+        /* Vcpu offline? Skip all these records. */
+ if ( !vinfo.online )
+ continue;
+
+ rc = write_one_vcpu_basic(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_extended(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_xsave(ctx, i);
+ if ( rc )
+ return rc;
+
+ rc = write_one_vcpu_msrs(ctx, i);
+ if ( rc )
+ return rc;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes an X86_PV_INFO record into the stream.
+ */
+static int write_x86_pv_info(struct xc_sr_context *ctx)
+{
+ struct xc_sr_rec_x86_pv_info info = {
+ .guest_width = ctx->x86.pv.width,
+ .pt_levels = ctx->x86.pv.levels,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_INFO,
+ .length = sizeof(info),
+ .data = &info,
+ };
+
+ return write_record(ctx, &rec);
+}
+
+/*
+ * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list
+ * of pfns making up the p2m table.
+ */
+static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx)
+{
+ xc_interface *xch = ctx->xch;
+    int rc;
+    unsigned int i;
+ size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t);
+ uint64_t *data = NULL;
+ struct xc_sr_rec_x86_pv_p2m_frames hdr = {
+ .end_pfn = ctx->x86.pv.max_pfn,
+ };
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_X86_PV_P2M_FRAMES,
+ .length = sizeof(hdr),
+ .data = &hdr,
+ };
+
+ /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). */
+ if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) )
+ {
+ if ( !(data = malloc(datasz)) )
+ {
+ ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data",
+ datasz);
+ return -1;
+ }
+
+ for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
+ data[i] = ctx->x86.pv.p2m_pfns[i];
+ }
+ else
+ data = (uint64_t *)ctx->x86.pv.p2m_pfns;
+
+ rc = write_split_record(ctx, &rec, data, datasz);
+
+ if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns )
+ free(data);
+
+ return rc;
+}
+
+/*
+ * Writes an SHARED_INFO record into the stream.
+ */
+static int write_shared_info(struct xc_sr_context *ctx)
+{
+ struct xc_sr_record rec = {
+ .type = REC_TYPE_SHARED_INFO,
+ .length = PAGE_SIZE,
+ .data = ctx->x86.pv.shinfo,
+ };
+
+ return write_record(ctx, &rec);
+}
+
+/*
+ * Normalise a pagetable for the migration stream. Performs mfn->pfn
+ * conversions on the ptes.
+ */
+static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src,
+ uint64_t *dst, unsigned long type)
+{
+ xc_interface *xch = ctx->xch;
+ uint64_t pte;
+ unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( ctx->x86.pv.levels == 4 )
+ {
+ /* 64bit guests only have Xen mappings in their L4 tables. */
+ if ( type == XEN_DOMCTL_PFINFO_L4TAB )
+ {
+ xen_first = (HYPERVISOR_VIRT_START_X86_64 >>
+ L4_PAGETABLE_SHIFT_X86_64) & 511;
+ xen_last = (HYPERVISOR_VIRT_END_X86_64 >>
+ L4_PAGETABLE_SHIFT_X86_64) & 511;
+ }
+ }
+ else
+ {
+ switch ( type )
+ {
+ case XEN_DOMCTL_PFINFO_L4TAB:
+ ERROR("??? Found L4 table for 32bit guest");
+ errno = EINVAL;
+ return -1;
+
+ case XEN_DOMCTL_PFINFO_L3TAB:
+ /* 32bit guests can only use the first 4 entries of their L3 tables.
+             * All others are potentially used by Xen. */
+ xen_first = 4;
+ xen_last = 511;
+ break;
+
+ case XEN_DOMCTL_PFINFO_L2TAB:
+            /* It is hard to spot Xen mappings in a 32bit guest's L2. Most
+             * L2 tables are entirely normal; only a few contain Xen mappings.
+ */
+ i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511;
+ if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 )
+ {
+ xen_first = i;
+ xen_last = (HYPERVISOR_VIRT_END_X86_32 >>
+ L2_PAGETABLE_SHIFT_PAE) & 511;
+ }
+ break;
+ }
+ }
+
+ for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+ {
+ xen_pfn_t mfn;
+
+ pte = src[i];
+
+ /* Remove Xen mappings: Xen will reconstruct on the other side. */
+ if ( i >= xen_first && i <= xen_last )
+ pte = 0;
+
+ /*
+ * Errors during the live part of migration are expected as a result
+ * of split pagetable updates, page type changes, active grant
+ * mappings etc. The pagetable will need to be resent after pausing.
+ * In such cases we fail with EAGAIN.
+ *
+ * For domains which are already paused, errors are fatal.
+ */
+ if ( pte & _PAGE_PRESENT )
+ {
+ mfn = pte_to_frame(pte);
+
+#ifdef __i386__
+ if ( mfn == INVALID_MFN )
+ {
+ if ( !ctx->dominfo.paused )
+ errno = EAGAIN;
+ else
+ {
+ ERROR("PTE truncation detected. L%lu[%u] = %016"PRIx64,
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ }
+ return -1;
+ }
+#endif
+
+ if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) )
+ {
+ ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")",
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
+ errno = E2BIG;
+ return -1;
+ }
+
+ if ( !mfn_in_pseudophysmap(ctx, mfn) )
+ {
+ if ( !ctx->dominfo.paused )
+ errno = EAGAIN;
+ else
+ {
+ ERROR("Bad mfn for L%lu[%u]",
+ type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
+ dump_bad_pseudophysmap_entry(ctx, mfn);
+ errno = ERANGE;
+ }
+ return -1;
+ }
+
+ pte = merge_pte(pte, mfn_to_pfn(ctx, mfn));
+ }
+
+ dst[i] = pte;
+ }
+
+ return 0;
+}
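+
+/*
+ * Editorial sketch, not part of the original patch: merge_pte() (defined
+ * elsewhere in this series) swaps the frame number inside a pte while
+ * preserving the flag bits.  Conceptually, assuming the usual 52-bit
+ * physical address limit:
+ */
+#if 0 /* illustration only */
+static uint64_t rewrite_pte_frame(uint64_t pte, xen_pfn_t new_frame)
+{
+    const uint64_t frame_mask = 0x000ffffffffff000ULL;  /* bits 12-51 */
+
+    return (pte & ~frame_mask) |
+           (((uint64_t)new_frame << PAGE_SHIFT) & frame_mask);
+}
+#endif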
+
+static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx,
+ xen_pfn_t pfn)
+{
+ assert(pfn <= ctx->x86.pv.max_pfn);
+
+ return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
+}
+
+/*
+ * save_ops function. Performs pagetable normalisation on appropriate pages.
+ */
+static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
+ void **page)
+{
+ xc_interface *xch = ctx->xch;
+ void *local_page;
+ int rc;
+
+ type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
+ return 0;
+
+ local_page = malloc(PAGE_SIZE);
+ if ( !local_page )
+ {
+ ERROR("Unable to allocate scratch page");
+ rc = -1;
+ goto out;
+ }
+
+ rc = normalise_pagetable(ctx, *page, local_page, type);
+ *page = local_page;
+
+ out:
+ return rc;
+}
+
+/*
+ * save_ops function. Queries domain information and maps the Xen m2p and the
+ * guest's shinfo and p2m table.
+ */
+static int x86_pv_setup(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = x86_pv_domain_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = x86_pv_map_m2p(ctx);
+ if ( rc )
+ return rc;
+
+ rc = map_shinfo(ctx);
+ if ( rc )
+ return rc;
+
+ rc = map_p2m(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_pv_static_data(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = write_x86_pv_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = write_x86_cpu_policy_records(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_pv_start_of_stream(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ /*
+     * Ideally the P2M should be able to change during migration. Currently
+ * corruption will occur if the contents or location of the P2M changes
+ * during the live migration loop. If one is very lucky, the breakage
+ * will not be subtle.
+ */
+ rc = write_x86_pv_p2m_frames(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx)
+{
+ return 0;
+}
+
+static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx)
+{
+ int rc;
+
+ rc = write_x86_tsc_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = write_shared_info(ctx);
+ if ( rc )
+ return rc;
+
+ rc = write_all_vcpu_information(ctx);
+ if ( rc )
+ return rc;
+
+ return 0;
+}
+
+static int x86_pv_check_vm_state(struct xc_sr_context *ctx)
+{
+ if ( ctx->x86.pv.p2m_generation == ~0ULL )
+ return 0;
+
+ return x86_pv_check_vm_state_p2m_list(ctx);
+}
+
+static int x86_pv_cleanup(struct xc_sr_context *ctx)
+{
+ free(ctx->x86.pv.p2m_pfns);
+
+ if ( ctx->x86.pv.p2m )
+ munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);
+
+ if ( ctx->x86.pv.shinfo )
+ munmap(ctx->x86.pv.shinfo, PAGE_SIZE);
+
+ if ( ctx->x86.pv.m2p )
+ munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
+
+ return 0;
+}
+
+struct xc_sr_save_ops save_ops_x86_pv =
+{
+ .pfn_to_gfn = x86_pv_pfn_to_gfn,
+ .normalise_page = x86_pv_normalise_page,
+ .setup = x86_pv_setup,
+ .static_data = x86_pv_static_data,
+ .start_of_stream = x86_pv_start_of_stream,
+ .start_of_checkpoint = x86_pv_start_of_checkpoint,
+ .end_of_checkpoint = x86_pv_end_of_checkpoint,
+ .check_vm_state = x86_pv_check_vm_state,
+ .cleanup = x86_pv_cleanup,
+};
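+
+/*
+ * Editorial sketch, not part of the original patch: the common save code
+ * invokes these hooks roughly in the order below, with the checkpoint pair
+ * repeating for checkpointing stream types.  Hypothetical driver shape,
+ * error handling elided:
+ */
+#if 0 /* illustration only */
+static void drive_save(struct xc_sr_context *ctx,
+                       const struct xc_sr_save_ops *ops, bool checkpointing)
+{
+    ops->setup(ctx);
+    ops->static_data(ctx);
+    ops->start_of_stream(ctx);
+
+    do {
+        ops->start_of_checkpoint(ctx);
+        /* ... PAGE_DATA records, via pfn_to_gfn() and normalise_page() ... */
+        ops->end_of_checkpoint(ctx);
+    } while ( checkpointing );
+
+    ops->cleanup(ctx);
+}
+#endif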
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+#ifndef __STREAM_FORMAT__H
+#define __STREAM_FORMAT__H
+
+/*
+ * C structures for the Migration v2 stream format.
+ * See docs/specs/libxc-migration-stream.pandoc
+ */
+
+#include <inttypes.h>
+
+/*
+ * Image Header
+ */
+struct xc_sr_ihdr
+{
+ uint64_t marker;
+ uint32_t id;
+ uint32_t version;
+ uint16_t options;
+ uint16_t _res1;
+ uint32_t _res2;
+};
+
+#define IHDR_MARKER 0xffffffffffffffffULL
+#define IHDR_ID 0x58454E46U
+
+#define _IHDR_OPT_ENDIAN 0
+#define IHDR_OPT_LITTLE_ENDIAN (0 << _IHDR_OPT_ENDIAN)
+#define IHDR_OPT_BIG_ENDIAN (1 << _IHDR_OPT_ENDIAN)
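+
+/*
+ * Editorial sketch, not part of this header as committed: how a reader
+ * might validate the fixed fields above.  The all-ones marker cannot be
+ * mistaken for a legacy stream, and the id is "XENF" in ASCII.
+ */
+#if 0 /* illustration only */
+static inline int ihdr_is_valid(const struct xc_sr_ihdr *ihdr)
+{
+    /* Version and endianness still need checking against the reader. */
+    return ihdr->marker == IHDR_MARKER && ihdr->id == IHDR_ID;
+}
+#endif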
+
+/*
+ * Domain Header
+ */
+struct xc_sr_dhdr
+{
+ uint32_t type;
+ uint16_t page_shift;
+ uint16_t _res1;
+ uint32_t xen_major;
+ uint32_t xen_minor;
+};
+
+#define DHDR_TYPE_X86_PV 0x00000001U
+#define DHDR_TYPE_X86_HVM 0x00000002U
+
+/*
+ * Record Header
+ */
+struct xc_sr_rhdr
+{
+ uint32_t type;
+ uint32_t length;
+};
+
+/* All records must be aligned up to an 8 octet boundary */
+#define REC_ALIGN_ORDER (3U)
+/* Somewhat arbitrary - 128MB */
+#define REC_LENGTH_MAX (128U << 20)
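+
+/*
+ * Editorial sketch, not part of this header as committed: record bodies are
+ * padded up to the 8-octet boundary declared above.
+ */
+#if 0 /* illustration only */
+static inline uint32_t rec_body_aligned(uint32_t length)
+{
+    const uint32_t align = 1U << REC_ALIGN_ORDER;   /* 8 octets */
+
+    return (length + align - 1) & ~(align - 1);     /* e.g. 13 -> 16 */
+}
+#endif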
+
+#define REC_TYPE_END 0x00000000U
+#define REC_TYPE_PAGE_DATA 0x00000001U
+#define REC_TYPE_X86_PV_INFO 0x00000002U
+#define REC_TYPE_X86_PV_P2M_FRAMES 0x00000003U
+#define REC_TYPE_X86_PV_VCPU_BASIC 0x00000004U
+#define REC_TYPE_X86_PV_VCPU_EXTENDED 0x00000005U
+#define REC_TYPE_X86_PV_VCPU_XSAVE 0x00000006U
+#define REC_TYPE_SHARED_INFO 0x00000007U
+#define REC_TYPE_X86_TSC_INFO 0x00000008U
+#define REC_TYPE_HVM_CONTEXT 0x00000009U
+#define REC_TYPE_HVM_PARAMS 0x0000000aU
+#define REC_TYPE_TOOLSTACK 0x0000000bU
+#define REC_TYPE_X86_PV_VCPU_MSRS 0x0000000cU
+#define REC_TYPE_VERIFY 0x0000000dU
+#define REC_TYPE_CHECKPOINT 0x0000000eU
+#define REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST 0x0000000fU
+#define REC_TYPE_STATIC_DATA_END 0x00000010U
+#define REC_TYPE_X86_CPUID_POLICY 0x00000011U
+#define REC_TYPE_X86_MSR_POLICY 0x00000012U
+
+#define REC_TYPE_OPTIONAL 0x80000000U
+
+/* PAGE_DATA */
+struct xc_sr_rec_page_data_header
+{
+ uint32_t count;
+ uint32_t _res1;
+ uint64_t pfn[0];
+};
+
+#define PAGE_DATA_PFN_MASK 0x000fffffffffffffULL
+#define PAGE_DATA_TYPE_MASK 0xf000000000000000ULL
+
+/* X86_PV_INFO */
+struct xc_sr_rec_x86_pv_info
+{
+ uint8_t guest_width;
+ uint8_t pt_levels;
+ uint8_t _res[6];
+};
+
+/* X86_PV_P2M_FRAMES */
+struct xc_sr_rec_x86_pv_p2m_frames
+{
+ uint32_t start_pfn;
+ uint32_t end_pfn;
+ uint64_t p2m_pfns[0];
+};
+
+/* X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} */
+struct xc_sr_rec_x86_pv_vcpu_hdr
+{
+ uint32_t vcpu_id;
+ uint32_t _res1;
+ uint8_t context[0];
+};
+
+/* X86_TSC_INFO */
+struct xc_sr_rec_x86_tsc_info
+{
+ uint32_t mode;
+ uint32_t khz;
+ uint64_t nsec;
+ uint32_t incarnation;
+ uint32_t _res1;
+};
+
+/* HVM_PARAMS */
+struct xc_sr_rec_hvm_params_entry
+{
+ uint64_t index;
+ uint64_t value;
+};
+
+struct xc_sr_rec_hvm_params
+{
+ uint32_t count;
+ uint32_t _res1;
+ struct xc_sr_rec_hvm_params_entry param[0];
+};
+
+#endif
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+/*
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <xenevtchn.h>
+
+#include "xc_private.h"
+#include "xenguest.h"
+
+#define SUSPEND_LOCK_FILE XEN_RUN_DIR "/suspend-evtchn-%d.lock"
+
+/*
+ * locking
+ */
+
+#define ERR(x) do{ \
+ ERROR("Can't " #x " lock file for suspend event channel %s: %s\n", \
+ suspend_file, strerror(errno)); \
+ goto err; \
+}while(0)
+
+#define SUSPEND_FILE_BUFLEN (sizeof(SUSPEND_LOCK_FILE) + 10)
+
+static void get_suspend_file(char buf[], uint32_t domid)
+{
+ snprintf(buf, SUSPEND_FILE_BUFLEN, SUSPEND_LOCK_FILE, domid);
+}
+
+static int lock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd)
+{
+ int fd = -1, r;
+ char suspend_file[SUSPEND_FILE_BUFLEN];
+ struct stat ours, theirs;
+ struct flock fl;
+
+ get_suspend_file(suspend_file, domid);
+
+ *lockfd = -1;
+
+ for (;;) {
+ if (fd >= 0)
+ close (fd);
+
+ fd = open(suspend_file, O_CREAT | O_RDWR, 0600);
+ if (fd < 0)
+ ERR("create");
+
+ r = fcntl(fd, F_SETFD, FD_CLOEXEC);
+ if (r)
+ ERR("fcntl F_SETFD FD_CLOEXEC");
+
+ memset(&fl, 0, sizeof(fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_len = 1;
+ r = fcntl(fd, F_SETLK, &fl);
+ if (r)
+ ERR("fcntl F_SETLK");
+
+ r = fstat(fd, &ours);
+ if (r)
+ ERR("fstat");
+
+ r = stat(suspend_file, &theirs);
+ if (r) {
+ if (errno == ENOENT)
+ /* try again */
+ continue;
+ ERR("stat");
+ }
+
+ if (ours.st_ino != theirs.st_ino)
+ /* someone else must have removed it while we were locking it */
+ continue;
+
+ break;
+ }
+
+ *lockfd = fd;
+ return 0;
+
+ err:
+ if (fd >= 0)
+ close(fd);
+
+ return -1;
+}
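+
+/*
+ * Editorial note, not part of the original patch: the retry loop above
+ * closes an unlink/recreate race.  Between our open() and F_SETLK the
+ * previous holder may have unlinked the file (see unlock_suspend_event()
+ * below), leaving us holding a lock on an orphaned inode while a new file
+ * appears at the path.  Comparing inodes detects exactly that:
+ */
+#if 0 /* illustration only */
+/* Returns true if the locked fd no longer matches the file at path. */
+static bool lock_went_stale(int fd, const char *path)
+{
+    struct stat ours, theirs;
+
+    return fstat(fd, &ours) == 0 && stat(path, &theirs) == 0 &&
+           ours.st_ino != theirs.st_ino;
+}
+#endif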
+
+static int unlock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd)
+{
+ int r;
+ char suspend_file[SUSPEND_FILE_BUFLEN];
+
+ if (*lockfd < 0)
+ return 0;
+
+ get_suspend_file(suspend_file, domid);
+
+ r = unlink(suspend_file);
+ if (r)
+ ERR("unlink");
+
+ r = close(*lockfd);
+ *lockfd = -1;
+ if (r)
+        ERR("close");
+
+    return 0;
+
+ err:
+ if (*lockfd >= 0)
+ close(*lockfd);
+
+ return -1;
+}
+
+int xc_await_suspend(xc_interface *xch, xenevtchn_handle *xce, int suspend_evtchn)
+{
+ int rc;
+
+ do {
+ rc = xenevtchn_pending(xce);
+ if (rc < 0) {
+ ERROR("error polling suspend notification channel: %d", rc);
+ return -1;
+ }
+ } while (rc != suspend_evtchn);
+
+    /* harmless for one-off suspend */
+    rc = xenevtchn_unmask(xce, suspend_evtchn);
+    if (rc < 0)
+        ERROR("failed to unmask suspend notification channel: %d", rc);
+
+ return 0;
+}
+
+/* Internal callers are allowed to call this with suspend_evtchn<0
+ * but *lockfd>0. */
+int xc_suspend_evtchn_release(xc_interface *xch, xenevtchn_handle *xce,
+ uint32_t domid, int suspend_evtchn, int *lockfd)
+{
+ if (suspend_evtchn >= 0)
+ xenevtchn_unbind(xce, suspend_evtchn);
+
+ return unlock_suspend_event(xch, domid, lockfd);
+}
+
+int xc_suspend_evtchn_init_sane(xc_interface *xch, xenevtchn_handle *xce,
+ uint32_t domid, int port, int *lockfd)
+{
+ int rc, suspend_evtchn = -1;
+
+ if (lock_suspend_event(xch, domid, lockfd)) {
+ errno = EINVAL;
+ goto cleanup;
+ }
+
+ suspend_evtchn = xenevtchn_bind_interdomain(xce, domid, port);
+ if (suspend_evtchn < 0) {
+ ERROR("failed to bind suspend event channel: %d", suspend_evtchn);
+ goto cleanup;
+ }
+
+ rc = xc_domain_subscribe_for_suspend(xch, domid, port);
+ if (rc < 0) {
+ ERROR("failed to subscribe to domain: %d", rc);
+ goto cleanup;
+ }
+
+ return suspend_evtchn;
+
+cleanup:
+ xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, lockfd);
+
+ return -1;
+}
+
+int xc_suspend_evtchn_init_exclusive(xc_interface *xch, xenevtchn_handle *xce,
+ uint32_t domid, int port, int *lockfd)
+{
+ int suspend_evtchn;
+
+ suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, lockfd);
+ if (suspend_evtchn < 0)
+ return suspend_evtchn;
+
+ /* event channel is pending immediately after binding */
+ xc_await_suspend(xch, xce, suspend_evtchn);
+
+ return suspend_evtchn;
+}
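+
+/*
+ * Editorial sketch, not part of the original patch: typical toolstack use
+ * of this API, assuming xce is an open xenevtchn handle and port is the
+ * suspend event channel the guest advertised:
+ */
+#if 0 /* illustration only */
+static int suspend_dance(xc_interface *xch, xenevtchn_handle *xce,
+                         uint32_t domid, int port)
+{
+    int lockfd;
+    int evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, &lockfd);
+
+    if (evtchn < 0)
+        return -1;
+
+    /* ... ask the guest to suspend, then ... */
+    xc_await_suspend(xch, xce, evtchn);
+
+    return xc_suspend_evtchn_release(xch, xce, domid, evtchn, &lockfd);
+}
+#endif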
PKG_CONFIG_VERSION := $(MAJOR).$(MINOR)
PKG_CONFIG_USELIBS := $(SHLIB_libxen$(LIBNAME))
PKG_CONFIG_LIB := xen$(LIBNAME)
-PKG_CONFIG_REQPRIV := $(subst $(space),$(comma),$(strip $(foreach lib,$(USELIBS_$(LIBNAME)),xen$(lib))))
+PKG_CONFIG_REQPRIV := $(subst $(space),$(comma),$(strip $(foreach lib,$(patsubst ctrl,control,$(USELIBS_$(LIBNAME))),xen$(lib))))
ifneq ($(CONFIG_LIBXC_MINIOS),y)
PKG_CONFIG_INST := $(PKG_CONFIG)
USELIBS_hypfs := toollog toolcore call
LIBS_LIBS += ctrl
USELIBS_ctrl := toollog call evtchn gnttab foreignmemory devicemodel
+LIBS_LIBS += guest
+USELIBS_guest := evtchn ctrl
+++ /dev/null
-Note that the only valid version of the LGPL as far as the files in
-this directory (and its subdirectories) are concerned is _this_
-particular version of the license (i.e., *only* v2.1, not v2.2 or v3.x
-or whatever), unless explicitly otherwise stated.
-
-Where clause 3 is invoked in order to relicense under the GPL then
-this shall be considered to be GPL v2 only for files which have
-specified LGPL v2.1 only.
-
- GNU LESSER GENERAL PUBLIC LICENSE
- Version 2.1, February 1999
-
- Copyright (C) 1991, 1999 Free Software Foundation, Inc.
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-[This is the first released version of the Lesser GPL. It also counts
- as the successor of the GNU Library Public License, version 2, hence
- the version number 2.1.]
-
- Preamble
-
- The licenses for most software are designed to take away your
-freedom to share and change it. By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-
- This license, the Lesser General Public License, applies to some
-specially designated software packages--typically libraries--of the
-Free Software Foundation and other authors who decide to use it. You
-can use it too, but we suggest you first think carefully about whether
-this license or the ordinary General Public License is the better
-strategy to use in any particular case, based on the explanations below.
-
- When we speak of free software, we are referring to freedom of use,
-not price. Our General Public Licenses are designed to make sure that
-you have the freedom to distribute copies of free software (and charge
-for this service if you wish); that you receive source code or can get
-it if you want it; that you can change the software and use pieces of
-it in new free programs; and that you are informed that you can do
-these things.
-
- To protect your rights, we need to make restrictions that forbid
-distributors to deny you these rights or to ask you to surrender these
-rights. These restrictions translate to certain responsibilities for
-you if you distribute copies of the library or if you modify it.
-
- For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you. You must make sure that they, too, receive or can get the source
-code. If you link other code with the library, you must provide
-complete object files to the recipients, so that they can relink them
-with the library after making changes to the library and recompiling
-it. And you must show them these terms so they know their rights.
-
- We protect your rights with a two-step method: (1) we copyright the
-library, and (2) we offer you this license, which gives you legal
-permission to copy, distribute and/or modify the library.
-
- To protect each distributor, we want to make it very clear that
-there is no warranty for the free library. Also, if the library is
-modified by someone else and passed on, the recipients should know
-that what they have is not the original version, so that the original
-author's reputation will not be affected by problems that might be
-introduced by others.
-\f
- Finally, software patents pose a constant threat to the existence of
-any free program. We wish to make sure that a company cannot
-effectively restrict the users of a free program by obtaining a
-restrictive license from a patent holder. Therefore, we insist that
-any patent license obtained for a version of the library must be
-consistent with the full freedom of use specified in this license.
-
- Most GNU software, including some libraries, is covered by the
-ordinary GNU General Public License. This license, the GNU Lesser
-General Public License, applies to certain designated libraries, and
-is quite different from the ordinary General Public License. We use
-this license for certain libraries in order to permit linking those
-libraries into non-free programs.
-
- When a program is linked with a library, whether statically or using
-a shared library, the combination of the two is legally speaking a
-combined work, a derivative of the original library. The ordinary
-General Public License therefore permits such linking only if the
-entire combination fits its criteria of freedom. The Lesser General
-Public License permits more lax criteria for linking other code with
-the library.
-
- We call this license the "Lesser" General Public License because it
-does Less to protect the user's freedom than the ordinary General
-Public License. It also provides other free software developers Less
-of an advantage over competing non-free programs. These disadvantages
-are the reason we use the ordinary General Public License for many
-libraries. However, the Lesser license provides advantages in certain
-special circumstances.
-
- For example, on rare occasions, there may be a special need to
-encourage the widest possible use of a certain library, so that it becomes
-a de-facto standard. To achieve this, non-free programs must be
-allowed to use the library. A more frequent case is that a free
-library does the same job as widely used non-free libraries. In this
-case, there is little to gain by limiting the free library to free
-software only, so we use the Lesser General Public License.
-
- In other cases, permission to use a particular library in non-free
-programs enables a greater number of people to use a large body of
-free software. For example, permission to use the GNU C Library in
-non-free programs enables many more people to use the whole GNU
-operating system, as well as its variant, the GNU/Linux operating
-system.
-
- Although the Lesser General Public License is Less protective of the
-users' freedom, it does ensure that the user of a program that is
-linked with the Library has the freedom and the wherewithal to run
-that program using a modified version of the Library.
-
- The precise terms and conditions for copying, distribution and
-modification follow. Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library". The
-former contains code derived from the library, whereas the latter must
-be combined with the library in order to run.
-\f
- GNU LESSER GENERAL PUBLIC LICENSE
- TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
- 0. This License Agreement applies to any software library or other
-program which contains a notice placed by the copyright holder or
-other authorized party saying it may be distributed under the terms of
-this Lesser General Public License (also called "this License").
-Each licensee is addressed as "you".
-
- A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-
- The "Library", below, refers to any such software library or work
-which has been distributed under these terms. A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language. (Hereinafter, translation is
-included without limitation in the term "modification".)
-
- "Source code" for a work means the preferred form of the work for
-making modifications to it. For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-
- Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope. The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it). Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-
- 1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-
- You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-\f
- 2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
- a) The modified work must itself be a software library.
-
- b) You must cause the files modified to carry prominent notices
- stating that you changed the files and the date of any change.
-
- c) You must cause the whole of the work to be licensed at no
- charge to all third parties under the terms of this License.
-
- d) If a facility in the modified Library refers to a function or a
- table of data to be supplied by an application program that uses
- the facility, other than as an argument passed when the facility
- is invoked, then you must make a good faith effort to ensure that,
- in the event an application does not supply such function or
- table, the facility still operates, and performs whatever part of
- its purpose remains meaningful.
-
- (For example, a function in a library to compute square roots has
- a purpose that is entirely well-defined independent of the
- application. Therefore, Subsection 2d requires that any
- application-supplied function or table used by this function must
- be optional: if the application does not supply it, the square
- root function must still compute square roots.)
-
-These requirements apply to the modified work as a whole. If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works. But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
- 3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library. To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License. (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.) Do not make any other change in
-these notices.
-\f
- Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-
- This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-
- 4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-
- If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
- 5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library". Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-
- However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library". The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-
- When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library. The
-threshold for this to be true is not precisely defined by law.
-
- If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work. (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-
- Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-\f
- 6. As an exception to the Sections above, you may also combine or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-
- You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License. You must supply a copy of this License. If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License. Also, you must do one
-of these things:
-
- a) Accompany the work with the complete corresponding
- machine-readable source code for the Library including whatever
- changes were used in the work (which must be distributed under
- Sections 1 and 2 above); and, if the work is an executable linked
- with the Library, with the complete machine-readable "work that
- uses the Library", as object code and/or source code, so that the
- user can modify the Library and then relink to produce a modified
- executable containing the modified Library. (It is understood
- that the user who changes the contents of definitions files in the
- Library will not necessarily be able to recompile the application
- to use the modified definitions.)
-
- b) Use a suitable shared library mechanism for linking with the
- Library. A suitable mechanism is one that (1) uses at run time a
- copy of the library already present on the user's computer system,
- rather than copying library functions into the executable, and (2)
- will operate properly with a modified version of the library, if
- the user installs one, as long as the modified version is
- interface-compatible with the version that the work was made with.
-
- c) Accompany the work with a written offer, valid for at
- least three years, to give the same user the materials
- specified in Subsection 6a, above, for a charge no more
- than the cost of performing this distribution.
-
- d) If distribution of the work is made by offering access to copy
- from a designated place, offer equivalent access to copy the above
- specified materials from the same place.
-
- e) Verify that the user has already received a copy of these
- materials or that you have already sent this user a copy.
-
- For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it. However, as a special exception,
-the materials to be distributed need not include anything that is
-normally distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-
- It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system. Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-\f
- 7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-
- a) Accompany the combined library with a copy of the same work
- based on the Library, uncombined with any other library
- facilities. This must be distributed under the terms of the
- Sections above.
-
- b) Give prominent notice with the combined library of the fact
- that part of it is a work based on the Library, and explaining
- where to find the accompanying uncombined form of the same work.
-
- 8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License. Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License. However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-
- 9. You are not required to accept this License, since you have not
-signed it. However, nothing else grants you permission to modify or
-distribute the Library or its derivative works. These actions are
-prohibited by law if you do not accept this License. Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-
- 10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions. You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties with
-this License.
-\f
- 11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all. For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices. Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
- 12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded. In such case, this License incorporates the limitation as if
-written in the body of this License.
-
- 13. The Free Software Foundation may publish revised and/or new
-versions of the Lesser General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-
-Each version is given a distinguishing version number. If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation. If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-\f
- 14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission. For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this. Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-
- NO WARRANTY
-
- 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-
- END OF TERMS AND CONDITIONS
+++ /dev/null
-XEN_ROOT = $(CURDIR)/../..
-include $(XEN_ROOT)/tools/Rules.mk
-
-MAJOR = 4.15
-MINOR = 0
-
-ifeq ($(CONFIG_LIBXC_MINIOS),y)
-# Save/restore of a domain is currently incompatible with a stubdom environment
-override CONFIG_MIGRATE := n
-endif
-
-LINK_FILES := xc_private.h xc_core.h xc_core_x86.h xc_core_arm.h xc_bitops.h
-
-$(LINK_FILES):
- ln -sf $(XEN_ROOT)/tools/libs/ctrl/$(notdir $@) $@
-
-GUEST_SRCS-y :=
-GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-y += xg_domain.c
-GUEST_SRCS-y += xg_suspend.c
-ifeq ($(CONFIG_MIGRATE),y)
-GUEST_SRCS-y += xg_sr_common.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_common_x86.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_common_x86_pv.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_restore_x86_pv.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_restore_x86_hvm.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_save_x86_pv.c
-GUEST_SRCS-$(CONFIG_X86) += xg_sr_save_x86_hvm.c
-GUEST_SRCS-y += xg_sr_restore.c
-GUEST_SRCS-y += xg_sr_save.c
-GUEST_SRCS-y += xg_offline_page.c
-else
-GUEST_SRCS-y += xg_nomigrate.c
-endif
-
-vpath %.c ../../xen/common/libelf
-CFLAGS += -I../../xen/common/libelf
-
-ELF_SRCS-y += libelf-tools.c libelf-loader.c
-ELF_SRCS-y += libelf-dominfo.c
-
-GUEST_SRCS-y += $(ELF_SRCS-y)
-
-$(patsubst %.c,%.o,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign
-$(patsubst %.c,%.opic,$(ELF_SRCS-y)): CFLAGS += -Wno-pointer-sign
-
-ifeq ($(CONFIG_X86),y) # Add libx86 to the build
-vpath %.c ../../xen/lib/x86
-
-GUEST_SRCS-y += cpuid.c msr.c
-endif
-
-# new domain builder
-GUEST_SRCS-y += xg_dom_core.c
-GUEST_SRCS-y += xg_dom_boot.c
-GUEST_SRCS-y += xg_dom_elfloader.c
-GUEST_SRCS-$(CONFIG_X86) += xg_dom_bzimageloader.c
-GUEST_SRCS-$(CONFIG_X86) += xg_dom_decompress_lz4.c
-GUEST_SRCS-$(CONFIG_X86) += xg_dom_hvmloader.c
-GUEST_SRCS-$(CONFIG_ARM) += xg_dom_armzimageloader.c
-GUEST_SRCS-y += xg_dom_binloader.c
-GUEST_SRCS-y += xg_dom_compat_linux.c
-
-GUEST_SRCS-$(CONFIG_X86) += xg_dom_x86.c
-GUEST_SRCS-$(CONFIG_X86) += xg_cpuid_x86.c
-GUEST_SRCS-$(CONFIG_ARM) += xg_dom_arm.c
-
-ifeq ($(CONFIG_LIBXC_MINIOS),y)
-GUEST_SRCS-y += xg_dom_decompress_unsafe.c
-GUEST_SRCS-y += xg_dom_decompress_unsafe_bzip2.c
-GUEST_SRCS-y += xg_dom_decompress_unsafe_lzma.c
-GUEST_SRCS-y += xg_dom_decompress_unsafe_lzo1x.c
-GUEST_SRCS-y += xg_dom_decompress_unsafe_xz.c
-endif
-
--include $(XEN_TARGET_ARCH)/Makefile
-
-CFLAGS += -Werror -Wmissing-prototypes
-CFLAGS += -I. -I./include $(CFLAGS_xeninclude)
-CFLAGS += -D__XEN_TOOLS__
-
-# Needed for posix_fadvise64() in xc_linux.c
-CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE
-
-CFLAGS += $(PTHREAD_CFLAGS)
-CFLAGS += $(CFLAGS_libxentoollog)
-CFLAGS += $(CFLAGS_libxenevtchn)
-CFLAGS += $(CFLAGS_libxendevicemodel)
-
-GUEST_LIB_OBJS := $(patsubst %.c,%.o,$(GUEST_SRCS-y))
-GUEST_PIC_OBJS := $(patsubst %.c,%.opic,$(GUEST_SRCS-y))
-
-$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): CFLAGS += -include $(XEN_ROOT)/tools/config.h
-
-# libxenguest includes xc_private.h, so needs this despite not using
-# this functionality directly.
-$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): CFLAGS += $(CFLAGS_libxencall) $(CFLAGS_libxenforeignmemory)
-
-LIB += libxenguest.a
-ifneq ($(nosharedlibs),y)
-LIB += libxenguest.so libxenguest.so.$(MAJOR) libxenguest.so.$(MAJOR).$(MINOR)
-endif
-
-genpath-target = $(call buildmakevars2header,_paths.h)
-$(eval $(genpath-target))
-
-xc_private.h: _paths.h
-
-$(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS): $(LINK_FILES)
-
-PKG_CONFIG := xenguest.pc
-PKG_CONFIG_VERSION := $(MAJOR).$(MINOR)
-
-xenguest.pc: PKG_CONFIG_NAME = Xenguest
-xenguest.pc: PKG_CONFIG_DESC = The Xenguest library for Xen hypervisor
-xenguest.pc: PKG_CONFIG_USELIBS = $(SHLIB_libxenguest)
-xenguest.pc: PKG_CONFIG_LIB = xenguest
-xenguest.pc: PKG_CONFIG_REQPRIV = xentoollog,xencall,xenforeignmemory,xenevtchn
-
-$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_NAME = Xenguest
-$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_DESC = The Xenguest library for Xen hypervisor
-$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_USELIBS = $(SHLIB_libxenguest)
-$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_LIB = xenguest
-$(PKG_CONFIG_DIR)/xenguest.pc: PKG_CONFIG_REQPRIV = xentoollog,xencall,xenforeignmemory,xenevtchn,xencontrol
-
-ifneq ($(CONFIG_LIBXC_MINIOS),y)
-PKG_CONFIG_INST := $(PKG_CONFIG)
-$(PKG_CONFIG_INST): PKG_CONFIG_PREFIX = $(prefix)
-$(PKG_CONFIG_INST): PKG_CONFIG_INCDIR = $(includedir)
-$(PKG_CONFIG_INST): PKG_CONFIG_LIBDIR = $(libdir)
-endif
-
-PKG_CONFIG_LOCAL := $(foreach pc,$(PKG_CONFIG),$(PKG_CONFIG_DIR)/$(pc))
-
-$(PKG_CONFIG_LOCAL): PKG_CONFIG_PREFIX = $(XEN_ROOT)
-$(PKG_CONFIG_LOCAL): PKG_CONFIG_INCDIR = $(XEN_libxenctrl)/include
-$(PKG_CONFIG_LOCAL): PKG_CONFIG_LIBDIR = $(CURDIR)
-$(PKG_CONFIG_LOCAL): PKG_CONFIG_CFLAGS_LOCAL = $(CFLAGS_xeninclude)
-
-.PHONY: all
-all: build
-
-.PHONY: build
-build:
- $(MAKE) libs
-
-.PHONY: libs
-libs: $(LIB) $(PKG_CONFIG_INST) $(PKG_CONFIG_LOCAL)
-
-.PHONY: install
-install: build
- $(INSTALL_DIR) $(DESTDIR)$(libdir)
- $(INSTALL_DIR) $(DESTDIR)$(includedir)
- $(INSTALL_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
- $(INSTALL_DATA) libxenguest.a $(DESTDIR)$(libdir)
- $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR)
- $(SYMLINK_SHLIB) libxenguest.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenguest.so
- $(INSTALL_DATA) include/xenguest.h $(DESTDIR)$(includedir)
- $(INSTALL_DATA) xenguest.pc $(DESTDIR)$(PKG_INSTALLDIR)
-
-.PHONY: uninstall
-uninstall:
- rm -f $(DESTDIR)$(PKG_INSTALLDIR)/xenguest.pc
- rm -f $(DESTDIR)$(includedir)/xenguest.h
- rm -f $(DESTDIR)$(libdir)/libxenguest.so
- rm -f $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR)
- rm -f $(DESTDIR)$(libdir)/libxenguest.so.$(MAJOR).$(MINOR)
- rm -f $(DESTDIR)$(libdir)/libxenguest.a
-
-.PHONY: TAGS
-TAGS:
- etags -t *.c *.h
-
-.PHONY: clean
-clean:
- rm -rf *.rpm $(LIB) *~ $(DEPS_RM) \
- _paths.h \
- $(LINK_FILES) \
- xenguest.pc \
- $(GUEST_LIB_OBJS) $(GUEST_PIC_OBJS)
-
-.PHONY: distclean
-distclean: clean
-
-.PHONY: rpm
-rpm: build
- rm -rf staging
- mkdir staging
- mkdir staging/i386
- rpmbuild --define "staging$$PWD/staging" --define '_builddir.' \
- --define "_rpmdir$$PWD/staging" -bb rpm.spec
- mv staging/i386/*.rpm .
- rm -rf staging
-
-# libxenguest
-
-libxenguest.a: $(GUEST_LIB_OBJS)
- $(AR) rc $@ $^
-
-libxenguest.so: libxenguest.so.$(MAJOR)
- $(SYMLINK_SHLIB) $< $@
-libxenguest.so.$(MAJOR): libxenguest.so.$(MAJOR).$(MINOR)
- $(SYMLINK_SHLIB) $< $@
-
-ifeq ($(CONFIG_MiniOS),y)
-zlib-options =
-else
-zlib-options = $(ZLIB)
-endif
-
-xg_dom_bzimageloader.o: CFLAGS += $(filter -D%,$(zlib-options))
-xg_dom_bzimageloader.opic: CFLAGS += $(filter -D%,$(zlib-options))
-
-libxenguest.so.$(MAJOR).$(MINOR): COMPRESSION_LIBS = $(filter -l%,$(zlib-options))
-libxenguest.so.$(MAJOR).$(MINOR): $(GUEST_PIC_OBJS)
- $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxenguest.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $(GUEST_PIC_OBJS) $(COMPRESSION_LIBS) -lz $(LDLIBS_libxenevtchn) $(LDLIBS_libxenctrl) $(PTHREAD_LIBS) $(APPEND_LDFLAGS)
-
--include $(DEPS_INCLUDE)
-
+++ /dev/null
-/******************************************************************************
- * xenguest.h
- *
- * A library for guest domain management in Xen.
- *
- * Copyright (c) 2003-2004, K A Fraser.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef XENGUEST_H
-#define XENGUEST_H
-
-#include <xenctrl_dom.h>
-
-#define XC_NUMA_NO_NODE (~0U)
-
-#define XCFLAGS_LIVE (1 << 0)
-#define XCFLAGS_DEBUG (1 << 1)
-
-#define X86_64_B_SIZE 64
-#define X86_32_B_SIZE 32
-
-/*
- * Users not using xc_suspend_* / xc_await_suspend may not want to
- * include the full libxenevtchn API here.
- */
-struct xenevtchn_handle;
-
-/* For save's precopy_policy(). */
-struct precopy_stats
-{
- unsigned int iteration;
- unsigned int total_written;
- long dirty_count; /* -1 if unknown */
-};
-
-/*
- * A precopy_policy callback may not be running in the same address
- * space as libxc and so precopy_stats is passed by value.
- */
-typedef int (*precopy_policy_t)(struct precopy_stats, void *);
-
-/* callbacks provided by xc_domain_save */
-struct save_callbacks {
- /*
- * Called after expiration of checkpoint interval,
- * to suspend the guest.
- */
- int (*suspend)(void *data);
-
- /*
- * Called before and after every batch of page data sent during
- * the precopy phase of a live migration to ask the caller what
- * to do next based on the current state of the precopy migration.
- *
- * Should return one of the values listed below:
- */
-#define XGS_POLICY_ABORT (-1) /* Abandon the migration entirely
- * and tidy up. */
-#define XGS_POLICY_CONTINUE_PRECOPY 0 /* Remain in the precopy phase. */
-#define XGS_POLICY_STOP_AND_COPY 1 /* Immediately suspend and transmit the
- * remaining dirty pages. */
- precopy_policy_t precopy_policy;
-
- /*
- * Called after the guest's dirty pages have been
- * copied into an output buffer.
- * Callback function resumes the guest & the device model,
- * returns to xc_domain_save.
- * xc_domain_save then flushes the output buffer, while the
- * guest continues to run.
- */
- int (*postcopy)(void *data);
-
- /*
- * Called after the memory checkpoint has been flushed
- * out into the network. Typical actions performed in this
- * callback include:
- * (a) send the saved device model state (for HVM guests),
- * (b) wait for checkpoint ack
- * (c) release the network output buffer pertaining to the acked checkpoint.
-     *   (d) sleep for the checkpoint interval.
- *
- * returns:
- * 0: terminate checkpointing gracefully
- * 1: take another checkpoint
- */
- int (*checkpoint)(void *data);
-
- /*
- * Called after the checkpoint callback.
- *
- * returns:
- * 0: terminate checkpointing gracefully
- * 1: take another checkpoint
- */
- int (*wait_checkpoint)(void *data);
-
- /* Enable qemu-dm logging dirty pages to xen */
- int (*switch_qemu_logdirty)(uint32_t domid, unsigned enable, void *data); /* HVM only */
-
- /* to be provided as the last argument to each callback function */
- void *data;
-};
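For illustration, a minimal sketch (not part of this patch) of a precopy_policy callback honouring the XGS_POLICY_* protocol above; the iteration cap and dirty-page threshold are invented values:

    /* Hypothetical policy: stop iterating after 5 passes, or once
     * fewer than 50 pages remain dirty (dirty_count may be -1). */
    static int my_precopy_policy(struct precopy_stats stats, void *user)
    {
        (void)user; /* unused in this sketch */

        if ( stats.iteration >= 5 )
            return XGS_POLICY_STOP_AND_COPY;
        if ( stats.dirty_count >= 0 && stats.dirty_count < 50 )
            return XGS_POLICY_STOP_AND_COPY;
        return XGS_POLICY_CONTINUE_PRECOPY;
    }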
-
-/* Type of stream. Plain, or using a continuous replication protocol? */
-typedef enum {
- XC_STREAM_PLAIN,
- XC_STREAM_REMUS,
- XC_STREAM_COLO,
-} xc_stream_type_t;
-
-/**
- * This function will save a running domain.
- *
- * @param xch a handle to an open hypervisor interface
- * @param io_fd the file descriptor to save a domain to
- * @param dom the id of the domain
- * @param flags XCFLAGS_xxx
- * @param stream_type XC_STREAM_PLAIN if the far end of the stream
- * doesn't use checkpointing
- * @param recv_fd Only used for XC_STREAM_COLO. Contains backchannel from
- * the destination side.
- * @return 0 on success, -1 on failure
- */
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
- uint32_t flags, struct save_callbacks *callbacks,
- xc_stream_type_t stream_type, int recv_fd);
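As a usage illustration only, a plain (non-checkpointed) save might wire the callbacks up as follows; my_suspend, my_logdirty and my_ctx are placeholders, and -1 for recv_fd assumes the COLO backchannel is unused:

    struct save_callbacks cb = {
        .suspend = my_suspend,                /* pauses the guest */
        .precopy_policy = my_precopy_policy,  /* sketched above */
        .switch_qemu_logdirty = my_logdirty,  /* HVM only */
        .data = my_ctx,
    };

    if ( xc_domain_save(xch, out_fd, domid, 0 /* flags */, &cb,
                        XC_STREAM_PLAIN, -1 /* recv_fd: COLO only */) )
        /* handle failure */;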
-
-/* callbacks provided by xc_domain_restore */
-struct restore_callbacks {
- /*
- * Called once the STATIC_DATA_END record has been received/inferred.
- *
- * For compatibility with older streams, provides a list of static data
- * expected to be found in the stream but missing. A higher level
- * toolstack is responsible for providing any necessary compatibility.
- */
-#define XGR_SDD_MISSING_CPUID (1 << 0)
-#define XGR_SDD_MISSING_MSR (1 << 1)
- int (*static_data_done)(unsigned int missing, void *data);
-
- /* Called after a new checkpoint to suspend the guest. */
- int (*suspend)(void *data);
-
- /*
- * Called after the secondary VM is ready to resume.
- * Callback function resumes the guest & the device model,
- * returns to xc_domain_restore.
- */
- int (*postcopy)(void *data);
-
- /*
- * A checkpoint record has been found in the stream.
- * returns:
- */
-#define XGR_CHECKPOINT_ERROR 0 /* Terminate processing */
-#define XGR_CHECKPOINT_SUCCESS 1 /* Continue reading more data from the stream */
-#define XGR_CHECKPOINT_FAILOVER 2 /* Failover and resume VM */
- int (*checkpoint)(void *data);
-
- /*
- * Called after the checkpoint callback.
- *
- * returns:
- * 0: terminate checkpointing gracefully
- * 1: take another checkpoint
- */
- int (*wait_checkpoint)(void *data);
-
- /*
- * Callback to send the store gfn and console gfn to xl
- * if we want to resume the VM before xc_domain_restore()
- * exits.
- */
- void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
- void *data);
-
- /* to be provided as the last argument to each callback function */
- void *data;
-};
-
-/**
- * This function will restore a saved domain.
- *
- * Domain is restored in a suspended state ready to be unpaused.
- *
- * @param xch a handle to an open hypervisor interface
- * @param io_fd the file descriptor to restore a domain from
- * @param dom the id of the domain
- * @param store_evtchn the xenstore event channel for this domain to use
- * @param store_mfn filled with the gfn of the store page
- * @param store_domid the backend domain for xenstore
- * @param console_evtchn the console event channel for this domain to use
- * @param console_mfn filled with the gfn of the console page
- * @param console_domid the backend domain for xenconsole
- * @param stream_type XC_STREAM_PLAIN if the far end of the stream
- *                    doesn't use checkpointing
- * @param callbacks non-NULL to receive a callback to restore toolstack
- * specific data
- * @param send_back_fd Only used for XC_STREAM_COLO. Contains backchannel to
- * the source side.
- * @return 0 on success, -1 on failure
- */
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
- unsigned int store_evtchn, unsigned long *store_mfn,
- uint32_t store_domid, unsigned int console_evtchn,
- unsigned long *console_mfn, uint32_t console_domid,
- xc_stream_type_t stream_type,
- struct restore_callbacks *callbacks, int send_back_fd);
-
-/**
- * This function will create a domain for a paravirtualized Linux
- * using file names pointing to the kernel and ramdisk.
- *
- * @parm xch a handle to an open hypervisor interface
- * @parm domid the id of the domain
- * @parm mem_mb memory size in megabytes
- * @parm image_name name of the kernel image file
- * @parm ramdisk_name name of the ramdisk image file
- * @parm cmdline command line string
- * @parm features the features string for the new domain
- * @parm flags domain creation flags
- * @parm store_evtchn the store event channel for this domain to use
- * @parm store_mfn returned with the mfn of the store page
- * @parm console_evtchn the console event channel for this domain to use
- * @parm console_mfn returned with the mfn of the console page
- * @return 0 on success, -1 on failure
- */
-int xc_linux_build(xc_interface *xch,
- uint32_t domid,
- unsigned int mem_mb,
- const char *image_name,
- const char *ramdisk_name,
- const char *cmdline,
- const char *features,
- unsigned long flags,
- unsigned int store_evtchn,
- unsigned long *store_mfn,
- unsigned int console_evtchn,
- unsigned long *console_mfn);
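A sketch of a PV build call (paths, sizes and event channels are made-up placeholders; NULL for features is assumed to mean "no extra features"):

    unsigned long store_mfn, console_mfn;
    int rc;

    rc = xc_linux_build(xch, domid, 512 /* MB */,
                        "/path/to/vmlinuz", "/path/to/initrd.img",
                        "root=/dev/xvda1 console=hvc0",
                        NULL /* features */, 0 /* flags */,
                        store_evtchn, &store_mfn,
                        console_evtchn, &console_mfn);
    if ( rc )
        /* the caller is expected to destroy the partly-built domain */;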
-
-/*
- * Sets *lockfd to -1.
- * Everything has been deallocated, even on error.
- */
-int xc_suspend_evtchn_release(xc_interface *xch,
- struct xenevtchn_handle *xce,
- uint32_t domid, int suspend_evtchn, int *lockfd);
-
-/**
- * This function eats the initial notification.
- * xce must not be used for anything else.
- * See xc_suspend_evtchn_init_sane regarding lockfd.
- */
-int xc_suspend_evtchn_init_exclusive(xc_interface *xch,
- struct xenevtchn_handle *xce,
- uint32_t domid, int port, int *lockfd);
-
-/* xce must not be used for anything else */
-int xc_await_suspend(xc_interface *xch, struct xenevtchn_handle *xce,
- int suspend_evtchn);
-
-/**
- * The port will be signaled immediately after this call.
- * The caller should check the domain status and look for the next event.
- * On success, *lockfd will be set to >=0 and *lockfd must be preserved
- * and fed to xc_suspend_evtchn_release. (On error *lockfd is
- * undefined and xc_suspend_evtchn_release is not allowed.)
- */
-int xc_suspend_evtchn_init_sane(xc_interface *xch,
- struct xenevtchn_handle *xce,
- uint32_t domid, int port, int *lockfd);
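Taken together, the intended lifecycle is roughly as follows (a sketch with error handling trimmed; xce and port come from the caller, and the init call is assumed to return the bound suspend event channel on success):

    int lockfd, suspend_evtchn;

    suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port,
                                                 &lockfd);
    if ( suspend_evtchn < 0 )
        return -1;              /* *lockfd undefined: do not release */

    xc_await_suspend(xch, xce, suspend_evtchn);

    /* ... save or migrate the domain ... */

    xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, &lockfd);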
-
-int xc_mark_page_online(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status);
-
-int xc_mark_page_offline(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status);
-
-int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status);
-
-int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn);
-
-
-/**
- * Memory-related information, such as PFN types, the P2M table,
- * the guest word width and the guest page table levels.
- */
-struct xc_domain_meminfo {
- unsigned int pt_levels;
- unsigned int guest_width;
- xen_pfn_t *pfn_type;
- xen_pfn_t *p2m_table;
- unsigned long p2m_size;
-};
-
-int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid,
- struct xc_domain_meminfo *minfo);
-
-int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *mem);
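A sketch of the intended map/inspect/unmap pairing, assuming an open handle xch and a valid domid:

    struct xc_domain_meminfo minfo = { 0 };

    if ( xc_map_domain_meminfo(xch, domid, &minfo) )
        return -1;

    /* minfo.p2m_size entries in minfo.pfn_type / minfo.p2m_table */

    xc_unmap_domain_meminfo(xch, &minfo);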
-
-/**
- * This function maps the m2p table.
- * @parm xch a handle to an open hypervisor interface
- * @parm max_mfn the maximum mfn to cover
- * @parm prot the protection flags to map with, such as read/write
- * @parm mfn0 returns the first mfn; can be NULL
- * @return mapped m2p table on success, NULL on failure
- */
-xen_pfn_t *xc_map_m2p(xc_interface *xch,
- unsigned long max_mfn,
- int prot,
- unsigned long *mfn0);
-#endif /* XENGUEST_H */
+++ /dev/null
-/******************************************************************************
- * xc_cpuid_x86.c
- *
- * Compute cpuid of a domain.
- *
- * Copyright (c) 2008, Citrix Systems, Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <stdlib.h>
-#include <stdbool.h>
-#include <limits.h>
-#include "xc_private.h"
-#include "xc_bitops.h"
-#include <xen/hvm/params.h>
-#include <xen-tools/libs.h>
-
-enum {
-#define XEN_CPUFEATURE(name, value) X86_FEATURE_##name = value,
-#include <xen/arch-x86/cpufeatureset.h>
-};
-
-#include <xen/asm/x86-vendors.h>
-
-#include <xen/lib/x86/cpu-policy.h>
-
-#define bitmaskof(idx) (1u << ((idx) & 31))
-#define featureword_of(idx) ((idx) >> 5)
-
-int xc_get_cpu_levelling_caps(xc_interface *xch, uint32_t *caps)
-{
- DECLARE_SYSCTL;
- int ret;
-
- sysctl.cmd = XEN_SYSCTL_get_cpu_levelling_caps;
- ret = do_sysctl(xch, &sysctl);
-
- if ( !ret )
- *caps = sysctl.u.cpu_levelling_caps.caps;
-
- return ret;
-}
-
-int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
- uint32_t *nr_features, uint32_t *featureset)
-{
- DECLARE_SYSCTL;
- DECLARE_HYPERCALL_BOUNCE(featureset,
- *nr_features * sizeof(*featureset),
- XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- int ret;
-
- if ( xc_hypercall_bounce_pre(xch, featureset) )
- return -1;
-
- sysctl.cmd = XEN_SYSCTL_get_cpu_featureset;
- sysctl.u.cpu_featureset.index = index;
- sysctl.u.cpu_featureset.nr_features = *nr_features;
- set_xen_guest_handle(sysctl.u.cpu_featureset.features, featureset);
-
- ret = do_sysctl(xch, &sysctl);
-
- xc_hypercall_bounce_post(xch, featureset);
-
- if ( !ret )
- *nr_features = sysctl.u.cpu_featureset.nr_features;
-
- return ret;
-}
-
-uint32_t xc_get_cpu_featureset_size(void)
-{
- return FEATURESET_NR_ENTRIES;
-}
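The usual calling pattern (a sketch, not from this patch) sizes the buffer with xc_get_cpu_featureset_size() and then fetches, shown here for the host featureset:

    uint32_t nr = xc_get_cpu_featureset_size();
    uint32_t *fs = calloc(nr, sizeof(*fs));

    if ( fs && !xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
                                      &nr, fs) )
    {
        /* nr now holds the number of entries actually written */
    }
    free(fs);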
-
-const uint32_t *xc_get_static_cpu_featuremask(
- enum xc_static_cpu_featuremask mask)
-{
- static const uint32_t masks[][FEATURESET_NR_ENTRIES] = {
-#define MASK(x) [XC_FEATUREMASK_ ## x] = INIT_ ## x ## _FEATURES
-
- MASK(KNOWN),
- MASK(SPECIAL),
- MASK(PV_MAX),
- MASK(PV_DEF),
- MASK(HVM_SHADOW_MAX),
- MASK(HVM_SHADOW_DEF),
- MASK(HVM_HAP_MAX),
- MASK(HVM_HAP_DEF),
-
-#undef MASK
- };
-
- if ( (unsigned int)mask >= ARRAY_SIZE(masks) )
- return NULL;
-
- return masks[mask];
-}
-
-int xc_get_cpu_policy_size(xc_interface *xch, uint32_t *nr_leaves,
- uint32_t *nr_msrs)
-{
- struct xen_sysctl sysctl = {};
- int ret;
-
- sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
-
- ret = do_sysctl(xch, &sysctl);
-
- if ( !ret )
- {
- *nr_leaves = sysctl.u.cpu_policy.nr_leaves;
- *nr_msrs = sysctl.u.cpu_policy.nr_msrs;
- }
-
- return ret;
-}
-
-int xc_get_system_cpu_policy(xc_interface *xch, uint32_t index,
- uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves,
- uint32_t *nr_msrs, xen_msr_entry_t *msrs)
-{
- struct xen_sysctl sysctl = {};
- DECLARE_HYPERCALL_BOUNCE(leaves,
- *nr_leaves * sizeof(*leaves),
- XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- DECLARE_HYPERCALL_BOUNCE(msrs,
- *nr_msrs * sizeof(*msrs),
- XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- int ret;
-
- if ( xc_hypercall_bounce_pre(xch, leaves) ||
- xc_hypercall_bounce_pre(xch, msrs) )
- return -1;
-
- sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
- sysctl.u.cpu_policy.index = index;
- sysctl.u.cpu_policy.nr_leaves = *nr_leaves;
- set_xen_guest_handle(sysctl.u.cpu_policy.cpuid_policy, leaves);
- sysctl.u.cpu_policy.nr_msrs = *nr_msrs;
- set_xen_guest_handle(sysctl.u.cpu_policy.msr_policy, msrs);
-
- ret = do_sysctl(xch, &sysctl);
-
- xc_hypercall_bounce_post(xch, leaves);
- xc_hypercall_bounce_post(xch, msrs);
-
- if ( !ret )
- {
- *nr_leaves = sysctl.u.cpu_policy.nr_leaves;
- *nr_msrs = sysctl.u.cpu_policy.nr_msrs;
- }
-
- return ret;
-}
-
-int xc_get_domain_cpu_policy(xc_interface *xch, uint32_t domid,
- uint32_t *nr_leaves, xen_cpuid_leaf_t *leaves,
- uint32_t *nr_msrs, xen_msr_entry_t *msrs)
-{
- DECLARE_DOMCTL;
- DECLARE_HYPERCALL_BOUNCE(leaves,
- *nr_leaves * sizeof(*leaves),
- XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- DECLARE_HYPERCALL_BOUNCE(msrs,
- *nr_msrs * sizeof(*msrs),
- XC_HYPERCALL_BUFFER_BOUNCE_OUT);
- int ret;
-
- if ( xc_hypercall_bounce_pre(xch, leaves) ||
- xc_hypercall_bounce_pre(xch, msrs) )
- return -1;
-
- domctl.cmd = XEN_DOMCTL_get_cpu_policy;
- domctl.domain = domid;
- domctl.u.cpu_policy.nr_leaves = *nr_leaves;
- set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
- domctl.u.cpu_policy.nr_msrs = *nr_msrs;
- set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
-
- ret = do_domctl(xch, &domctl);
-
- xc_hypercall_bounce_post(xch, leaves);
- xc_hypercall_bounce_post(xch, msrs);
-
- if ( !ret )
- {
- *nr_leaves = domctl.u.cpu_policy.nr_leaves;
- *nr_msrs = domctl.u.cpu_policy.nr_msrs;
- }
-
- return ret;
-}
-
-int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid,
- uint32_t nr_leaves, xen_cpuid_leaf_t *leaves,
- uint32_t nr_msrs, xen_msr_entry_t *msrs,
- uint32_t *err_leaf_p, uint32_t *err_subleaf_p,
- uint32_t *err_msr_p)
-{
- DECLARE_DOMCTL;
- DECLARE_HYPERCALL_BOUNCE(leaves,
- nr_leaves * sizeof(*leaves),
- XC_HYPERCALL_BUFFER_BOUNCE_IN);
- DECLARE_HYPERCALL_BOUNCE(msrs,
- nr_msrs * sizeof(*msrs),
- XC_HYPERCALL_BUFFER_BOUNCE_IN);
- int ret;
-
- if ( err_leaf_p )
- *err_leaf_p = -1;
- if ( err_subleaf_p )
- *err_subleaf_p = -1;
- if ( err_msr_p )
- *err_msr_p = -1;
-
- if ( xc_hypercall_bounce_pre(xch, leaves) )
- return -1;
-
- if ( xc_hypercall_bounce_pre(xch, msrs) )
- return -1;
-
- domctl.cmd = XEN_DOMCTL_set_cpu_policy;
- domctl.domain = domid;
- domctl.u.cpu_policy.nr_leaves = nr_leaves;
- set_xen_guest_handle(domctl.u.cpu_policy.cpuid_policy, leaves);
- domctl.u.cpu_policy.nr_msrs = nr_msrs;
- set_xen_guest_handle(domctl.u.cpu_policy.msr_policy, msrs);
- domctl.u.cpu_policy.err_leaf = -1;
- domctl.u.cpu_policy.err_subleaf = -1;
- domctl.u.cpu_policy.err_msr = -1;
-
- ret = do_domctl(xch, &domctl);
-
- xc_hypercall_bounce_post(xch, leaves);
- xc_hypercall_bounce_post(xch, msrs);
-
- if ( err_leaf_p )
- *err_leaf_p = domctl.u.cpu_policy.err_leaf;
- if ( err_subleaf_p )
- *err_subleaf_p = domctl.u.cpu_policy.err_subleaf;
- if ( err_msr_p )
- *err_msr_p = domctl.u.cpu_policy.err_msr;
-
- return ret;
-}
-
-static int compare_leaves(const void *l, const void *r)
-{
- const xen_cpuid_leaf_t *lhs = l;
- const xen_cpuid_leaf_t *rhs = r;
-
- if ( lhs->leaf != rhs->leaf )
- return lhs->leaf < rhs->leaf ? -1 : 1;
-
- if ( lhs->subleaf != rhs->subleaf )
- return lhs->subleaf < rhs->subleaf ? -1 : 1;
-
- return 0;
-}
-
-static xen_cpuid_leaf_t *find_leaf(
- xen_cpuid_leaf_t *leaves, unsigned int nr_leaves,
- const struct xc_xend_cpuid *xend)
-{
- const xen_cpuid_leaf_t key = { xend->leaf, xend->subleaf };
-
- return bsearch(&key, leaves, nr_leaves, sizeof(*leaves), compare_leaves);
-}
-
-static int xc_cpuid_xend_policy(
- xc_interface *xch, uint32_t domid, const struct xc_xend_cpuid *xend)
-{
- int rc;
- xc_dominfo_t di;
- unsigned int nr_leaves, nr_msrs;
- uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
- /*
-     * Three full policies: the host, domain max, and domain current for the
- * domain type.
- */
- xen_cpuid_leaf_t *host = NULL, *max = NULL, *cur = NULL;
- unsigned int nr_host, nr_max, nr_cur;
-
- if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
- di.domid != domid )
- {
- ERROR("Failed to obtain d%d info", domid);
- rc = -ESRCH;
- goto fail;
- }
-
- rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
- if ( rc )
- {
- PERROR("Failed to obtain policy info size");
- rc = -errno;
- goto fail;
- }
-
- rc = -ENOMEM;
- if ( (host = calloc(nr_leaves, sizeof(*host))) == NULL ||
- (max = calloc(nr_leaves, sizeof(*max))) == NULL ||
- (cur = calloc(nr_leaves, sizeof(*cur))) == NULL )
- {
- ERROR("Unable to allocate memory for %u CPUID leaves", nr_leaves);
- goto fail;
- }
-
- /* Get the domain's current policy. */
- nr_msrs = 0;
- nr_cur = nr_leaves;
- rc = xc_get_domain_cpu_policy(xch, domid, &nr_cur, cur, &nr_msrs, NULL);
- if ( rc )
- {
- PERROR("Failed to obtain d%d current policy", domid);
- rc = -errno;
- goto fail;
- }
-
- /* Get the domain's max policy. */
- nr_msrs = 0;
- nr_max = nr_leaves;
- rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_max
- : XEN_SYSCTL_cpu_policy_pv_max,
- &nr_max, max, &nr_msrs, NULL);
- if ( rc )
- {
- PERROR("Failed to obtain %s max policy", di.hvm ? "hvm" : "pv");
- rc = -errno;
- goto fail;
- }
-
- /* Get the host policy. */
- nr_msrs = 0;
- nr_host = nr_leaves;
- rc = xc_get_system_cpu_policy(xch, XEN_SYSCTL_cpu_policy_host,
- &nr_host, host, &nr_msrs, NULL);
- if ( rc )
- {
- PERROR("Failed to obtain host policy");
- rc = -errno;
- goto fail;
- }
-
- rc = -EINVAL;
- for ( ; xend->leaf != XEN_CPUID_INPUT_UNUSED; ++xend )
- {
- xen_cpuid_leaf_t *cur_leaf = find_leaf(cur, nr_cur, xend);
- const xen_cpuid_leaf_t *max_leaf = find_leaf(max, nr_max, xend);
- const xen_cpuid_leaf_t *host_leaf = find_leaf(host, nr_host, xend);
-
- if ( cur_leaf == NULL || max_leaf == NULL || host_leaf == NULL )
- {
- ERROR("Missing leaf %#x, subleaf %#x", xend->leaf, xend->subleaf);
- goto fail;
- }
-
- for ( unsigned int i = 0; i < ARRAY_SIZE(xend->policy); i++ )
- {
- uint32_t *cur_reg = &cur_leaf->a + i;
- const uint32_t *max_reg = &max_leaf->a + i;
- const uint32_t *host_reg = &host_leaf->a + i;
-
- if ( xend->policy[i] == NULL )
- continue;
-
- for ( unsigned int j = 0; j < 32; j++ )
- {
- bool val;
-
- if ( xend->policy[i][j] == '1' )
- val = true;
- else if ( xend->policy[i][j] == '0' )
- val = false;
- else if ( xend->policy[i][j] == 'x' )
- val = test_bit(31 - j, max_reg);
- else if ( xend->policy[i][j] == 'k' ||
- xend->policy[i][j] == 's' )
- val = test_bit(31 - j, host_reg);
- else
- {
- ERROR("Bad character '%c' in policy[%d] string '%s'",
- xend->policy[i][j], i, xend->policy[i]);
- goto fail;
- }
-
- clear_bit(31 - j, cur_reg);
- if ( val )
- set_bit(31 - j, cur_reg);
- }
- }
- }
-
-    /* Feed the transformed current policy back up to Xen. */
- rc = xc_set_domain_cpu_policy(xch, domid, nr_cur, cur, 0, NULL,
- &err_leaf, &err_subleaf, &err_msr);
- if ( rc )
- {
- PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
- domid, err_leaf, err_subleaf, err_msr);
- rc = -errno;
- goto fail;
- }
-
- /* Success! */
-
- fail:
- free(cur);
- free(max);
- free(host);
-
- return rc;
-}
-
-int xc_cpuid_apply_policy(xc_interface *xch, uint32_t domid, bool restore,
- const uint32_t *featureset, unsigned int nr_features,
- bool pae,
- const struct xc_xend_cpuid *xend)
-{
- int rc;
- xc_dominfo_t di;
- unsigned int i, nr_leaves, nr_msrs;
- xen_cpuid_leaf_t *leaves = NULL;
- struct cpuid_policy *p = NULL;
- uint32_t err_leaf = -1, err_subleaf = -1, err_msr = -1;
- uint32_t host_featureset[FEATURESET_NR_ENTRIES] = {};
- uint32_t len = ARRAY_SIZE(host_featureset);
-
- if ( xc_domain_getinfo(xch, domid, 1, &di) != 1 ||
- di.domid != domid )
- {
- ERROR("Failed to obtain d%d info", domid);
- rc = -ESRCH;
- goto out;
- }
-
- rc = xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs);
- if ( rc )
- {
- PERROR("Failed to obtain policy info size");
- rc = -errno;
- goto out;
- }
-
- rc = -ENOMEM;
- if ( (leaves = calloc(nr_leaves, sizeof(*leaves))) == NULL ||
- (p = calloc(1, sizeof(*p))) == NULL )
- goto out;
-
- /* Get the host policy. */
- rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_host,
- &len, host_featureset);
- if ( rc )
- {
- /* Tolerate "buffer too small", as we've got the bits we need. */
- if ( errno == ENOBUFS )
- rc = 0;
- else
- {
- PERROR("Failed to obtain host featureset");
- rc = -errno;
- goto out;
- }
- }
-
- /* Get the domain's default policy. */
- nr_msrs = 0;
- rc = xc_get_system_cpu_policy(xch, di.hvm ? XEN_SYSCTL_cpu_policy_hvm_default
- : XEN_SYSCTL_cpu_policy_pv_default,
- &nr_leaves, leaves, &nr_msrs, NULL);
- if ( rc )
- {
- PERROR("Failed to obtain %s default policy", di.hvm ? "hvm" : "pv");
- rc = -errno;
- goto out;
- }
-
- rc = x86_cpuid_copy_from_buffer(p, leaves, nr_leaves,
- &err_leaf, &err_subleaf);
- if ( rc )
- {
- ERROR("Failed to deserialise CPUID (err leaf %#x, subleaf %#x) (%d = %s)",
- err_leaf, err_subleaf, -rc, strerror(-rc));
- goto out;
- }
-
- /*
-     * Account for features which have been disabled by default since Xen 4.13,
-     * so migrated-in VMs don't risk seeing features disappear.
- */
- if ( restore )
- {
- p->basic.rdrand = test_bit(X86_FEATURE_RDRAND, host_featureset);
-
- if ( di.hvm )
- {
- p->feat.mpx = test_bit(X86_FEATURE_MPX, host_featureset);
- }
- }
-
- if ( featureset )
- {
- uint32_t disabled_features[FEATURESET_NR_ENTRIES],
- feat[FEATURESET_NR_ENTRIES] = {};
- static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
- unsigned int i, b;
-
- /*
-         * The user-supplied featureset may be shorter or longer than
-         * FEATURESET_NR_ENTRIES.  Shorter is fine, and we will zero-extend.
-         * Longer is fine, so long as it is only padded with zeros.
- */
- unsigned int user_len = min(FEATURESET_NR_ENTRIES + 0u, nr_features);
-
- /* Check for truncated set bits. */
- rc = -EOPNOTSUPP;
- for ( i = user_len; i < nr_features; ++i )
- if ( featureset[i] != 0 )
- goto out;
-
- memcpy(feat, featureset, sizeof(*featureset) * user_len);
-
- /* Disable deep dependencies of disabled features. */
- for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
- disabled_features[i] = ~feat[i] & deep_features[i];
-
- for ( b = 0; b < sizeof(disabled_features) * CHAR_BIT; ++b )
- {
- const uint32_t *dfs;
-
- if ( !test_bit(b, disabled_features) ||
- !(dfs = x86_cpuid_lookup_deep_deps(b)) )
- continue;
-
- for ( i = 0; i < ARRAY_SIZE(disabled_features); ++i )
- {
- feat[i] &= ~dfs[i];
- disabled_features[i] &= ~dfs[i];
- }
- }
-
- cpuid_featureset_to_policy(feat, p);
- }
- else
- {
- if ( di.hvm )
- p->basic.pae = pae;
- }
-
- if ( !di.hvm )
- {
- /*
- * On hardware without CPUID Faulting, PV guests see real topology.
- * As a consequence, they also need to see the host htt/cmp fields.
- */
- p->basic.htt = test_bit(X86_FEATURE_HTT, host_featureset);
- p->extd.cmp_legacy = test_bit(X86_FEATURE_CMP_LEGACY, host_featureset);
- }
- else
- {
- /*
- * Topology for HVM guests is entirely controlled by Xen. For now, we
- * hardcode APIC_ID = vcpu_id * 2 to give the illusion of no SMT.
- */
- p->basic.htt = true;
- p->extd.cmp_legacy = false;
-
- /*
- * Leaf 1 EBX[23:16] is Maximum Logical Processors Per Package.
- * Update to reflect vLAPIC_ID = vCPU_ID * 2, but make sure to avoid
- * overflow.
- */
- if ( !(p->basic.lppp & 0x80) )
- p->basic.lppp *= 2;
-
- switch ( p->x86_vendor )
- {
- case X86_VENDOR_INTEL:
- for ( i = 0; (p->cache.subleaf[i].type &&
- i < ARRAY_SIZE(p->cache.raw)); ++i )
- {
- p->cache.subleaf[i].cores_per_package =
- (p->cache.subleaf[i].cores_per_package << 1) | 1;
- p->cache.subleaf[i].threads_per_cache = 0;
- }
- break;
-
- case X86_VENDOR_AMD:
- case X86_VENDOR_HYGON:
- /*
- * Leaf 0x80000008 ECX[15:12] is ApicIdCoreSize.
- * Leaf 0x80000008 ECX[7:0] is NumberOfCores (minus one).
- * Update to reflect vLAPIC_ID = vCPU_ID * 2. But avoid
- * - overflow,
- * - going out of sync with leaf 1 EBX[23:16],
- * - incrementing ApicIdCoreSize when it's zero (which changes the
- * meaning of bits 7:0).
- *
-             * UPDATE: In addition to avoiding overflow, some
- * proprietary operating systems have trouble with
- * apic_id_size values greater than 7. Limit the value to
- * 7 for now.
- */
- if ( p->extd.nc < 0x7f )
- {
- if ( p->extd.apic_id_size != 0 && p->extd.apic_id_size < 0x7 )
- p->extd.apic_id_size++;
-
- p->extd.nc = (p->extd.nc << 1) | 1;
- }
- break;
- }
-
- /*
- * These settings are necessary to cause earlier HVM_PARAM_NESTEDHVM /
- * XEN_DOMCTL_disable_migrate settings to be reflected correctly in
- * CPUID. Xen will discard these bits if configuration hasn't been
- * set for the domain.
- */
- p->extd.itsc = true;
- p->basic.vmx = true;
- p->extd.svm = true;
- }
-
- rc = x86_cpuid_copy_to_buffer(p, leaves, &nr_leaves);
- if ( rc )
- {
- ERROR("Failed to serialise CPUID (%d = %s)", -rc, strerror(-rc));
- goto out;
- }
-
- rc = xc_set_domain_cpu_policy(xch, domid, nr_leaves, leaves, 0, NULL,
- &err_leaf, &err_subleaf, &err_msr);
- if ( rc )
- {
- PERROR("Failed to set d%d's policy (err leaf %#x, subleaf %#x, msr %#x)",
- domid, err_leaf, err_subleaf, err_msr);
- rc = -errno;
- goto out;
- }
-
- if ( xend && (rc = xc_cpuid_xend_policy(xch, domid, xend)) )
- goto out;
-
- rc = 0;
-
-out:
- free(p);
- free(leaves);
-
- return rc;
-}
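An illustrative toolstack invocation of the function above; passing a NULL featureset takes the default-policy path, with pae only consulted for HVM guests:

    /* Fresh boot of an HVM guest: default policy, PAE enabled. */
    if ( xc_cpuid_apply_policy(xch, domid, false /* restore */,
                               NULL, 0 /* no explicit featureset */,
                               true /* pae */,
                               NULL /* no xend overrides */) )
        /* handle failure */;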
+++ /dev/null
-/*
- * Xen domain builder -- ARM
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * Copyright (c) 2011, Citrix Systems
- */
-#include <inttypes.h>
-#include <assert.h>
-
-#include <xen/xen.h>
-#include <xen/io/protocols.h>
-#include <xen-tools/libs.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-
-#define NR_MAGIC_PAGES 4
-#define CONSOLE_PFN_OFFSET 0
-#define XENSTORE_PFN_OFFSET 1
-#define MEMACCESS_PFN_OFFSET 2
-#define VUART_PFN_OFFSET 3
-
-#define LPAE_SHIFT 9
-
-#define PFN_4K_SHIFT (0)
-#define PFN_2M_SHIFT (PFN_4K_SHIFT+LPAE_SHIFT)
-#define PFN_1G_SHIFT (PFN_2M_SHIFT+LPAE_SHIFT)
-#define PFN_512G_SHIFT (PFN_1G_SHIFT+LPAE_SHIFT)
-
-/* get guest IO ABI protocol */
-const char *xc_domain_get_native_protocol(xc_interface *xch,
- uint32_t domid)
-{
- return XEN_IO_PROTO_ABI_ARM;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int alloc_magic_pages(struct xc_dom_image *dom)
-{
- int rc, i;
- const xen_pfn_t base = GUEST_MAGIC_BASE >> XC_PAGE_SHIFT;
- xen_pfn_t p2m[NR_MAGIC_PAGES];
-
- BUILD_BUG_ON(NR_MAGIC_PAGES > GUEST_MAGIC_SIZE >> XC_PAGE_SHIFT);
-
- DOMPRINTF_CALLED(dom->xch);
-
- for (i = 0; i < NR_MAGIC_PAGES; i++)
- p2m[i] = base + i;
-
- rc = xc_domain_populate_physmap_exact(
- dom->xch, dom->guest_domid, NR_MAGIC_PAGES,
- 0, 0, p2m);
- if ( rc < 0 )
- return rc;
-
- dom->console_pfn = base + CONSOLE_PFN_OFFSET;
- dom->xenstore_pfn = base + XENSTORE_PFN_OFFSET;
- dom->vuart_gfn = base + VUART_PFN_OFFSET;
-
- xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
- xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
- xc_clear_domain_page(dom->xch, dom->guest_domid, base + MEMACCESS_PFN_OFFSET);
- xc_clear_domain_page(dom->xch, dom->guest_domid, dom->vuart_gfn);
-
- xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_PFN,
- dom->console_pfn);
- xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_PFN,
- dom->xenstore_pfn);
- xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_MONITOR_RING_PFN,
- base + MEMACCESS_PFN_OFFSET);
- /* allocated by toolstack */
- xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_CONSOLE_EVTCHN,
- dom->console_evtchn);
- xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
- dom->xenstore_evtchn);
-
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int start_info_arm(struct xc_dom_image *dom)
-{
- DOMPRINTF_CALLED(dom->xch);
- return 0;
-}
-
-static int shared_info_arm(struct xc_dom_image *dom, void *ptr)
-{
- DOMPRINTF_CALLED(dom->xch);
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int vcpu_arm32(struct xc_dom_image *dom)
-{
- vcpu_guest_context_any_t any_ctx;
- vcpu_guest_context_t *ctxt = &any_ctx.c;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* clear everything */
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->user_regs.pc32 = dom->parms.virt_entry;
-
- /* Linux boot protocol. See linux.Documentation/arm/Booting. */
- ctxt->user_regs.r0_usr = 0; /* SBZ */
- /* Machine ID: We use DTB therefore no machine id */
- ctxt->user_regs.r1_usr = 0xffffffff;
-    /* ATAGS/DTB: We currently require the guest kernel to be
- * using CONFIG_ARM_APPENDED_DTB. Ensure that r2 does not look
- * like a valid pointer to a set of ATAGS or a DTB.
- */
- ctxt->user_regs.r2_usr = dom->devicetree_blob ?
- dom->devicetree_seg.vstart : 0xffffffff;
-
- ctxt->sctlr = SCTLR_GUEST_INIT;
-
- ctxt->ttbr0 = 0;
- ctxt->ttbr1 = 0;
- ctxt->ttbcr = 0; /* Defined Reset Value */
-
- ctxt->user_regs.cpsr = PSR_GUEST32_INIT;
-
- ctxt->flags = VGCF_online;
-
- DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx32,
- ctxt->user_regs.cpsr, ctxt->user_regs.pc32);
-
- rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
-
- return rc;
-}
-
-static int vcpu_arm64(struct xc_dom_image *dom)
-{
- vcpu_guest_context_any_t any_ctx;
- vcpu_guest_context_t *ctxt = &any_ctx.c;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
- /* clear everything */
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->user_regs.pc64 = dom->parms.virt_entry;
-
- /* Linux boot protocol. See linux.Documentation/arm64/booting.txt. */
- ctxt->user_regs.x0 = dom->devicetree_blob ?
- dom->devicetree_seg.vstart : 0xffffffff;
- ctxt->user_regs.x1 = 0;
- ctxt->user_regs.x2 = 0;
- ctxt->user_regs.x3 = 0;
-
- DOMPRINTF("DTB %"PRIx64, ctxt->user_regs.x0);
-
- ctxt->sctlr = SCTLR_GUEST_INIT;
-
- ctxt->ttbr0 = 0;
- ctxt->ttbr1 = 0;
- ctxt->ttbcr = 0; /* Defined Reset Value */
-
- ctxt->user_regs.cpsr = PSR_GUEST64_INIT;
-
- ctxt->flags = VGCF_online;
-
- DOMPRINTF("Initial state CPSR %#"PRIx32" PC %#"PRIx64,
- ctxt->user_regs.cpsr, ctxt->user_regs.pc64);
-
- rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
-
- return rc;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int set_mode(xc_interface *xch, uint32_t domid, char *guest_type)
-{
- static const struct {
- char *guest;
- uint32_t size;
- } types[] = {
- { "xen-3.0-aarch64", 64 },
- { "xen-3.0-armv7l", 32 },
- };
- DECLARE_DOMCTL;
- int i,rc;
-
- domctl.domain = domid;
- domctl.cmd = XEN_DOMCTL_set_address_size;
- domctl.u.address_size.size = 0;
-
- for ( i = 0; i < ARRAY_SIZE(types); i++ )
- if ( !strcmp(types[i].guest, guest_type) )
- domctl.u.address_size.size = types[i].size;
- if ( domctl.u.address_size.size == 0 )
- {
- xc_dom_printf(xch, "%s: warning: unknown guest type %s",
- __FUNCTION__, guest_type);
- return -EINVAL;
- }
-
- xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
- guest_type, domctl.u.address_size.size);
- rc = do_domctl(xch, &domctl);
- if ( rc != 0 )
- xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
- __FUNCTION__, rc);
- return rc;
-}
-
-/* >0: success, *nr_pfns set to number actually populated
- * 0: didn't try with this pfn shift (e.g. misaligned base etc)
- * <0: ERROR
- */
-static int populate_one_size(struct xc_dom_image *dom, int pfn_shift,
- xen_pfn_t base_pfn, xen_pfn_t *nr_pfns,
- xen_pfn_t *extents)
-{
- /* The mask for this level */
- const uint64_t mask = ((uint64_t)1<<(pfn_shift))-1;
- /* The shift, mask and next boundary for the level above this one */
- const int next_shift = pfn_shift + LPAE_SHIFT;
- const uint64_t next_mask = ((uint64_t)1<<next_shift)-1;
- const xen_pfn_t next_boundary
- = (base_pfn + ((uint64_t)1<<next_shift)) & ~next_mask;
-
- int nr, i, count;
- xen_pfn_t end_pfn = base_pfn + *nr_pfns;
-
- /* No level zero super pages with current hardware */
- if ( pfn_shift == PFN_512G_SHIFT )
- return 0;
-
- /* base is misaligned for this level */
- if ( mask & base_pfn )
- return 0;
-
- /*
- * If base is not aligned at the next level up then try and make
- * it so for next time around.
- */
- if ( (base_pfn & next_mask) && end_pfn > next_boundary )
- end_pfn = next_boundary;
-
- count = ( end_pfn - base_pfn ) >> pfn_shift;
-
- /* Nothing to allocate */
- if ( !count )
- return 0;
-
- for ( i = 0 ; i < count ; i ++ )
- extents[i] = base_pfn + (i<<pfn_shift);
-
- nr = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
- pfn_shift, 0, extents);
- if ( nr <= 0 ) return nr;
- DOMPRINTF("%s: populated %#x/%#x entries with shift %d",
- __FUNCTION__, nr, count, pfn_shift);
-
- *nr_pfns = nr << pfn_shift;
-
- return 1;
-}
-
-static int populate_guest_memory(struct xc_dom_image *dom,
- xen_pfn_t base_pfn, xen_pfn_t nr_pfns)
-{
- int rc = 0;
- xen_pfn_t allocsz, pfn, *extents;
-
- extents = calloc(1024*1024,sizeof(xen_pfn_t));
- if ( extents == NULL )
- {
- DOMPRINTF("%s: Unable to allocate extent array", __FUNCTION__);
- return -1;
- }
-
- DOMPRINTF("%s: populating RAM @ %016"PRIx64"-%016"PRIx64" (%"PRId64"MB)",
- __FUNCTION__,
- (uint64_t)base_pfn << XC_PAGE_SHIFT,
- (uint64_t)(base_pfn + nr_pfns) << XC_PAGE_SHIFT,
- (uint64_t)nr_pfns >> (20-XC_PAGE_SHIFT));
-
- for ( pfn = 0; pfn < nr_pfns; pfn += allocsz )
- {
- allocsz = min_t(int, 1024*1024, nr_pfns - pfn);
-#if 0 /* Enable this to exercise/debug the code which tries to realign
- * to a superpage boundary, by misaligning at the start. */
- if ( pfn == 0 )
- {
- allocsz = 1;
- rc = populate_one_size(dom, PFN_4K_SHIFT,
- base_pfn + pfn, &allocsz, extents);
- if (rc < 0) break;
- if (rc > 0) continue;
- /* Failed to allocate a single page? */
- break;
- }
-#endif
-
- rc = populate_one_size(dom, PFN_512G_SHIFT,
- base_pfn + pfn, &allocsz, extents);
- if ( rc < 0 ) break;
- if ( rc > 0 ) continue;
-
- rc = populate_one_size(dom, PFN_1G_SHIFT,
- base_pfn + pfn, &allocsz, extents);
- if ( rc < 0 ) break;
- if ( rc > 0 ) continue;
-
- rc = populate_one_size(dom, PFN_2M_SHIFT,
- base_pfn + pfn, &allocsz, extents);
- if ( rc < 0 ) break;
- if ( rc > 0 ) continue;
-
- rc = populate_one_size(dom, PFN_4K_SHIFT,
- base_pfn + pfn, &allocsz, extents);
- if ( rc < 0 ) break;
- if ( rc == 0 )
- {
- DOMPRINTF("%s: Not enough RAM", __FUNCTION__);
- errno = ENOMEM;
- rc = -1;
- goto out;
- }
- }
-
-out:
- free(extents);
- return rc < 0 ? rc : 0;
-}
-
-static int meminit(struct xc_dom_image *dom)
-{
- int i, rc;
- uint64_t modbase;
-
- uint64_t ramsize = (uint64_t)dom->total_pages << XC_PAGE_SHIFT;
-
- const uint64_t bankbase[] = GUEST_RAM_BANK_BASES;
- const uint64_t bankmax[] = GUEST_RAM_BANK_SIZES;
-
- /* Convenient */
- const uint64_t kernbase = dom->kernel_seg.vstart;
- const uint64_t kernend = ROUNDUP(dom->kernel_seg.vend, 21/*2MB*/);
- const uint64_t kernsize = kernend - kernbase;
- const uint64_t dtb_size = dom->devicetree_blob ?
- ROUNDUP(dom->devicetree_size, XC_PAGE_SHIFT) : 0;
- const uint64_t ramdisk_size = dom->modules[0].blob ?
- ROUNDUP(dom->modules[0].size, XC_PAGE_SHIFT) : 0;
- const uint64_t modsize = dtb_size + ramdisk_size;
- const uint64_t ram128mb = bankbase[0] + (128<<20);
-
- xen_pfn_t p2m_size;
- uint64_t bank0end;
-
- assert(dom->rambase_pfn << XC_PAGE_SHIFT == bankbase[0]);
-
- if ( modsize + kernsize > bankmax[0] )
- {
- DOMPRINTF("%s: Not enough memory for the kernel+dtb+initrd",
- __FUNCTION__);
- return -1;
- }
-
- if ( ramsize == 0 )
- {
- DOMPRINTF("%s: ram size is 0", __FUNCTION__);
- return -1;
- }
-
- if ( ramsize > GUEST_RAM_MAX )
- {
- DOMPRINTF("%s: ram size is too large for guest address space: "
- "%"PRIx64" > %llx",
- __FUNCTION__, ramsize, GUEST_RAM_MAX);
- return -1;
- }
-
- rc = set_mode(dom->xch, dom->guest_domid, dom->guest_type);
- if ( rc )
- return rc;
-
- for ( i = 0; ramsize && i < GUEST_RAM_BANKS; i++ )
- {
- uint64_t banksize = ramsize > bankmax[i] ? bankmax[i] : ramsize;
-
- ramsize -= banksize;
-
- p2m_size = ( bankbase[i] + banksize - bankbase[0] ) >> XC_PAGE_SHIFT;
-
- dom->rambank_size[i] = banksize >> XC_PAGE_SHIFT;
- }
-
- assert(dom->rambank_size[0] != 0);
- assert(ramsize == 0); /* Too much RAM is rejected above */
-
- dom->p2m_size = p2m_size;
-
- /* setup initial p2m and allocate guest memory */
- for ( i = 0; i < GUEST_RAM_BANKS && dom->rambank_size[i]; i++ )
- {
- if ((rc = populate_guest_memory(dom,
- bankbase[i] >> XC_PAGE_SHIFT,
- dom->rambank_size[i])))
- return rc;
- }
-
- /*
-     * We try to place dtb+initrd at 128MB, or as high as possible
-     * if we have less RAM. If there is no space then fall back to
- * just before the kernel.
- *
- * If changing this then consider
- * xen/arch/arm/kernel.c:place_modules as well.
- */
- bank0end = bankbase[0] + ((uint64_t)dom->rambank_size[0] << XC_PAGE_SHIFT);
-
- if ( bank0end >= ram128mb + modsize && kernend < ram128mb )
- modbase = ram128mb;
- else if ( bank0end - modsize > kernend )
- modbase = bank0end - modsize;
- else if (kernbase - bankbase[0] > modsize )
- modbase = kernbase - modsize;
- else
- return -1;
-
- DOMPRINTF("%s: placing boot modules at 0x%" PRIx64, __FUNCTION__, modbase);
-
- /*
- * Must map DTB *after* initrd, to satisfy order of calls to
- * xc_dom_alloc_segment in xc_dom_build_image, which must map
-     * things at monotonically increasing addresses.
- */
- if ( ramdisk_size )
- {
- dom->modules[0].seg.vstart = modbase;
- dom->modules[0].seg.vend = modbase + ramdisk_size;
-
- DOMPRINTF("%s: ramdisk: 0x%" PRIx64 " -> 0x%" PRIx64 "",
- __FUNCTION__,
- dom->modules[0].seg.vstart, dom->modules[0].seg.vend);
-
- modbase += ramdisk_size;
- }
-
- if ( dtb_size )
- {
- dom->devicetree_seg.vstart = modbase;
- dom->devicetree_seg.vend = modbase + dtb_size;
-
- DOMPRINTF("%s: devicetree: 0x%" PRIx64 " -> 0x%" PRIx64 "",
- __FUNCTION__,
- dom->devicetree_seg.vstart, dom->devicetree_seg.vend);
-
- modbase += dtb_size;
- }
-
- return 0;
-}
-
-bool xc_dom_translated(const struct xc_dom_image *dom)
-{
- return true;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int bootearly(struct xc_dom_image *dom)
-{
- DOMPRINTF("%s: doing nothing", __FUNCTION__);
- return 0;
-}
-
-static int bootlate(struct xc_dom_image *dom)
-{
- /* XXX
- * map shared info
- * map grant tables
- * setup shared info
- */
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static struct xc_dom_arch xc_dom_32 = {
- .guest_type = "xen-3.0-armv7l",
- .native_protocol = XEN_IO_PROTO_ABI_ARM,
- .page_shift = PAGE_SHIFT_ARM,
- .sizeof_pfn = 8,
- .alloc_magic_pages = alloc_magic_pages,
- .start_info = start_info_arm,
- .shared_info = shared_info_arm,
- .vcpu = vcpu_arm32,
- .meminit = meminit,
- .bootearly = bootearly,
- .bootlate = bootlate,
-};
-
-static struct xc_dom_arch xc_dom_64 = {
- .guest_type = "xen-3.0-aarch64",
- .native_protocol = XEN_IO_PROTO_ABI_ARM,
- .page_shift = PAGE_SHIFT_ARM,
- .sizeof_pfn = 8,
- .alloc_magic_pages = alloc_magic_pages,
- .start_info = start_info_arm,
- .shared_info = shared_info_arm,
- .vcpu = vcpu_arm64,
- .meminit = meminit,
- .bootearly = bootearly,
- .bootlate = bootlate,
-};
-
-static void __init register_arch_hooks(void)
-{
- xc_dom_register_arch_hooks(&xc_dom_32);
- xc_dom_register_arch_hooks(&xc_dom_64);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * Xen domain builder -- ARM zImage bits
- *
- * Parse and load ARM zImage kernel images.
- *
- * Copyright (C) 2012, Citrix Systems.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-
-#include <arpa/inet.h> /* XXX ntohl is not the right function... */
-
-struct minimal_dtb_header {
- uint32_t magic;
- uint32_t total_size;
- /* There are other fields but we don't use them yet. */
-};
-
-#define DTB_MAGIC 0xd00dfeed
-
-/* ------------------------------------------------------------ */
-/* 32-bit zImage Support */
-/* ------------------------------------------------------------ */
-
-#define ZIMAGE32_MAGIC_OFFSET 0x24
-#define ZIMAGE32_START_OFFSET 0x28
-#define ZIMAGE32_END_OFFSET 0x2c
-
-#define ZIMAGE32_MAGIC 0x016f2818
-
-static int xc_dom_probe_zimage32_kernel(struct xc_dom_image *dom)
-{
- uint32_t *zimage;
-
- if ( dom->kernel_blob == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: no kernel image loaded", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( dom->kernel_size < 0x30 /*sizeof(struct setup_header)*/ )
- {
- xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
- return -EINVAL;
- }
-
- zimage = (uint32_t *)dom->kernel_blob;
- if ( zimage[ZIMAGE32_MAGIC_OFFSET/4] != ZIMAGE32_MAGIC )
- {
- xc_dom_printf(dom->xch, "%s: kernel is not an arm32 zImage", __FUNCTION__);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int xc_dom_parse_zimage32_kernel(struct xc_dom_image *dom)
-{
- uint32_t *zimage;
- uint32_t start, entry_addr;
- uint64_t v_start, v_end;
- uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
-
- DOMPRINTF_CALLED(dom->xch);
-
- zimage = (uint32_t *)dom->kernel_blob;
-
- /* Do not load kernel at the very first RAM address */
- v_start = rambase + 0x8000;
-
- if ( dom->kernel_size > UINT64_MAX - v_start )
- {
- DOMPRINTF("%s: kernel is too large\n", __FUNCTION__);
- return -EINVAL;
- }
-
- v_end = v_start + dom->kernel_size;
-
- /*
- * If start is invalid then the guest will start at some invalid
- * address and crash, but this happens in guest context so doesn't
- * concern us here.
- */
- start = zimage[ZIMAGE32_START_OFFSET/4];
-
- if (start == 0)
- entry_addr = v_start;
- else
- entry_addr = start;
-
- /* find kernel segment */
- dom->kernel_seg.vstart = v_start;
- dom->kernel_seg.vend = v_end;
-
- dom->parms.virt_entry = entry_addr;
- dom->parms.virt_base = rambase;
-
- dom->guest_type = "xen-3.0-armv7l";
- DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
- __FUNCTION__, dom->guest_type,
- dom->kernel_seg.vstart, dom->kernel_seg.vend);
- return 0;
-}
-
-/* ------------------------------------------------------------ */
-/* 64-bit zImage Support */
-/* ------------------------------------------------------------ */
-
-#define ZIMAGE64_MAGIC_V0 0x14000008
-#define ZIMAGE64_MAGIC_V1 0x644d5241 /* "ARM\x64" */
-
-/* linux/Documentation/arm64/booting.txt */
-struct zimage64_hdr {
- uint32_t magic0;
- uint32_t res0;
- uint64_t text_offset; /* Image load offset */
- uint64_t res1;
- uint64_t res2;
- /* zImage V1 only from here */
- uint64_t res3;
- uint64_t res4;
- uint64_t res5;
- uint32_t magic1;
- uint32_t res6;
-};
-static int xc_dom_probe_zimage64_kernel(struct xc_dom_image *dom)
-{
- struct zimage64_hdr *zimage;
-
- if ( dom->kernel_blob == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: no kernel image loaded", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( dom->kernel_size < sizeof(*zimage) )
- {
- xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
- return -EINVAL;
- }
-
- zimage = dom->kernel_blob;
- if ( zimage->magic0 != ZIMAGE64_MAGIC_V0 &&
- zimage->magic1 != ZIMAGE64_MAGIC_V1 )
- {
- xc_dom_printf(dom->xch, "%s: kernel is not an arm64 Image", __FUNCTION__);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int xc_dom_parse_zimage64_kernel(struct xc_dom_image *dom)
-{
- struct zimage64_hdr *zimage;
- uint64_t v_start, v_end;
- uint64_t rambase = dom->rambase_pfn << XC_PAGE_SHIFT;
-
- DOMPRINTF_CALLED(dom->xch);
-
- zimage = dom->kernel_blob;
-
- if ( zimage->text_offset > UINT64_MAX - rambase )
- {
- DOMPRINTF("%s: kernel text offset is too large\n", __FUNCTION__);
- return -EINVAL;
- }
-
- v_start = rambase + zimage->text_offset;
-
- if ( dom->kernel_size > UINT64_MAX - v_start )
- {
- DOMPRINTF("%s: kernel is too large\n", __FUNCTION__);
- return -EINVAL;
- }
-
- v_end = v_start + dom->kernel_size;
-
- dom->kernel_seg.vstart = v_start;
- dom->kernel_seg.vend = v_end;
-
- /* Call the kernel at offset 0 */
- dom->parms.virt_entry = v_start;
- dom->parms.virt_base = rambase;
-
- dom->guest_type = "xen-3.0-aarch64";
- DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
- __FUNCTION__, dom->guest_type,
- dom->kernel_seg.vstart, dom->kernel_seg.vend);
-
- return 0;
-}
-
-/* ------------------------------------------------------------ */
-/* Common zImage Support */
-/* ------------------------------------------------------------ */
-
-static int xc_dom_load_zimage_kernel(struct xc_dom_image *dom)
-{
- void *dst;
-
- DOMPRINTF_CALLED(dom->xch);
-
- dst = xc_dom_seg_to_ptr(dom, &dom->kernel_seg);
- if ( dst == NULL )
- {
- DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->kernel_seg) => NULL",
- __func__);
- return -1;
- }
-
- DOMPRINTF("%s: kernel seg %#"PRIx64"-%#"PRIx64,
- __func__, dom->kernel_seg.vstart, dom->kernel_seg.vend);
- DOMPRINTF("%s: copy %zd bytes from blob %p to dst %p",
- __func__, dom->kernel_size, dom->kernel_blob, dst);
-
- memcpy(dst, dom->kernel_blob, dom->kernel_size);
-
- return 0;
-}
-
-static struct xc_dom_loader zimage32_loader = {
- .name = "Linux zImage (ARM32)",
- .probe = xc_dom_probe_zimage32_kernel,
- .parser = xc_dom_parse_zimage32_kernel,
- .loader = xc_dom_load_zimage_kernel,
-};
-
-static struct xc_dom_loader zimage64_loader = {
- .name = "Linux zImage (ARM64)",
- .probe = xc_dom_probe_zimage64_kernel,
- .parser = xc_dom_parse_zimage64_kernel,
- .loader = xc_dom_load_zimage_kernel,
-};
-
-static void __init register_loader(void)
-{
- xc_dom_register_loader(&zimage32_loader);
- xc_dom_register_loader(&zimage64_loader);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * Some of the field descriptions were copied from "The Multiboot
- * Specification", Copyright 1995, 96 Bryan Ford <baford@cs.utah.edu>,
- * Erich Stefan Boleyn <erich@uruk.org> Copyright 1999, 2000, 2001, 2002
- * Free Software Foundation, Inc.
- */
-
-/******************************************************************************
- *
- * Loads simple binary images. It's like a .COM file in MS-DOS. No headers are
- * present. The only requirement is that it must have a xen_bin_image table
- * somewhere in the first 8192 bytes, starting on a 32-bit aligned address.
- * Those familiar with the multiboot specification should recognize this; it's
- * (almost) the same as the multiboot header.
- * The layout of the xen_bin_image table is:
- *
- * Offset Type Name Note
- * 0 uint32_t magic required
- * 4 uint32_t flags required
- * 8 uint32_t checksum required
- * 12 uint32_t header_addr required
- * 16 uint32_t load_addr required
- * 20 uint32_t load_end_addr required
- * 24 uint32_t bss_end_addr required
- * 28 uint32_t entry_addr required
- *
- * - magic
- * Magic number identifying the table. For images to be loaded by Xen 3, the
- * magic value is 0x336ec578 ("xEn3" with the 0x80 bit of the "E" set).
- * - flags
- * bit 0: indicates whether the image needs to be loaded on a page boundary
- * bit 1: reserved, must be 0 (the multiboot spec uses this bit to indicate
- * that memory info should be passed to the image)
- * bit 2: reserved, must be 0 (the multiboot spec uses this bit to indicate
- * that the bootloader should pass video mode info to the image)
- * bit 16: reserved, must be 1 (the multiboot spec uses this bit to indicate
- * that the values in the fields header_addr - entry_addr are
- * valid)
- * All other bits should be set to 0.
- * - checksum
- * When added to "magic" and "flags", the resulting value should be 0.
- * - header_addr
- * Contains the virtual address corresponding to the beginning of the
- * table - the memory location at which the magic value is supposed to be
- * loaded. This field serves to synchronize the mapping between OS image
- * offsets and virtual memory addresses.
- * - load_addr
- * Contains the virtual address of the beginning of the text segment. The
- * offset in the OS image file at which to start loading is defined by the
- * offset at which the table was found, minus (header addr - load addr).
- * load addr must be less than or equal to header addr.
- * - load_end_addr
- * Contains the virtual address of the end of the data segment.
- * (load_end_addr - load_addr) specifies how much data to load. This implies
- * that the text and data segments must be consecutive in the OS image. If
- * this field is zero, the domain builder assumes that the text and data
- * segments occupy the whole OS image file.
- * - bss_end_addr
- * Contains the virtual address of the end of the bss segment. The domain
- * builder initializes this area to zero, and reserves the memory it occupies
- * to avoid placing boot modules and other data relevant to the loaded image
- * in that area. If this field is zero, the domain builder assumes that no bss
- * segment is present.
- * - entry_addr
- * The virtual address at which to start execution of the loaded image.
- *
- */
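To make the checksum rule concrete, a tool emitting this table picks the value that makes the first three fields wrap to zero mod 2^32; a sketch, using the flag values defined below:

    uint32_t magic    = 0x336ec578;        /* XEN_MULTIBOOT_MAGIC3 */
    uint32_t flags    = 0x00010000;        /* ADDRSVALID */
    uint32_t checksum = -(magic + flags);  /* == 0xcc903a88 here */

    /* magic + flags + checksum now sums to 0, as find_table() verifies. */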
-
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-
-#define round_pgup(_p) (((_p)+(PAGE_SIZE_X86-1))&PAGE_MASK_X86)
-#define round_pgdown(_p) ((_p)&PAGE_MASK_X86)
-
-struct xen_bin_image_table
-{
- uint32_t magic;
- uint32_t flags;
- uint32_t checksum;
- uint32_t header_addr;
- uint32_t load_addr;
- uint32_t load_end_addr;
- uint32_t bss_end_addr;
- uint32_t entry_addr;
-};
-
-#define XEN_MULTIBOOT_MAGIC3 0x336ec578
-
-#define XEN_MULTIBOOT_FLAG_ALIGN4K 0x00000001
-#define XEN_MULTIBOOT_FLAG_NEEDMEMINFO 0x00000002
-#define XEN_MULTIBOOT_FLAG_NEEDVIDINFO 0x00000004
-#define XEN_MULTIBOOT_FLAG_ADDRSVALID 0x00010000
-#define XEN_MULTIBOOT_FLAG_PAE_SHIFT 14
-#define XEN_MULTIBOOT_FLAG_PAE_MASK (3 << XEN_MULTIBOOT_FLAG_PAE_SHIFT)
-
-/* Flags we test for */
-#define FLAGS_MASK ((~ 0) & (~ XEN_MULTIBOOT_FLAG_ALIGN4K) & \
- (~ XEN_MULTIBOOT_FLAG_PAE_MASK))
-#define FLAGS_REQUIRED XEN_MULTIBOOT_FLAG_ADDRSVALID
-
-/* --------------------------------------------------------------------- */
-
-static struct xen_bin_image_table *find_table(struct xc_dom_image *dom)
-{
- struct xen_bin_image_table *table;
- uint32_t *probe_ptr;
- uint32_t *probe_end;
-
- if ( dom->kernel_size < sizeof(*table) )
- return NULL;
- probe_ptr = dom->kernel_blob;
- if ( dom->kernel_size > (8192 + sizeof(*table)) )
- probe_end = dom->kernel_blob + 8192;
- else
- probe_end = dom->kernel_blob + dom->kernel_size - sizeof(*table);
-
- for ( table = NULL; probe_ptr < probe_end; probe_ptr++ )
- {
- if ( *probe_ptr == XEN_MULTIBOOT_MAGIC3 )
- {
- table = (struct xen_bin_image_table *) probe_ptr;
- /* Checksum correct? */
- if ( (table->magic + table->flags + table->checksum) == 0 )
- return table;
- }
- }
- return NULL;
-}
-
-static int xc_dom_probe_bin_kernel(struct xc_dom_image *dom)
-{
- return find_table(dom) ? 0 : -EINVAL;
-}
-
-static int xc_dom_parse_bin_kernel(struct xc_dom_image *dom)
-{
- struct xen_bin_image_table *image_info;
- char *image = dom->kernel_blob;
- size_t image_size = dom->kernel_size;
- uint32_t start_addr;
- uint32_t load_end_addr;
- uint32_t bss_end_addr;
- uint32_t pae_flags;
-
- image_info = find_table(dom);
- if ( !image_info )
- return -EINVAL;
-
- DOMPRINTF("%s: multiboot header fields", __FUNCTION__);
- DOMPRINTF(" flags: 0x%" PRIx32 "", image_info->flags);
- DOMPRINTF(" header_addr: 0x%" PRIx32 "", image_info->header_addr);
- DOMPRINTF(" load_addr: 0x%" PRIx32 "", image_info->load_addr);
- DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", image_info->load_end_addr);
- DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", image_info->bss_end_addr);
- DOMPRINTF(" entry_addr: 0x%" PRIx32 "", image_info->entry_addr);
-
- /* Check the flags */
- if ( (image_info->flags & FLAGS_MASK) != FLAGS_REQUIRED )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: xen_bin_image_table flags required "
- "0x%08" PRIx32 " found 0x%08" PRIx32 "",
- __FUNCTION__, FLAGS_REQUIRED, image_info->flags & FLAGS_MASK);
- return -EINVAL;
- }
-
- /* Sanity check on the addresses */
- if ( (image_info->header_addr < image_info->load_addr) ||
- ((char *) image_info - image) <
- (image_info->header_addr - image_info->load_addr) )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid header_addr.",
- __FUNCTION__);
- return -EINVAL;
- }
-
- start_addr = image_info->header_addr - ((char *)image_info - image);
- load_end_addr = image_info->load_end_addr ?: start_addr + image_size;
- bss_end_addr = image_info->bss_end_addr ?: load_end_addr;
-
- DOMPRINTF("%s: calculated addresses", __FUNCTION__);
- DOMPRINTF(" start_addr: 0x%" PRIx32 "", start_addr);
- DOMPRINTF(" load_end_addr: 0x%" PRIx32 "", load_end_addr);
- DOMPRINTF(" bss_end_addr: 0x%" PRIx32 "", bss_end_addr);
-
- if ( (start_addr + image_size) < load_end_addr )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid load_end_addr.",
- __FUNCTION__);
- return -EINVAL;
- }
-
- if ( bss_end_addr < load_end_addr)
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Invalid bss_end_addr.",
- __FUNCTION__);
- return -EINVAL;
- }
-
- dom->kernel_seg.vstart = image_info->load_addr;
- dom->kernel_seg.vend = bss_end_addr;
- dom->parms.virt_base = start_addr;
- dom->parms.virt_entry = image_info->entry_addr;
-
- pae_flags = image_info->flags & XEN_MULTIBOOT_FLAG_PAE_MASK;
- switch (pae_flags >> XEN_MULTIBOOT_FLAG_PAE_SHIFT) {
- case 0:
- dom->guest_type = "xen-3.0-x86_32";
- break;
- case 1:
- dom->guest_type = "xen-3.0-x86_32p";
- break;
- case 2:
- dom->guest_type = "xen-3.0-x86_64";
- break;
- case 3:
-        /* Kernel detects PAE at runtime. So try to figure out whether
-         * xen supports PAE and advertise a PAE-capable kernel in case
-         * it does. */
- dom->guest_type = "xen-3.0-x86_32";
- if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") )
- {
- DOMPRINTF("%s: PAE fixup", __FUNCTION__);
- dom->guest_type = "xen-3.0-x86_32p";
- dom->parms.pae = XEN_PAE_EXTCR3;
- }
- break;
- }
- return 0;
-}
-
-static int xc_dom_load_bin_kernel(struct xc_dom_image *dom)
-{
- struct xen_bin_image_table *image_info;
- char *image = dom->kernel_blob;
- char *dest;
- size_t image_size = dom->kernel_size;
- size_t dest_size;
- uint32_t start_addr;
- uint32_t load_end_addr;
- uint32_t bss_end_addr;
- uint32_t skip, text_size, bss_size;
-
- image_info = find_table(dom);
- if ( !image_info )
- return -EINVAL;
-
- start_addr = image_info->header_addr - ((char *)image_info - image);
- load_end_addr = image_info->load_end_addr ?: start_addr + image_size;
- bss_end_addr = image_info->bss_end_addr ?: load_end_addr;
-
- /* It's possible that we need to skip the first part of the image */
- skip = image_info->load_addr - start_addr;
- text_size = load_end_addr - image_info->load_addr;
- bss_size = bss_end_addr - load_end_addr;
-
- DOMPRINTF("%s: calculated sizes", __FUNCTION__);
- DOMPRINTF(" skip: 0x%" PRIx32 "", skip);
- DOMPRINTF(" text_size: 0x%" PRIx32 "", text_size);
- DOMPRINTF(" bss_size: 0x%" PRIx32 "", bss_size);
-
- dest = xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart, &dest_size);
- if ( dest == NULL )
- {
- DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom, dom->kernel_seg.vstart)"
- " => NULL", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( dest_size < text_size ||
- dest_size - text_size < bss_size )
- {
- DOMPRINTF("%s: mapped region is too small for image", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( image_size < skip ||
- image_size - skip < text_size )
- {
- DOMPRINTF("%s: image is too small for declared text size",
- __FUNCTION__);
- return -EINVAL;
- }
-
- memcpy(dest, image + skip, text_size);
- memset(dest + text_size, 0, bss_size);
-
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static struct xc_dom_loader bin_loader = {
- .name = "multiboot-binary",
- .probe = xc_dom_probe_bin_kernel,
- .parser = xc_dom_parse_bin_kernel,
- .loader = xc_dom_load_bin_kernel,
-};
-
-static void __init register_loader(void)
-{
- xc_dom_register_loader(&bin_loader);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
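
The multiboot loader above follows libxenguest's pluggable-loader pattern: a probe/parser/loader triple registered from an __init constructor and tried in turn by xc_dom_find_loader(). A minimal sketch of how a further format would plug in (all "foo" identifiers are invented for illustration and are not part of this patch):

#include <errno.h>

#include "xg_private.h"

/* Hypothetical loader skeleton; each hook receives the same
 * struct xc_dom_image used by the real loaders in this patch. */
static int xc_dom_probe_foo_kernel(struct xc_dom_image *dom)
{
    /* Inspect dom->kernel_blob; return 0 to claim the image. */
    return -EINVAL;
}

static int xc_dom_parse_foo_kernel(struct xc_dom_image *dom)
{
    /* Fill dom->kernel_seg, dom->parms and dom->guest_type here. */
    return -EINVAL;
}

static int xc_dom_load_foo_kernel(struct xc_dom_image *dom)
{
    /* Copy the image into the mapped kernel segment here. */
    return -EINVAL;
}

static struct xc_dom_loader foo_loader = {
    .name   = "foo-format (example)",
    .probe  = xc_dom_probe_foo_kernel,
    .parser = xc_dom_parse_foo_kernel,
    .loader = xc_dom_load_foo_kernel,
};

static void __init register_foo_loader(void)
{
    xc_dom_register_loader(&foo_loader);
}
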
+++ /dev/null
-/*
- * Xen domain builder -- xen booter.
- *
- * This is the code which actually boots a fresh
- * prepared domain image as xen guest domain.
- *
- * ==> this is the only domain builder code piece
- * where xen hypercalls are allowed <==
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <zlib.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "xc_core.h"
-#include <xen/hvm/params.h>
-#include <xen/grant_table.h>
-
-/* ------------------------------------------------------------------------ */
-
-static int setup_hypercall_page(struct xc_dom_image *dom)
-{
- DECLARE_DOMCTL;
- xen_pfn_t pfn;
- int rc;
-
- if ( dom->parms.virt_hypercall == -1 )
- return 0;
- pfn = (dom->parms.virt_hypercall - dom->parms.virt_base)
- >> XC_DOM_PAGE_SHIFT(dom);
-
- DOMPRINTF("%s: vaddr=0x%" PRIx64 " pfn=0x%" PRIpfn "", __FUNCTION__,
- dom->parms.virt_hypercall, pfn);
- domctl.cmd = XEN_DOMCTL_hypercall_init;
- domctl.domain = dom->guest_domid;
- domctl.u.hypercall_init.gmfn = xc_dom_p2m(dom, pfn);
- rc = do_domctl(dom->xch, &domctl);
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: HYPERCALL_INIT failed: %d - %s)",
- __FUNCTION__, errno, strerror(errno));
- return rc;
-}
-
-
-/* ------------------------------------------------------------------------ */
-
-int xc_dom_compat_check(struct xc_dom_image *dom)
-{
- xen_capabilities_info_t xen_caps;
- char *item, *ptr;
- int match, found = 0;
-
- strncpy(xen_caps, dom->xen_caps, XEN_CAPABILITIES_INFO_LEN - 1);
- xen_caps[XEN_CAPABILITIES_INFO_LEN - 1] = '\0';
-
- for ( item = strtok_r(xen_caps, " ", &ptr);
- item != NULL ; item = strtok_r(NULL, " ", &ptr) )
- {
- match = !strcmp(dom->guest_type, item);
- DOMPRINTF("%s: supported guest type: %s%s", __FUNCTION__,
- item, match ? " <= matches" : "");
- if ( match )
- found++;
- }
- if ( !found )
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: guest type %s not supported by xen kernel, sorry",
- __FUNCTION__, dom->guest_type);
-
- return found;
-}
-
-int xc_dom_boot_xen_init(struct xc_dom_image *dom, xc_interface *xch, uint32_t domid)
-{
- dom->xch = xch;
- dom->guest_domid = domid;
-
- dom->xen_version = xc_version(xch, XENVER_version, NULL);
- if ( xc_version(xch, XENVER_capabilities, &dom->xen_caps) < 0 )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR, "can't get xen capabilities");
- return -1;
- }
- DOMPRINTF("%s: ver %d.%d, caps %s", __FUNCTION__,
- dom->xen_version >> 16, dom->xen_version & 0xff,
- dom->xen_caps);
- return 0;
-}
-
-int xc_dom_boot_mem_init(struct xc_dom_image *dom)
-{
- long rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- rc = dom->arch_hooks->meminit(dom);
- if ( rc != 0 )
- {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "%s: can't allocate low memory for domain",
- __FUNCTION__);
- return rc;
- }
-
- return 0;
-}
-
-void *xc_dom_boot_domU_map(struct xc_dom_image *dom, xen_pfn_t pfn,
- xen_pfn_t count)
-{
- int page_shift = XC_DOM_PAGE_SHIFT(dom);
- privcmd_mmap_entry_t *entries;
- void *ptr;
- int i;
- int err;
-
- entries = xc_dom_malloc(dom, count * sizeof(privcmd_mmap_entry_t));
- if ( entries == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn
- " [malloc]", __FUNCTION__, pfn, count);
- return NULL;
- }
-
- for ( i = 0; i < count; i++ )
- entries[i].mfn = xc_dom_p2m(dom, pfn + i);
-
- ptr = xc_map_foreign_ranges(dom->xch, dom->guest_domid,
- count << page_shift, PROT_READ | PROT_WRITE, 1 << page_shift,
- entries, count);
- if ( ptr == NULL )
- {
- err = errno;
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn
- " [mmap, errno=%i (%s)]", __FUNCTION__, pfn, count,
- err, strerror(err));
- return NULL;
- }
-
- return ptr;
-}
-
-int xc_dom_boot_image(struct xc_dom_image *dom)
-{
- xc_dominfo_t info;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* misc stuff*/
- if ( (rc = dom->arch_hooks->bootearly(dom)) != 0 )
- return rc;
-
- /* collect some info */
- rc = xc_domain_getinfo(dom->xch, dom->guest_domid, 1, &info);
- if ( rc < 0 )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: getdomaininfo failed (rc=%d)", __FUNCTION__, rc);
- return rc;
- }
- if ( rc == 0 || info.domid != dom->guest_domid )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: Huh? No domains found (nr_domains=%d) "
- "or domid mismatch (%d != %d)", __FUNCTION__,
- rc, info.domid, dom->guest_domid);
- return -1;
- }
- dom->shared_info_mfn = info.shared_info_frame;
-
- /* sanity checks */
- if ( !xc_dom_compat_check(dom) )
- return -1;
-
- /* initial mm setup */
- if ( dom->arch_hooks->setup_pgtables &&
- (rc = dom->arch_hooks->setup_pgtables(dom)) != 0 )
- return rc;
-
- /* start info page */
- if ( dom->arch_hooks->start_info )
- dom->arch_hooks->start_info(dom);
-
- /* hypercall page */
- if ( (rc = setup_hypercall_page(dom)) != 0 )
- return rc;
- xc_dom_log_memory_footprint(dom);
-
- /* misc x86 stuff */
- if ( (rc = dom->arch_hooks->bootlate(dom)) != 0 )
- return rc;
-
- /* let the vm run */
- if ( (rc = dom->arch_hooks->vcpu(dom)) != 0 )
- return rc;
- xc_dom_unmap_all(dom);
-
- return rc;
-}
-
-static xen_pfn_t xc_dom_gnttab_setup(xc_interface *xch, uint32_t domid)
-{
- gnttab_setup_table_t setup;
- DECLARE_HYPERCALL_BUFFER(xen_pfn_t, gmfnp);
- int rc;
- xen_pfn_t gmfn;
-
- gmfnp = xc_hypercall_buffer_alloc(xch, gmfnp, sizeof(*gmfnp));
- if (gmfnp == NULL)
- return -1;
-
- setup.dom = domid;
- setup.nr_frames = 1;
- set_xen_guest_handle(setup.frame_list, gmfnp);
- setup.status = 0;
-
- rc = xc_gnttab_op(xch, GNTTABOP_setup_table, &setup, sizeof(setup), 1);
- gmfn = *gmfnp;
- xc_hypercall_buffer_free(xch, gmfnp);
-
- if ( rc != 0 || setup.status != GNTST_okay )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to setup domU grant table "
- "[errno=%d, status=%" PRId16 "]\n",
- __FUNCTION__, rc != 0 ? errno : 0, setup.status);
- return -1;
- }
-
- return gmfn;
-}
-
-static void xc_dom_set_gnttab_entry(xc_interface *xch,
- grant_entry_v1_t *gnttab,
- unsigned int idx,
- uint32_t guest_domid,
- uint32_t backend_domid,
- xen_pfn_t guest_gfn)
-{
- if ( guest_domid == backend_domid || guest_gfn == -1 )
- return;
-
- xc_dom_printf(xch, "%s: d%d gnt[%u] -> d%d 0x%"PRI_xen_pfn,
- __func__, guest_domid, idx, backend_domid, guest_gfn);
-
- gnttab[idx].flags = GTF_permit_access;
- gnttab[idx].domid = backend_domid;
- gnttab[idx].frame = guest_gfn;
-}
-
-static int compat_gnttab_seed(xc_interface *xch, uint32_t domid,
- xen_pfn_t console_gfn,
- xen_pfn_t xenstore_gfn,
- uint32_t console_domid,
- uint32_t xenstore_domid)
-{
-
- xen_pfn_t gnttab_gfn;
- grant_entry_v1_t *gnttab;
-
- gnttab_gfn = xc_dom_gnttab_setup(xch, domid);
- if ( gnttab_gfn == -1 )
- return -1;
-
- gnttab = xc_map_foreign_range(xch,
- domid,
- PAGE_SIZE,
- PROT_READ|PROT_WRITE,
- gnttab_gfn);
- if ( gnttab == NULL )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to map d%d grant table "
- "[errno=%d]\n",
- __func__, domid, errno);
- return -1;
- }
-
- xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_CONSOLE,
- domid, console_domid, console_gfn);
- xc_dom_set_gnttab_entry(xch, gnttab, GNTTAB_RESERVED_XENSTORE,
- domid, xenstore_domid, xenstore_gfn);
-
- if ( munmap(gnttab, PAGE_SIZE) == -1 )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to unmap d%d grant table "
- "[errno=%d]\n",
- __func__, domid, errno);
- return -1;
- }
-
- /* Guest shouldn't really touch its grant table until it has
-     * enabled its caches. But let's be nice. */
- xc_domain_cacheflush(xch, domid, gnttab_gfn, 1);
-
- return 0;
-}
-
-static int compat_gnttab_hvm_seed(xc_interface *xch, uint32_t domid,
- xen_pfn_t console_gfn,
- xen_pfn_t xenstore_gfn,
- uint32_t console_domid,
- uint32_t xenstore_domid)
-{
- int rc;
- xen_pfn_t scratch_gfn;
- struct xen_add_to_physmap xatp = {
- .domid = domid,
- .space = XENMAPSPACE_grant_table,
- .idx = 0,
- };
- struct xen_remove_from_physmap xrfp = {
- .domid = domid,
- };
-
- rc = xc_core_arch_get_scratch_gpfn(xch, domid, &scratch_gfn);
- if ( rc < 0 )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to get a scratch gfn from d%d"
- "[errno=%d]\n",
- __func__, domid, errno);
- return -1;
- }
- xatp.gpfn = scratch_gfn;
- xrfp.gpfn = scratch_gfn;
-
- xc_dom_printf(xch, "%s: d%d: pfn=0x%"PRI_xen_pfn, __func__,
- domid, scratch_gfn);
-
- rc = do_memory_op(xch, XENMEM_add_to_physmap, &xatp, sizeof(xatp));
- if ( rc != 0 )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to add gnttab to d%d physmap "
- "[errno=%d]\n",
- __func__, domid, errno);
- return -1;
- }
-
- rc = compat_gnttab_seed(xch, domid,
- console_gfn, xenstore_gfn,
- console_domid, xenstore_domid);
- if (rc != 0)
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to seed gnttab entries for d%d\n",
- __func__, domid);
- (void) do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp,
- sizeof(xrfp));
- return -1;
- }
-
- rc = do_memory_op(xch, XENMEM_remove_from_physmap, &xrfp, sizeof(xrfp));
- if (rc != 0)
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to remove gnttab from d%d physmap "
- "[errno=%d]\n",
- __func__, domid, errno);
- return -1;
- }
-
- return 0;
-}
-
-int xc_dom_gnttab_seed(xc_interface *xch, uint32_t guest_domid,
- bool is_hvm, xen_pfn_t console_gfn,
- xen_pfn_t xenstore_gfn, uint32_t console_domid,
- uint32_t xenstore_domid)
-{
- xenforeignmemory_handle* fmem = xch->fmem;
- xenforeignmemory_resource_handle *fres;
- void *addr = NULL;
-
- fres = xenforeignmemory_map_resource(
- fmem, guest_domid, XENMEM_resource_grant_table,
- XENMEM_resource_grant_table_id_shared, 0, 1, &addr,
- PROT_READ | PROT_WRITE, 0);
- if ( !fres )
- {
- if ( errno == EOPNOTSUPP )
- return is_hvm ?
- compat_gnttab_hvm_seed(xch, guest_domid,
- console_gfn, xenstore_gfn,
- console_domid, xenstore_domid) :
- compat_gnttab_seed(xch, guest_domid,
- console_gfn, xenstore_gfn,
- console_domid, xenstore_domid);
-
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: failed to acquire d%d grant table [errno=%d]\n",
- __func__, guest_domid, errno);
- return -1;
- }
-
- xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_CONSOLE,
- guest_domid, console_domid, console_gfn);
- xc_dom_set_gnttab_entry(xch, addr, GNTTAB_RESERVED_XENSTORE,
- guest_domid, xenstore_domid, xenstore_gfn);
-
- xenforeignmemory_unmap_resource(fmem, fres);
-
- return 0;
-}
-
-int xc_dom_gnttab_init(struct xc_dom_image *dom)
-{
- bool is_hvm = xc_dom_translated(dom);
- xen_pfn_t console_gfn = xc_dom_p2m(dom, dom->console_pfn);
- xen_pfn_t xenstore_gfn = xc_dom_p2m(dom, dom->xenstore_pfn);
-
- return xc_dom_gnttab_seed(dom->xch, dom->guest_domid, is_hvm,
- console_gfn, xenstore_gfn,
- dom->console_domid, dom->xenstore_domid);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * Xen domain builder -- bzImage bits
- *
- * Parse and load bzImage kernel images.
- *
- * This relies on version 2.08 of the boot protocol, which contains an
- * ELF file embedded in the bzImage. The loader extracts this ELF
- * image and passes it off to the standard ELF loader.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- * written 2007 by Jeremy Fitzhardinge <jeremy@xensource.com>
- * written 2008 by Ian Campbell <ijc@hellion.org.uk>
- * written 2009 by Chris Lalancette <clalance@redhat.com>
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress.h"
-
-#include <xen-tools/libs.h>
-
-#ifndef __MINIOS__
-
-#if defined(HAVE_BZLIB)
-
-#include <bzlib.h>
-
-static int xc_try_bzip2_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- bz_stream stream;
- int ret;
- char *out_buf;
- char *tmp_buf;
- int retval = -1;
- unsigned int outsize;
- uint64_t total;
-
- stream.bzalloc = NULL;
- stream.bzfree = NULL;
- stream.opaque = NULL;
-
- if ( dom->kernel_size == 0)
- {
- DOMPRINTF("BZIP2: Input is 0 size");
- return -1;
- }
-
- ret = BZ2_bzDecompressInit(&stream, 0, 0);
- if ( ret != BZ_OK )
- {
- DOMPRINTF("BZIP2: Error initting stream");
- return -1;
- }
-
- /* sigh. We don't know up-front how much memory we are going to need
-     * for the output buffer. Allocate an output buffer equal in size
-     * to the input buffer to start, and we'll realloc as needed.
- */
- outsize = dom->kernel_size;
-
- /*
- * stream.avail_in and outsize are unsigned int, while kernel_size
- * is a size_t. Check we aren't overflowing.
- */
- if ( outsize != dom->kernel_size )
- {
- DOMPRINTF("BZIP2: Input too large");
- goto bzip2_cleanup;
- }
-
- out_buf = malloc(outsize);
- if ( out_buf == NULL )
- {
- DOMPRINTF("BZIP2: Failed to alloc memory");
- goto bzip2_cleanup;
- }
-
- stream.next_in = dom->kernel_blob;
- stream.avail_in = dom->kernel_size;
-
- stream.next_out = out_buf;
- stream.avail_out = dom->kernel_size;
-
- for ( ; ; )
- {
- ret = BZ2_bzDecompress(&stream);
- if ( ret == BZ_STREAM_END )
- {
- DOMPRINTF("BZIP2: Saw data stream end");
- retval = 0;
- break;
- }
- if ( ret != BZ_OK )
- {
- DOMPRINTF("BZIP2: error %d", ret);
- free(out_buf);
- goto bzip2_cleanup;
- }
-
- if ( stream.avail_out == 0 )
- {
- /* Protect against output buffer overflow */
- if ( outsize > UINT_MAX / 2 )
- {
- DOMPRINTF("BZIP2: output buffer overflow");
- free(out_buf);
- goto bzip2_cleanup;
- }
-
- if ( xc_dom_kernel_check_size(dom, outsize * 2) )
- {
- DOMPRINTF("BZIP2: output too large");
- free(out_buf);
- goto bzip2_cleanup;
- }
-
- tmp_buf = realloc(out_buf, outsize * 2);
- if ( tmp_buf == NULL )
- {
- DOMPRINTF("BZIP2: Failed to realloc memory");
- free(out_buf);
- goto bzip2_cleanup;
- }
- out_buf = tmp_buf;
-
- stream.next_out = out_buf + outsize;
- stream.avail_out = (outsize * 2) - outsize;
- outsize *= 2;
- }
- else if ( stream.avail_in == 0 )
- {
- /*
- * If there is output buffer available then this indicates
- * that BZ2_bzDecompress would like more input data to be
- * provided. However our complete input buffer is in
- * memory and provided upfront so if avail_in is zero this
- * actually indicates a truncated input.
- */
- DOMPRINTF("BZIP2: not enough input");
- free(out_buf);
- goto bzip2_cleanup;
- }
- }
-
- total = (((uint64_t)stream.total_out_hi32) << 32) | stream.total_out_lo32;
-
- if ( xc_dom_register_external(dom, out_buf, total) )
- {
- DOMPRINTF("BZIP2: Error registering stream output");
- free(out_buf);
- goto bzip2_cleanup;
- }
-
- DOMPRINTF("%s: BZIP2 decompress OK, 0x%zx -> 0x%lx",
- __FUNCTION__, *size, (long unsigned int) total);
-
- *blob = out_buf;
- *size = total;
-
- bzip2_cleanup:
- BZ2_bzDecompressEnd(&stream);
-
- return retval;
-}
-
-#else /* !defined(HAVE_BZLIB) */
-
-static int xc_try_bzip2_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: BZIP2 decompress support unavailable",
- __FUNCTION__);
- return -1;
-}
-
-#endif
-
-#if defined(HAVE_LZMA)
-
-#include <lzma.h>
-
-static int _xc_try_lzma_decode(
- struct xc_dom_image *dom, void **blob, size_t *size,
- lzma_stream *stream, const char *what)
-{
- lzma_ret ret;
- lzma_action action = LZMA_RUN;
- unsigned char *out_buf;
- unsigned char *tmp_buf;
- int retval = -1;
- size_t outsize;
- const char *msg;
-
- if ( dom->kernel_size == 0)
- {
- DOMPRINTF("%s: Input is 0 size", what);
- return -1;
- }
-
- /* sigh. We don't know up-front how much memory we are going to need
-     * for the output buffer. Allocate an output buffer equal in size
-     * to the input buffer to start, and we'll realloc as needed.
- */
- outsize = dom->kernel_size;
- out_buf = malloc(outsize);
- if ( out_buf == NULL )
- {
- DOMPRINTF("%s: Failed to alloc memory", what);
- goto lzma_cleanup;
- }
-
- stream->next_in = dom->kernel_blob;
- stream->avail_in = dom->kernel_size;
-
- stream->next_out = out_buf;
- stream->avail_out = dom->kernel_size;
-
- for ( ; ; )
- {
- ret = lzma_code(stream, action);
- if ( ret == LZMA_STREAM_END )
- {
- DOMPRINTF("%s: Saw data stream end", what);
- retval = 0;
- break;
- }
- if ( ret != LZMA_OK )
- {
- switch ( ret )
- {
- case LZMA_MEM_ERROR:
- msg = strerror(ENOMEM);
- break;
-
- case LZMA_MEMLIMIT_ERROR:
- msg = "Memory usage limit reached";
- break;
-
- case LZMA_FORMAT_ERROR:
- msg = "File format not recognized";
- break;
-
- case LZMA_OPTIONS_ERROR:
- // FIXME: Better message?
- msg = "Unsupported compression options";
- break;
-
- case LZMA_DATA_ERROR:
- msg = "File is corrupt";
- break;
-
- case LZMA_BUF_ERROR:
- msg = "Unexpected end of input";
- break;
-
- default:
- msg = "Internal program error (bug)";
- break;
- }
- DOMPRINTF("%s: %s decompression error: %s",
- __FUNCTION__, what, msg);
- free(out_buf);
- goto lzma_cleanup;
- }
-
- if ( stream->avail_out == 0 )
- {
- /* Protect against output buffer overflow */
- if ( outsize > SIZE_MAX / 2 )
- {
- DOMPRINTF("%s: output buffer overflow", what);
- free(out_buf);
- goto lzma_cleanup;
- }
-
- if ( xc_dom_kernel_check_size(dom, outsize * 2) )
- {
- DOMPRINTF("%s: output too large", what);
- free(out_buf);
- goto lzma_cleanup;
- }
-
- tmp_buf = realloc(out_buf, outsize * 2);
- if ( tmp_buf == NULL )
- {
- DOMPRINTF("%s: Failed to realloc memory", what);
- free(out_buf);
- goto lzma_cleanup;
- }
- out_buf = tmp_buf;
-
- stream->next_out = out_buf + outsize;
- stream->avail_out = (outsize * 2) - outsize;
- outsize *= 2;
- }
- }
-
- if ( xc_dom_register_external(dom, out_buf, stream->total_out) )
- {
- DOMPRINTF("%s: Error registering stream output", what);
- free(out_buf);
- goto lzma_cleanup;
- }
-
- DOMPRINTF("%s: %s decompress OK, 0x%zx -> 0x%zx",
- __FUNCTION__, what, *size, (size_t)stream->total_out);
-
- *blob = out_buf;
- *size = stream->total_out;
-
- lzma_cleanup:
- lzma_end(stream);
-
- return retval;
-}
-
-/* 128 MB is the minimum size (half-way) documented to work for all inputs. */
-#define LZMA_BLOCK_SIZE (128*1024*1024)
-
-static int xc_try_xz_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- lzma_stream stream = LZMA_STREAM_INIT;
-
- if ( lzma_stream_decoder(&stream, LZMA_BLOCK_SIZE, 0) != LZMA_OK )
- {
- DOMPRINTF("XZ: Failed to init decoder");
- return -1;
- }
-
- return _xc_try_lzma_decode(dom, blob, size, &stream, "XZ");
-}
-
-static int xc_try_lzma_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- lzma_stream stream = LZMA_STREAM_INIT;
-
- if ( lzma_alone_decoder(&stream, LZMA_BLOCK_SIZE) != LZMA_OK )
- {
- DOMPRINTF("LZMA: Failed to init decoder");
- return -1;
- }
-
- return _xc_try_lzma_decode(dom, blob, size, &stream, "LZMA");
-}
-
-#else /* !defined(HAVE_LZMA) */
-
-static int xc_try_xz_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: XZ decompress support unavailable",
- __FUNCTION__);
- return -1;
-}
-
-static int xc_try_lzma_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: LZMA decompress support unavailable",
- __FUNCTION__);
- return -1;
-}
-
-#endif
-
-#if defined(HAVE_LZO1X)
-
-#include <lzo/lzo1x.h>
-
-#define LZOP_HEADER_HAS_FILTER 0x00000800
-#define LZOP_MAX_BLOCK_SIZE (64*1024*1024)
-
-static inline uint_fast16_t lzo_read_16(const unsigned char *buf)
-{
- return buf[1] | (buf[0] << 8);
-}
-
-static inline uint_fast32_t lzo_read_32(const unsigned char *buf)
-{
- return lzo_read_16(buf + 2) | ((uint32_t)lzo_read_16(buf) << 16);
-}
-
-static int xc_try_lzo1x_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- int ret;
- const unsigned char *cur = dom->kernel_blob;
- unsigned char *out_buf = NULL;
- size_t left = dom->kernel_size;
- const char *msg;
- unsigned version;
- static const unsigned char magic[] = {
- 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a
- };
-
- /*
- * lzo_uint should match size_t. Check that this is the case to be
- * sure we won't overflow various lzo_uint fields.
- */
- BUILD_BUG_ON(sizeof(lzo_uint) != sizeof(size_t));
-
- ret = lzo_init();
- if ( ret != LZO_E_OK )
- {
- DOMPRINTF("LZO1x: Failed to init library (%d)\n", ret);
- return -1;
- }
-
- if ( left < 16 || memcmp(cur, magic, 9) )
- {
- DOMPRINTF("LZO1x: Unrecognized magic\n");
- return -1;
- }
-
- /* get version (2bytes), skip library version (2),
- * 'need to be extracted' version (2) and method (1) */
- version = lzo_read_16(cur + 9);
- cur += 16;
- left -= 16;
-
- if ( version >= 0x0940 )
- {
- /* skip level */
- ++cur;
- if ( left )
- --left;
- }
-
- if ( left >= 4 && (lzo_read_32(cur) & LZOP_HEADER_HAS_FILTER) )
- ret = 8; /* flags + filter info */
- else
- ret = 4; /* flags */
-
- /* skip mode and mtime_low */
- ret += 8;
- if ( version >= 0x0940 )
- ret += 4; /* skip mtime_high */
-
- /* don't care about the file name, and skip checksum */
- if ( left > ret )
- ret += 1 + cur[ret] + 4;
-
- if ( left < ret )
- {
- DOMPRINTF("LZO1x: Incomplete header\n");
- return -1;
- }
- cur += ret;
- left -= ret;
-
- for ( *size = 0; ; )
- {
- lzo_uint src_len, dst_len, out_len;
- unsigned char *tmp_buf;
-
- msg = "Short input";
- if ( left < 4 )
- break;
-
- dst_len = lzo_read_32(cur);
- if ( !dst_len )
- {
- msg = "Error registering stream output";
- if ( xc_dom_register_external(dom, out_buf, *size) )
- break;
-
- return 0;
- }
-
- if ( dst_len > LZOP_MAX_BLOCK_SIZE )
- {
- msg = "Block size too large";
- break;
- }
-
- if ( left < 12 )
- break;
-
- src_len = lzo_read_32(cur + 4);
- cur += 12; /* also skip block checksum info */
- left -= 12;
-
- msg = "Bad source length";
- if ( src_len <= 0 || src_len > dst_len || src_len > left )
- break;
-
- msg = "Output buffer overflow";
- if ( *size > SIZE_MAX - dst_len )
- break;
-
- msg = "Decompressed image too large";
- if ( xc_dom_kernel_check_size(dom, *size + dst_len) )
- break;
-
- msg = "Failed to (re)alloc memory";
- tmp_buf = realloc(out_buf, *size + dst_len);
- if ( tmp_buf == NULL )
- break;
-
- out_buf = tmp_buf;
- out_len = dst_len;
-
- ret = lzo1x_decompress_safe(cur, src_len,
- out_buf + *size, &out_len, NULL);
- switch ( ret )
- {
- case LZO_E_OK:
- msg = "Input underrun";
- if ( out_len != dst_len )
- break;
-
- *blob = out_buf;
- *size += out_len;
- cur += src_len;
- left -= src_len;
- continue;
-
- case LZO_E_INPUT_NOT_CONSUMED:
- msg = "Unconsumed input";
- break;
-
- case LZO_E_OUTPUT_OVERRUN:
- msg = "Output overrun";
- break;
-
- case LZO_E_INPUT_OVERRUN:
- msg = "Input overrun";
- break;
-
- case LZO_E_LOOKBEHIND_OVERRUN:
- msg = "Look-behind overrun";
- break;
-
- case LZO_E_EOF_NOT_FOUND:
- msg = "No EOF marker";
- break;
-
- case LZO_E_ERROR:
- msg = "General error";
- break;
-
- default:
- msg = "Internal program error (bug)";
- break;
- }
-
- break;
- }
-
- free(out_buf);
- DOMPRINTF("LZO1x decompression error: %s\n", msg);
-
- return -1;
-}
-
-#else /* !defined(HAVE_LZO1X) */
-
-static int xc_try_lzo1x_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: LZO1x decompress support unavailable\n",
- __FUNCTION__);
- return -1;
-}
-
-#endif
-
-#else /* __MINIOS__ */
-
-int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size);
-int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size);
-int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size);
-int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size);
-
-#endif /* !__MINIOS__ */
-
-struct setup_header {
- uint8_t _pad0[0x1f1]; /* skip uninteresting stuff */
- uint8_t setup_sects;
- uint16_t root_flags;
- uint32_t syssize;
- uint16_t ram_size;
- uint16_t vid_mode;
- uint16_t root_dev;
- uint16_t boot_flag;
- uint16_t jump;
- uint32_t header;
-#define HDR_MAGIC "HdrS"
-#define HDR_MAGIC_SZ 4
- uint16_t version;
-#define VERSION(h,l) (((h)<<8) | (l))
- uint32_t realmode_swtch;
- uint16_t start_sys;
- uint16_t kernel_version;
- uint8_t type_of_loader;
- uint8_t loadflags;
- uint16_t setup_move_size;
- uint32_t code32_start;
- uint32_t ramdisk_image;
- uint32_t ramdisk_size;
- uint32_t bootsect_kludge;
- uint16_t heap_end_ptr;
- uint16_t _pad1;
- uint32_t cmd_line_ptr;
- uint32_t initrd_addr_max;
- uint32_t kernel_alignment;
- uint8_t relocatable_kernel;
- uint8_t _pad2[3];
- uint32_t cmdline_size;
- uint32_t hardware_subarch;
- uint64_t hardware_subarch_data;
- uint32_t payload_offset;
- uint32_t payload_length;
-} __attribute__((packed));
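
As a worked example of the layout this struct describes (values invented): a bzImage with setup_sects = 26 and payload_offset = 0x2c0 keeps its compressed payload (26 + 1) * 512 + 0x2c0 = 0x38c0 bytes into the file, which is exactly the computation the probe function below performs before range-checking the result against dom->kernel_size.
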
-
-extern struct xc_dom_loader elf_loader;
-
-static int check_magic(struct xc_dom_image *dom, const void *magic, size_t len)
-{
- if (len > dom->kernel_size)
- return 0;
-
- return (memcmp(dom->kernel_blob, magic, len) == 0);
-}
-
-static int xc_dom_probe_bzimage_kernel(struct xc_dom_image *dom)
-{
- struct setup_header *hdr;
- uint64_t payload_offset, payload_length;
- int ret;
-
- if ( dom->kernel_blob == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: no kernel image loaded", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( dom->kernel_size < sizeof(struct setup_header) )
- {
- xc_dom_printf(dom->xch, "%s: kernel image too small", __FUNCTION__);
- return -EINVAL;
- }
-
- hdr = dom->kernel_blob;
-
- if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 )
- {
- xc_dom_printf(dom->xch, "%s: kernel is not a bzImage", __FUNCTION__);
- return -EINVAL;
- }
-
- if ( hdr->version < VERSION(2,8) )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: boot protocol"
- " too old (%04x)", __FUNCTION__, hdr->version);
- return -EINVAL;
- }
-
-
- /* upcast to 64 bits to avoid overflow */
- /* setup_sects is u8 and so cannot overflow */
- payload_offset = (hdr->setup_sects + 1) * 512;
- payload_offset += hdr->payload_offset;
- payload_length = hdr->payload_length;
-
- if ( payload_offset >= dom->kernel_size )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload offset overflow",
- __FUNCTION__);
- return -EINVAL;
- }
- if ( (payload_offset + payload_length) > dom->kernel_size )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: payload length overflow",
- __FUNCTION__);
- return -EINVAL;
- }
-
- dom->kernel_blob = dom->kernel_blob + payload_offset;
- dom->kernel_size = payload_length;
-
- if ( check_magic(dom, "\037\213", 2) )
- {
- ret = xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret == -1 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: unable to"
- " gzip decompress kernel", __FUNCTION__);
- return -EINVAL;
- }
- }
- else if ( check_magic(dom, "\102\132\150", 3) )
- {
- ret = xc_try_bzip2_decode(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret < 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s unable to BZIP2 decompress kernel",
- __FUNCTION__);
- return -EINVAL;
- }
- }
- else if ( check_magic(dom, "\3757zXZ", 6) )
- {
- ret = xc_try_xz_decode(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret < 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s unable to XZ decompress kernel",
- __FUNCTION__);
- return -EINVAL;
- }
- }
- else if ( check_magic(dom, "\135\000", 2) )
- {
- ret = xc_try_lzma_decode(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret < 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s unable to LZMA decompress kernel",
- __FUNCTION__);
- return -EINVAL;
- }
- }
- else if ( check_magic(dom, "\x89LZO", 5) )
- {
- ret = xc_try_lzo1x_decode(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret < 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s unable to LZO decompress kernel\n",
- __FUNCTION__);
- return -EINVAL;
- }
- }
- else if ( check_magic(dom, "\x02\x21", 2) )
- {
- ret = xc_try_lz4_decode(dom, &dom->kernel_blob, &dom->kernel_size);
- if ( ret < 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s unable to LZ4 decompress kernel\n",
- __FUNCTION__);
- return -EINVAL;
- }
- }
- else
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: unknown compression format", __FUNCTION__);
- return -EINVAL;
- }
-
- return elf_loader.probe(dom);
-}
-
-static int xc_dom_parse_bzimage_kernel(struct xc_dom_image *dom)
-{
- return elf_loader.parser(dom);
-}
-
-static int xc_dom_load_bzimage_kernel(struct xc_dom_image *dom)
-{
- return elf_loader.loader(dom);
-}
-
-static struct xc_dom_loader bzimage_loader = {
- .name = "Linux bzImage",
- .probe = xc_dom_probe_bzimage_kernel,
- .parser = xc_dom_parse_bzimage_kernel,
- .loader = xc_dom_load_bzimage_kernel,
-};
-
-static void __init register_loader(void)
-{
- xc_dom_register_loader(&bzimage_loader);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * Xen domain builder -- compatibility code.
- *
- * Replacements for xc_linux_build & friends,
- * as example code and to make the new builder
- * usable as drop-in replacement.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <zlib.h>
-
-#include "xenctrl.h"
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-
-/* ------------------------------------------------------------------------ */
-
-int xc_linux_build(xc_interface *xch, uint32_t domid,
- unsigned int mem_mb,
- const char *image_name,
- const char *initrd_name,
- const char *cmdline,
- const char *features,
- unsigned long flags,
- unsigned int store_evtchn,
- unsigned long *store_mfn,
- unsigned int console_evtchn,
- unsigned long *console_mfn)
-{
- struct xc_dom_image *dom;
- int rc;
-
- xc_dom_loginit(xch);
- dom = xc_dom_allocate(xch, cmdline, features);
- if (dom == NULL)
- return -1;
- if ( (rc = xc_dom_kernel_file(dom, image_name)) != 0 )
- goto out;
- if ( initrd_name && strlen(initrd_name) &&
- ((rc = xc_dom_module_file(dom, initrd_name, NULL)) != 0) )
- goto out;
-
- dom->flags |= flags;
- dom->console_evtchn = console_evtchn;
- dom->xenstore_evtchn = store_evtchn;
-
- if ( (rc = xc_dom_boot_xen_init(dom, xch, domid)) != 0 )
- goto out;
- if ( (rc = xc_dom_parse_image(dom)) != 0 )
- goto out;
- if ( (rc = xc_dom_mem_init(dom, mem_mb)) != 0 )
- goto out;
- if ( (rc = xc_dom_boot_mem_init(dom)) != 0 )
- goto out;
- if ( (rc = xc_dom_build_image(dom)) != 0 )
- goto out;
- if ( (rc = xc_dom_boot_image(dom)) != 0 )
- goto out;
- if ( (rc = xc_dom_gnttab_init(dom)) != 0)
- goto out;
-
- *console_mfn = xc_dom_p2m(dom, dom->console_pfn);
- *store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
-
- out:
- xc_dom_release(dom);
- return rc;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
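
For illustration, a caller would drive the compat wrapper above roughly as follows; the memory size, file paths and event-channel ports are placeholders, and error handling is reduced to the return code:

#include <xenctrl.h>

/* Sketch only: build a PV domain with placeholder parameters. */
static int build_example(uint32_t domid)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    unsigned long store_mfn = 0, console_mfn = 0;
    int rc;

    if ( xch == NULL )
        return -1;

    rc = xc_linux_build(xch, domid, 256 /* MB */,
                        "/path/to/vmlinuz", "/path/to/initrd",
                        "root=/dev/xvda1 ro", NULL /* features */,
                        0 /* flags */,
                        1 /* store_evtchn */, &store_mfn,
                        2 /* console_evtchn */, &console_mfn);

    xc_interface_close(xch);
    return rc;
}
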
+++ /dev/null
-/*
- * Xen domain builder -- core bits.
- *
- * The core code goes here:
- * - allocate and release domain structs.
- * - memory management functions.
- * - misc helper functions.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdarg.h>
-#include <inttypes.h>
-#include <zlib.h>
-#include <assert.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "_paths.h"
-
-/* ------------------------------------------------------------------------ */
-/* debugging */
-
-static const char *default_logfile = XEN_LOG_DIR "/domain-builder-ng.log";
-
-int xc_dom_loginit(xc_interface *xch) {
- if (xch->dombuild_logger) return 0;
-
- if (!xch->dombuild_logger_file) {
- xch->dombuild_logger_file = fopen(default_logfile, "a");
- if (!xch->dombuild_logger_file) {
- PERROR("Could not open logfile `%s'", default_logfile);
- return -1;
- }
- }
-
- xch->dombuild_logger = xch->dombuild_logger_tofree =
- (xentoollog_logger*)
- xtl_createlogger_stdiostream(xch->dombuild_logger_file, XTL_DETAIL,
- XTL_STDIOSTREAM_SHOW_DATE|XTL_STDIOSTREAM_SHOW_PID);
- if (!xch->dombuild_logger)
- return -1;
-
- xc_dom_printf(xch, "### ----- xc domain builder logfile opened -----");
-
- return 0;
-}
-
-void xc_dom_printf(xc_interface *xch, const char *fmt, ...)
-{
- va_list args;
- if (!xch->dombuild_logger) return;
- va_start(args, fmt);
- xtl_logv(xch->dombuild_logger, XTL_DETAIL, -1, "domainbuilder", fmt, args);
- va_end(args);
-}
-
-void xc_dom_panic_func(xc_interface *xch,
- const char *file, int line, xc_error_code err,
- const char *fmt, ...)
-{
- va_list args;
- char msg[XC_MAX_ERROR_MSG_LEN];
-
- va_start(args, fmt);
- vsnprintf(msg, sizeof(msg), fmt, args);
- va_end(args);
- msg[sizeof(msg)-1] = 0;
-
- xc_report(xch,
- xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler,
- XTL_ERROR, err, "panic: %s:%d: %s",
- file, line, msg);
-}
-
-static void print_mem(struct xc_dom_image *dom, const char *name, size_t mem)
-{
- if ( mem > (32 * 1024 * 1024) )
- DOMPRINTF("%-24s : %zd MB", name, mem / (1024 * 1024));
- else if ( mem > (32 * 1024) )
- DOMPRINTF("%-24s : %zd kB", name, mem / 1024);
- else
- DOMPRINTF("%-24s : %zd bytes", name, mem);
-}
-
-void xc_dom_log_memory_footprint(struct xc_dom_image *dom)
-{
- DOMPRINTF("domain builder memory footprint");
- DOMPRINTF(" allocated");
- print_mem(dom, " malloc", dom->alloc_malloc);
- print_mem(dom, " anon mmap", dom->alloc_mem_map);
- DOMPRINTF(" mapped");
- print_mem(dom, " file mmap", dom->alloc_file_map);
- print_mem(dom, " domU mmap", dom->alloc_domU_map);
-}
-
-/* ------------------------------------------------------------------------ */
-/* simple memory pool */
-
-void *xc_dom_malloc(struct xc_dom_image *dom, size_t size)
-{
- struct xc_dom_mem *block;
-
- if ( size > SIZE_MAX - sizeof(*block) )
- {
- DOMPRINTF("%s: unreasonable allocation size", __FUNCTION__);
- return NULL;
- }
- block = malloc(sizeof(*block) + size);
- if ( block == NULL )
- {
- DOMPRINTF("%s: allocation failed", __FUNCTION__);
- return NULL;
- }
- memset(block, 0, sizeof(*block) + size);
- block->type = XC_DOM_MEM_TYPE_MALLOC_INTERNAL;
- block->next = dom->memblocks;
- dom->memblocks = block;
- dom->alloc_malloc += sizeof(*block) + size;
- if ( size > (100 * 1024) )
- print_mem(dom, __FUNCTION__, size);
- return block->memory;
-}
-
-void *xc_dom_malloc_page_aligned(struct xc_dom_image *dom, size_t size)
-{
- struct xc_dom_mem *block;
-
- block = malloc(sizeof(*block));
- if ( block == NULL )
- {
- DOMPRINTF("%s: allocation failed", __FUNCTION__);
- return NULL;
- }
- memset(block, 0, sizeof(*block));
- block->len = size;
- block->ptr = mmap(NULL, block->len,
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
- -1, 0);
- if ( block->ptr == MAP_FAILED )
- {
- DOMPRINTF("%s: mmap failed", __FUNCTION__);
- free(block);
- return NULL;
- }
- block->type = XC_DOM_MEM_TYPE_MMAP;
- block->next = dom->memblocks;
- dom->memblocks = block;
- dom->alloc_malloc += sizeof(*block);
- dom->alloc_mem_map += block->len;
- if ( size > (100 * 1024) )
- print_mem(dom, __FUNCTION__, size);
- return block->ptr;
-}
-
-int xc_dom_register_external(struct xc_dom_image *dom, void *ptr, size_t size)
-{
- struct xc_dom_mem *block;
-
- block = malloc(sizeof(*block));
- if ( block == NULL )
- {
- DOMPRINTF("%s: allocation failed", __FUNCTION__);
- return -1;
- }
- memset(block, 0, sizeof(*block));
- block->ptr = ptr;
- block->len = size;
- block->type = XC_DOM_MEM_TYPE_MALLOC_EXTERNAL;
- block->next = dom->memblocks;
- dom->memblocks = block;
- dom->alloc_malloc += sizeof(*block);
- dom->alloc_mem_map += block->len;
- return 0;
-}
-
-void *xc_dom_malloc_filemap(struct xc_dom_image *dom,
- const char *filename, size_t * size,
- const size_t max_size)
-{
- struct xc_dom_mem *block = NULL;
- int fd = -1;
- off_t offset;
-
- fd = open(filename, O_RDONLY);
- if ( fd == -1 ) {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "failed to open file '%s': %s",
- filename, strerror(errno));
- goto err;
- }
-
- if ( (lseek(fd, 0, SEEK_SET) == -1) ||
- ((offset = lseek(fd, 0, SEEK_END)) == -1) ) {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "failed to seek on file '%s': %s",
- filename, strerror(errno));
- goto err;
- }
-
- *size = offset;
-
- if ( max_size && *size > max_size )
- {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "tried to map file which is too large");
- goto err;
- }
-
- if ( !*size )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "'%s': zero length file", filename);
- goto err;
- }
-
- block = malloc(sizeof(*block));
- if ( block == NULL ) {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "failed to allocate block (%zu bytes)",
- sizeof(*block));
- goto err;
- }
-
- memset(block, 0, sizeof(*block));
- block->len = *size;
- block->ptr = mmap(NULL, block->len, PROT_READ,
- MAP_SHARED, fd, 0);
- if ( block->ptr == MAP_FAILED ) {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "failed to mmap file '%s': %s",
- filename, strerror(errno));
- goto err;
- }
-
- block->type = XC_DOM_MEM_TYPE_MMAP;
- block->next = dom->memblocks;
- dom->memblocks = block;
- dom->alloc_malloc += sizeof(*block);
- dom->alloc_file_map += block->len;
- close(fd);
- if ( *size > (100 * 1024) )
- print_mem(dom, __FUNCTION__, *size);
- return block->ptr;
-
- err:
- if ( fd != -1 )
- close(fd);
- free(block);
- DOMPRINTF("%s: failed (on file `%s')", __FUNCTION__, filename);
- return NULL;
-}
-
-static void xc_dom_free_all(struct xc_dom_image *dom)
-{
- struct xc_dom_mem *block;
-
- while ( (block = dom->memblocks) != NULL )
- {
- dom->memblocks = block->next;
- switch ( block->type )
- {
- case XC_DOM_MEM_TYPE_MALLOC_INTERNAL:
- break;
- case XC_DOM_MEM_TYPE_MALLOC_EXTERNAL:
- free(block->ptr);
- break;
- case XC_DOM_MEM_TYPE_MMAP:
- munmap(block->ptr, block->len);
- break;
- }
- free(block);
- }
-}
-
-char *xc_dom_strdup(struct xc_dom_image *dom, const char *str)
-{
- size_t len = strlen(str) + 1;
- char *nstr = xc_dom_malloc(dom, len);
-
- if ( nstr == NULL )
- return NULL;
- memcpy(nstr, str, len);
- return nstr;
-}
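
A usage sketch of the pool above (identifiers other than dom are invented): blocks are chained onto dom->memblocks, so callers never free them individually; xc_dom_release(), further down in this file, reclaims the whole chain:

/* Sketch only: pool allocations share the xc_dom_image lifetime. */
static int pool_example(struct xc_dom_image *dom)
{
    char *name = xc_dom_strdup(dom, "example");  /* pooled copy */
    void *scratch = xc_dom_malloc(dom, 4096);    /* pooled buffer */

    if ( name == NULL || scratch == NULL )
        return -1;   /* nothing to free by hand; the pool owns both */

    /* ... use name and scratch; no cleanup needed here ... */
    return 0;        /* xc_dom_release() later frees the whole chain */
}
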
-
-/* ------------------------------------------------------------------------ */
-/* decompression buffer sizing */
-int xc_dom_kernel_check_size(struct xc_dom_image *dom, size_t sz)
-{
- /* No limit */
- if ( !dom->max_kernel_size )
- return 0;
-
- if ( sz > dom->max_kernel_size )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "kernel image too large");
- return 1;
- }
-
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-/* read files, copy memory blocks, with transparent gunzip */
-
-size_t xc_dom_check_gzip(xc_interface *xch, void *blob, size_t ziplen)
-{
- unsigned char *gzlen;
- size_t unziplen;
-
- if ( ziplen < 6 )
-        /* Too small. The code below relies on having at least
-         * 2 bytes of magic number plus a 4-byte length trailer. */
- return 0;
-
- if ( strncmp(blob, "\037\213", 2) )
- /* not gzipped */
- return 0;
-
- gzlen = blob + ziplen - 4;
- unziplen = (size_t)gzlen[3] << 24 | gzlen[2] << 16 | gzlen[1] << 8 | gzlen[0];
- if ( unziplen > XC_DOM_DECOMPRESS_MAX )
- {
- xc_dom_printf
- (xch,
- "%s: size (zip %zd, unzip %zd) looks insane, skip gunzip",
- __FUNCTION__, ziplen, unziplen);
- return 0;
- }
-
- return unziplen + 16;
-}
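
Concretely (example bytes invented): a gzip stream whose final four trailer bytes are 00 10 00 00 declares a little-endian uncompressed size of 0x1000, so the function returns 0x1000 + 16, the 16 extra bytes giving the caller's allocation a little slack.
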
-
-int xc_dom_do_gunzip(xc_interface *xch,
- void *src, size_t srclen, void *dst, size_t dstlen)
-{
- z_stream zStream;
- int rc;
-
- memset(&zStream, 0, sizeof(zStream));
- zStream.next_in = src;
- zStream.avail_in = srclen;
- zStream.next_out = dst;
- zStream.avail_out = dstlen;
- rc = inflateInit2(&zStream, (MAX_WBITS + 32)); /* +32 means "handle gzip" */
- if ( rc != Z_OK )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: inflateInit2 failed (rc=%d)", __FUNCTION__, rc);
- return -1;
- }
- rc = inflate(&zStream, Z_FINISH);
- inflateEnd(&zStream);
- if ( rc != Z_STREAM_END )
- {
- xc_dom_panic(xch, XC_INTERNAL_ERROR,
- "%s: inflate failed (rc=%d)", __FUNCTION__, rc);
- return -1;
- }
-
- xc_dom_printf(xch, "%s: unzip ok, 0x%zx -> 0x%zx",
- __FUNCTION__, srclen, dstlen);
- return 0;
-}
-
-int xc_dom_try_gunzip(struct xc_dom_image *dom, void **blob, size_t * size)
-{
- void *unzip;
- size_t unziplen;
-
- unziplen = xc_dom_check_gzip(dom->xch, *blob, *size);
- if ( unziplen == 0 )
- return 0;
-
- if ( xc_dom_kernel_check_size(dom, unziplen) )
- return 0;
-
- unzip = xc_dom_malloc(dom, unziplen);
- if ( unzip == NULL )
- return -1;
-
- if ( xc_dom_do_gunzip(dom->xch, *blob, *size, unzip, unziplen) == -1 )
- return -1;
-
- *blob = unzip;
- *size = unziplen;
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-/* domain memory */
-
-void *xc_dom_pfn_to_ptr(struct xc_dom_image *dom, xen_pfn_t pfn,
- xen_pfn_t count)
-{
- xen_pfn_t count_out_dummy;
- return xc_dom_pfn_to_ptr_retcount(dom, pfn, count, &count_out_dummy);
-}
-
-void *xc_dom_pfn_to_ptr_retcount(struct xc_dom_image *dom, xen_pfn_t pfn,
- xen_pfn_t count, xen_pfn_t *count_out)
-{
- struct xc_dom_phys *phys;
- xen_pfn_t offset;
- unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom);
- char *mode = "unset";
-
- *count_out = 0;
-
- offset = pfn - dom->rambase_pfn;
- if ( offset > dom->total_pages || /* multiple checks to avoid overflows */
- count > dom->total_pages ||
- offset > dom->total_pages - count )
- {
- DOMPRINTF("%s: pfn %"PRI_xen_pfn" out of range (0x%" PRIpfn " > 0x%" PRIpfn ")",
- __FUNCTION__, pfn, offset, dom->total_pages);
- return NULL;
- }
-
- /* already allocated? */
- for ( phys = dom->phys_pages; phys != NULL; phys = phys->next )
- {
- if ( pfn >= (phys->first + phys->count) )
- continue;
- if ( count )
- {
- /* size given: must be completely within the already allocated block */
- if ( (pfn + count) <= phys->first )
- continue;
- if ( (pfn < phys->first) ||
- ((pfn + count) > (phys->first + phys->count)) )
- {
- DOMPRINTF("%s: request overlaps allocated block"
- " (req 0x%" PRIpfn "+0x%" PRIpfn ","
- " blk 0x%" PRIpfn "+0x%" PRIpfn ")",
- __FUNCTION__, pfn, count, phys->first,
- phys->count);
- return NULL;
- }
- *count_out = count;
- }
- else
- {
- /* no size given: block must be allocated already,
- just hand out a pointer to it */
- if ( pfn < phys->first )
- continue;
- if ( pfn >= phys->first + phys->count )
- continue;
- *count_out = phys->count - (pfn - phys->first);
- }
- return phys->ptr + ((pfn - phys->first) << page_shift);
- }
-
- /* allocating is allowed with size specified only */
- if ( count == 0 )
- {
- DOMPRINTF("%s: no block found, no size given,"
- " can't malloc (pfn 0x%" PRIpfn ")",
- __FUNCTION__, pfn);
- return NULL;
- }
-
- /* not found, no overlap => allocate */
- phys = xc_dom_malloc(dom, sizeof(*phys));
- if ( phys == NULL )
- return NULL;
- memset(phys, 0, sizeof(*phys));
- phys->first = pfn;
- phys->count = count;
-
- if ( dom->guest_domid )
- {
- mode = "domU mapping";
- phys->ptr = xc_dom_boot_domU_map(dom, phys->first, phys->count);
- if ( phys->ptr == NULL )
- return NULL;
- dom->alloc_domU_map += phys->count << page_shift;
- }
- else
- {
- int err;
-
- mode = "anonymous memory";
- phys->ptr = mmap(NULL, phys->count << page_shift,
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
- -1, 0);
- if ( phys->ptr == MAP_FAILED )
- {
- err = errno;
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "%s: oom: can't allocate 0x%" PRIpfn " pages"
- " [mmap, errno=%i (%s)]",
- __FUNCTION__, count, err, strerror(err));
- return NULL;
- }
- dom->alloc_mem_map += phys->count << page_shift;
- }
-
-#if 1
- DOMPRINTF("%s: %s: pfn 0x%" PRIpfn "+0x%" PRIpfn " at %p",
- __FUNCTION__, mode, phys->first, phys->count, phys->ptr);
-#endif
- phys->next = dom->phys_pages;
- dom->phys_pages = phys;
- return phys->ptr;
-}
-
-static int xc_dom_chk_alloc_pages(struct xc_dom_image *dom, char *name,
- xen_pfn_t pages)
-{
- unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
-
-    if ( pages > dom->total_pages || /* multiple tests avoid overflow problems */
- dom->pfn_alloc_end - dom->rambase_pfn > dom->total_pages ||
- pages > dom->total_pages - dom->pfn_alloc_end + dom->rambase_pfn )
- {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "%s: segment %s too large (0x%"PRIpfn" > "
- "0x%"PRIpfn" - 0x%"PRIpfn" pages)", __FUNCTION__, name,
- pages, dom->total_pages,
- dom->pfn_alloc_end - dom->rambase_pfn);
- return -1;
- }
-
- dom->pfn_alloc_end += pages;
- dom->virt_alloc_end += pages * page_size;
-
- if ( dom->allocate )
- dom->allocate(dom);
-
- return 0;
-}
-
-static int xc_dom_alloc_pad(struct xc_dom_image *dom, xen_vaddr_t boundary)
-{
- unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
- xen_pfn_t pages;
-
- if ( boundary & (page_size - 1) )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: segment boundary isn't page aligned (0x%" PRIx64 ")",
- __FUNCTION__, boundary);
- return -1;
- }
- if ( boundary < dom->virt_alloc_end )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: segment boundary too low (0x%" PRIx64 " < 0x%" PRIx64
- ")", __FUNCTION__, boundary, dom->virt_alloc_end);
- return -1;
- }
- pages = (boundary - dom->virt_alloc_end) / page_size;
-
- return xc_dom_chk_alloc_pages(dom, "padding", pages);
-}
-
-int xc_dom_alloc_segment(struct xc_dom_image *dom,
- struct xc_dom_seg *seg, char *name,
- xen_vaddr_t start, xen_vaddr_t size)
-{
- unsigned int page_size = XC_DOM_PAGE_SIZE(dom);
- xen_pfn_t pages;
- void *ptr;
-
- if ( start && xc_dom_alloc_pad(dom, start) )
- return -1;
-
- pages = (size + page_size - 1) / page_size;
- start = dom->virt_alloc_end;
-
- seg->pfn = dom->pfn_alloc_end;
- seg->pages = pages;
-
- if ( xc_dom_chk_alloc_pages(dom, name, pages) )
- return -1;
-
- /* map and clear pages */
- ptr = xc_dom_seg_to_ptr(dom, seg);
- if ( ptr == NULL )
- return -1;
- memset(ptr, 0, pages * page_size);
-
- seg->vstart = start;
- seg->vend = dom->virt_alloc_end;
-
- DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " -> 0x%" PRIx64
- " (pfn 0x%" PRIpfn " + 0x%" PRIpfn " pages)",
- __FUNCTION__, name, seg->vstart, seg->vend, seg->pfn, pages);
-
- return 0;
-}
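
A usage sketch of the allocator above (segment name and size invented): a caller reserves page-aligned guest virtual space and gets back a mapped, zeroed range:

/* Sketch only: reserve and map 16 zeroed pages for some payload. */
static int segment_example(struct xc_dom_image *dom)
{
    struct xc_dom_seg seg;

    /* start == 0 requests no explicit boundary padding. */
    if ( xc_dom_alloc_segment(dom, &seg, "example", 0,
                              16 * XC_DOM_PAGE_SIZE(dom)) )
        return -1;   /* xc_dom_panic() has already logged the reason */

    /* seg.vstart .. seg.vend now covers cleared guest memory. */
    return 0;
}
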
-
-xen_pfn_t xc_dom_alloc_page(struct xc_dom_image *dom, char *name)
-{
- xen_vaddr_t start;
- xen_pfn_t pfn;
-
- start = dom->virt_alloc_end;
- pfn = dom->pfn_alloc_end - dom->rambase_pfn;
-
- if ( xc_dom_chk_alloc_pages(dom, name, 1) )
- return INVALID_PFN;
-
- DOMPRINTF("%-20s: %-12s : 0x%" PRIx64 " (pfn 0x%" PRIpfn ")",
- __FUNCTION__, name, start, pfn);
- return pfn;
-}
-
-void xc_dom_unmap_one(struct xc_dom_image *dom, xen_pfn_t pfn)
-{
- unsigned int page_shift = XC_DOM_PAGE_SHIFT(dom);
- struct xc_dom_phys *phys, *prev = NULL;
-
- for ( phys = dom->phys_pages; phys != NULL; phys = phys->next )
- {
- if ( (pfn >= phys->first) && (pfn < (phys->first + phys->count)) )
- break;
- prev = phys;
- }
- if ( !phys )
- {
- DOMPRINTF("%s: Huh? no mapping with pfn 0x%" PRIpfn "",
- __FUNCTION__, pfn);
- return;
- }
-
- munmap(phys->ptr, phys->count << page_shift);
- if ( prev )
- prev->next = phys->next;
- else
- dom->phys_pages = phys->next;
-
- xc_domain_cacheflush(dom->xch, dom->guest_domid, phys->first, phys->count);
-}
-
-void xc_dom_unmap_all(struct xc_dom_image *dom)
-{
- while ( dom->phys_pages )
- xc_dom_unmap_one(dom, dom->phys_pages->first);
-}
-
-/* ------------------------------------------------------------------------ */
-/* pluggable kernel loaders */
-
-static struct xc_dom_loader *first_loader = NULL;
-static struct xc_dom_arch *first_hook = NULL;
-
-void xc_dom_register_loader(struct xc_dom_loader *loader)
-{
- loader->next = first_loader;
- first_loader = loader;
-}
-
-static struct xc_dom_loader *xc_dom_find_loader(struct xc_dom_image *dom)
-{
- struct xc_dom_loader *loader = first_loader;
-
- while ( loader != NULL )
- {
- DOMPRINTF("%s: trying %s loader ... ", __FUNCTION__, loader->name);
- if ( loader->probe(dom) == 0 )
- {
- DOMPRINTF("loader probe OK");
- return loader;
- }
- DOMPRINTF("loader probe failed");
- loader = loader->next;
- }
- xc_dom_panic(dom->xch,
- XC_INVALID_KERNEL, "%s: no loader found", __FUNCTION__);
- return NULL;
-}
-
-void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks)
-{
- hooks->next = first_hook;
- first_hook = hooks;
-}
-
-int xc_dom_set_arch_hooks(struct xc_dom_image *dom)
-{
- struct xc_dom_arch *hooks = first_hook;
-
- while ( hooks != NULL )
- {
- if ( !strcmp(hooks->guest_type, dom->guest_type) )
- {
- if ( hooks->arch_private_size )
- {
- dom->arch_private = malloc(hooks->arch_private_size);
- if ( dom->arch_private == NULL )
- return -1;
- memset(dom->arch_private, 0, hooks->arch_private_size);
- dom->alloc_malloc += hooks->arch_private_size;
- }
- dom->arch_hooks = hooks;
- return 0;
- }
- hooks = hooks->next;
- }
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: not found (type %s)", __FUNCTION__, dom->guest_type);
- return -1;
-}
-
-/* ------------------------------------------------------------------------ */
-/* public interface */
-
-void xc_dom_release(struct xc_dom_image *dom)
-{
- DOMPRINTF_CALLED(dom->xch);
- if ( dom->phys_pages )
- xc_dom_unmap_all(dom);
- xc_dom_free_all(dom);
- free(dom->arch_private);
- free(dom);
-}
-
-struct xc_dom_image *xc_dom_allocate(xc_interface *xch,
- const char *cmdline, const char *features)
-{
- struct xc_dom_image *dom;
-
- xc_dom_printf(xch, "%s: cmdline=\"%s\", features=\"%s\"",
- __FUNCTION__, cmdline ? cmdline : "",
- features ? features : "");
- dom = malloc(sizeof(*dom));
- if ( !dom )
- goto err;
-
- memset(dom, 0, sizeof(*dom));
- dom->xch = xch;
-
- dom->max_kernel_size = XC_DOM_DECOMPRESS_MAX;
- dom->max_module_size = XC_DOM_DECOMPRESS_MAX;
- dom->max_devicetree_size = XC_DOM_DECOMPRESS_MAX;
-
- if ( cmdline )
- dom->cmdline = xc_dom_strdup(dom, cmdline);
- if ( features )
- elf_xen_parse_features(features, dom->f_requested, NULL);
-
- dom->parms.virt_base = UNSET_ADDR;
- dom->parms.virt_entry = UNSET_ADDR;
- dom->parms.virt_hypercall = UNSET_ADDR;
- dom->parms.virt_hv_start_low = UNSET_ADDR;
- dom->parms.elf_paddr_offset = UNSET_ADDR;
- dom->parms.p2m_base = UNSET_ADDR;
-
- dom->flags = SIF_VIRT_P2M_4TOOLS;
-
- dom->alloc_malloc += sizeof(*dom);
- return dom;
-
- err:
- if ( dom )
- xc_dom_release(dom);
- return NULL;
-}
-
-int xc_dom_kernel_max_size(struct xc_dom_image *dom, size_t sz)
-{
- DOMPRINTF("%s: kernel_max_size=%zx", __FUNCTION__, sz);
- dom->max_kernel_size = sz;
- return 0;
-}
-
-int xc_dom_module_max_size(struct xc_dom_image *dom, size_t sz)
-{
- DOMPRINTF("%s: module_max_size=%zx", __FUNCTION__, sz);
- dom->max_module_size = sz;
- return 0;
-}
-
-int xc_dom_devicetree_max_size(struct xc_dom_image *dom, size_t sz)
-{
- DOMPRINTF("%s: devicetree_max_size=%zx", __FUNCTION__, sz);
- dom->max_devicetree_size = sz;
- return 0;
-}
-
-int xc_dom_kernel_file(struct xc_dom_image *dom, const char *filename)
-{
- DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
- dom->kernel_blob = xc_dom_malloc_filemap(dom, filename, &dom->kernel_size,
- dom->max_kernel_size);
- if ( dom->kernel_blob == NULL )
- return -1;
- return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
-}
-
-int xc_dom_module_file(struct xc_dom_image *dom, const char *filename, const char *cmdline)
-{
- unsigned int mod = dom->num_modules++;
-
- DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
- dom->modules[mod].blob =
- xc_dom_malloc_filemap(dom, filename, &dom->modules[mod].size,
- dom->max_module_size);
-
- if ( dom->modules[mod].blob == NULL )
- return -1;
-
- if ( cmdline )
- {
- dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline);
-
- if ( dom->modules[mod].cmdline == NULL )
- return -1;
- }
- else
- {
- dom->modules[mod].cmdline = NULL;
- }
-
- return 0;
-}
-
-int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename)
-{
-#if defined (__arm__) || defined(__aarch64__)
- DOMPRINTF("%s: filename=\"%s\"", __FUNCTION__, filename);
- dom->devicetree_blob =
- xc_dom_malloc_filemap(dom, filename, &dom->devicetree_size,
- dom->max_devicetree_size);
-
- if ( dom->devicetree_blob == NULL )
- return -1;
- return 0;
-#else
- errno = EINVAL;
- return -1;
-#endif
-}
-
-int xc_dom_kernel_mem(struct xc_dom_image *dom, const void *mem, size_t memsize)
-{
- DOMPRINTF_CALLED(dom->xch);
- dom->kernel_blob = (void *)mem;
- dom->kernel_size = memsize;
- return xc_dom_try_gunzip(dom, &dom->kernel_blob, &dom->kernel_size);
-}
-
-int xc_dom_module_mem(struct xc_dom_image *dom, const void *mem,
- size_t memsize, const char *cmdline)
-{
- unsigned int mod = dom->num_modules++;
-
- DOMPRINTF_CALLED(dom->xch);
-
- dom->modules[mod].blob = (void *)mem;
- dom->modules[mod].size = memsize;
-
- if ( cmdline )
- {
- dom->modules[mod].cmdline = xc_dom_strdup(dom, cmdline);
-
- if ( dom->modules[mod].cmdline == NULL )
- return -1;
- }
- else
- {
- dom->modules[mod].cmdline = NULL;
- }
-
- return 0;
-}
-
-int xc_dom_devicetree_mem(struct xc_dom_image *dom, const void *mem,
- size_t memsize)
-{
- DOMPRINTF_CALLED(dom->xch);
- dom->devicetree_blob = (void *)mem;
- dom->devicetree_size = memsize;
- return 0;
-}
-
-int xc_dom_parse_image(struct xc_dom_image *dom)
-{
- int i;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* parse kernel image */
- dom->kernel_loader = xc_dom_find_loader(dom);
- if ( dom->kernel_loader == NULL )
- goto err;
- if ( dom->kernel_loader->parser(dom) != 0 )
- goto err;
- if ( dom->guest_type == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: guest_type not set", __FUNCTION__);
- goto err;
- }
-
- /* check features */
- for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
- {
- dom->f_active[i] |= dom->f_requested[i]; /* cmd line */
- dom->f_active[i] |= dom->parms.f_required[i]; /* kernel */
- if ( (dom->f_active[i] & dom->parms.f_supported[i]) !=
- dom->f_active[i] )
- {
- xc_dom_panic(dom->xch, XC_INVALID_PARAM,
- "%s: unsupported feature requested", __FUNCTION__);
- goto err;
- }
- }
- return 0;
-
- err:
- return -1;
-}
-
-int xc_dom_rambase_init(struct xc_dom_image *dom, uint64_t rambase)
-{
- dom->rambase_pfn = rambase >> XC_PAGE_SHIFT;
- dom->pfn_alloc_end = dom->rambase_pfn;
- DOMPRINTF("%s: RAM starts at %"PRI_xen_pfn,
- __FUNCTION__, dom->rambase_pfn);
- return 0;
-}
-
-int xc_dom_mem_init(struct xc_dom_image *dom, unsigned int mem_mb)
-{
- unsigned int page_shift;
- xen_pfn_t nr_pages;
-
- if ( xc_dom_set_arch_hooks(dom) )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set",
- __FUNCTION__);
- return -1;
- }
-
- page_shift = XC_DOM_PAGE_SHIFT(dom);
- nr_pages = mem_mb << (20 - page_shift);
-
- DOMPRINTF("%s: mem %d MB, pages 0x%" PRIpfn " pages, %dk each",
- __FUNCTION__, mem_mb, nr_pages, 1 << (page_shift-10));
- dom->total_pages = nr_pages;
-
- DOMPRINTF("%s: 0x%" PRIpfn " pages",
- __FUNCTION__, dom->total_pages);
-
- return 0;
-}
-
-static int xc_dom_build_module(struct xc_dom_image *dom, unsigned int mod)
-{
- size_t unziplen, modulelen;
- void *modulemap;
- char name[10];
-
- if ( !dom->modules[mod].seg.vstart )
- unziplen = xc_dom_check_gzip(dom->xch,
- dom->modules[mod].blob, dom->modules[mod].size);
- else
- unziplen = 0;
-
- modulelen = max(unziplen, dom->modules[mod].size);
- if ( dom->max_module_size )
- {
- if ( unziplen && modulelen > dom->max_module_size )
- {
- modulelen = min(unziplen, dom->modules[mod].size);
- if ( unziplen > modulelen )
- unziplen = 0;
- }
- if ( modulelen > dom->max_module_size )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "module %u image too large", mod);
- goto err;
- }
- }
-
- snprintf(name, sizeof(name), "module%u", mod);
- if ( xc_dom_alloc_segment(dom, &dom->modules[mod].seg, name,
- dom->modules[mod].seg.vstart, modulelen) != 0 )
- goto err;
- modulemap = xc_dom_seg_to_ptr(dom, &dom->modules[mod].seg);
- if ( modulemap == NULL )
- {
- DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->modules[%u].seg) => NULL",
- __FUNCTION__, mod);
- goto err;
- }
- if ( unziplen )
- {
- if ( xc_dom_do_gunzip(dom->xch, dom->modules[mod].blob, dom->modules[mod].size,
- modulemap, unziplen) != -1 )
- return 0;
- if ( dom->modules[mod].size > modulelen )
- goto err;
- }
-
- /* Fall back to handing over the raw blob. */
- memcpy(modulemap, dom->modules[mod].blob, dom->modules[mod].size);
- /* If an unzip attempt was made, the buffer may no longer be all zero. */
- if ( unziplen > dom->modules[mod].size )
- memset(modulemap + dom->modules[mod].size, 0,
- unziplen - dom->modules[mod].size);
-
- return 0;
-
- err:
- return -1;
-}
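
The sizing logic above is subtle: it reserves room for the decompressed module when possible, but if that would exceed max_module_size it falls back to handing over the compressed blob, clearing unziplen so the gunzip pass is skipped. A condensed sketch of just that decision (a hypothetical helper mirroring the max/min dance in xc_dom_build_module(), not part of the library):

    #include <stddef.h>

    /*
     * Returns the segment size to reserve for a module, or 0 if it cannot
     * fit.  *unziplen is cleared when the compressed blob must be handed
     * over as-is.
     */
    static size_t module_seg_size(size_t rawlen, size_t *unziplen, size_t max)
    {
        size_t seglen = *unziplen > rawlen ? *unziplen : rawlen;

        if ( max )
        {
            if ( *unziplen && seglen > max )
            {
                /* Decompressed form too big: prefer the smaller form. */
                seglen = *unziplen < rawlen ? *unziplen : rawlen;
                if ( *unziplen > seglen )
                    *unziplen = 0;   /* give up on decompressing */
            }
            if ( seglen > max )
                return 0;            /* even the raw blob is too large */
        }
        return seglen;
    }
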
-
-static int populate_acpi_pages(struct xc_dom_image *dom,
- xen_pfn_t *extents,
- unsigned int num_pages)
-{
- int rc;
- xc_interface *xch = dom->xch;
- uint32_t domid = dom->guest_domid;
- unsigned long idx;
- unsigned long first_high_idx = 4UL << (30 - PAGE_SHIFT); /* 4GB */
-
- for ( ; num_pages; num_pages--, extents++ )
- {
-
- if ( xc_domain_populate_physmap(xch, domid, 1, 0, 0, extents) == 1 )
- continue;
-
- if ( dom->highmem_end )
- {
- idx = --dom->highmem_end;
- if ( idx == first_high_idx )
- dom->highmem_end = 0;
- }
- else
- {
- idx = --dom->lowmem_end;
- }
-
- rc = xc_domain_add_to_physmap(xch, domid,
- XENMAPSPACE_gmfn,
- idx, *extents);
- if ( rc )
- return rc;
- }
-
- return 0;
-}
-
-static int xc_dom_load_acpi(struct xc_dom_image *dom)
-{
- int j, i = 0;
- unsigned num_pages;
- xen_pfn_t *extents, base;
- void *ptr;
-
- while ( (i < MAX_ACPI_MODULES) && dom->acpi_modules[i].length )
- {
- DOMPRINTF("%s: %d bytes at address %" PRIx64, __FUNCTION__,
- dom->acpi_modules[i].length,
- dom->acpi_modules[i].guest_addr_out);
-
- num_pages = (dom->acpi_modules[i].length +
- (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK) +
- (XC_PAGE_SIZE - 1)) >> XC_PAGE_SHIFT;
- extents = malloc(num_pages * sizeof(*extents));
- if ( !extents )
- {
- DOMPRINTF("%s: Out of memory", __FUNCTION__);
- goto err;
- }
-
- base = dom->acpi_modules[i].guest_addr_out >> XC_PAGE_SHIFT;
- for ( j = 0; j < num_pages; j++ )
- extents[j] = base + j;
- if ( populate_acpi_pages(dom, extents, num_pages) )
- {
- DOMPRINTF("%s: Can populate ACPI pages", __FUNCTION__);
- goto err;
- }
-
- ptr = xc_map_foreign_range(dom->xch, dom->guest_domid,
- XC_PAGE_SIZE * num_pages,
- PROT_READ | PROT_WRITE, base);
- if ( !ptr )
- {
- DOMPRINTF("%s: Can't map %d pages at 0x%"PRI_xen_pfn,
- __FUNCTION__, num_pages, base);
- goto err;
- }
-
- memcpy((uint8_t *)ptr +
- (dom->acpi_modules[i].guest_addr_out & ~XC_PAGE_MASK),
- dom->acpi_modules[i].data, dom->acpi_modules[i].length);
- munmap(ptr, XC_PAGE_SIZE * num_pages);
-
- free(extents);
- i++;
- }
-
- return 0;
-
-err:
- free(extents);
- return -1;
-}
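
The num_pages computation above has to account for an ACPI module that starts part-way into a page: the offset within the first page is added to the length before rounding up. A minimal standalone sketch of the same arithmetic, assuming 4 KiB pages:

    #include <assert.h>
    #include <stdint.h>

    #define PG_SHIFT 12
    #define PG_SIZE  (1UL << PG_SHIFT)
    #define PG_MASK  (~(PG_SIZE - 1))

    /* Pages spanned by [addr, addr + length), partial pages included. */
    static unsigned long pages_spanned(uint64_t addr, uint64_t length)
    {
        return (length + (addr & ~PG_MASK) + (PG_SIZE - 1)) >> PG_SHIFT;
    }

    int main(void)
    {
        /* A 4 KiB blob starting 16 bytes into a page straddles two pages. */
        assert(pages_spanned(0x1010, 0x1000) == 2);
        /* A page-aligned 8 KiB blob needs exactly two pages. */
        assert(pages_spanned(0x2000, 0x2000) == 2);
        return 0;
    }
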
-
-int xc_dom_build_image(struct xc_dom_image *dom)
-{
- unsigned int page_size;
- bool unmapped_initrd;
- unsigned int mod;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* check for arch hooks */
- if ( dom->arch_hooks == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR, "%s: arch hooks not set",
- __FUNCTION__);
- goto err;
- }
- page_size = XC_DOM_PAGE_SIZE(dom);
- if ( dom->parms.virt_base != UNSET_ADDR )
- dom->virt_alloc_end = dom->parms.virt_base;
-
- /* load kernel */
- if ( xc_dom_alloc_segment(dom, &dom->kernel_seg, "kernel",
- dom->kernel_seg.vstart,
- dom->kernel_seg.vend -
- dom->kernel_seg.vstart) != 0 )
- goto err;
- if ( dom->kernel_loader->loader(dom) != 0 )
- goto err;
-
- /* Don't load ramdisk / other modules now if no initial mapping required. */
- for ( mod = 0; mod < dom->num_modules; mod++ )
- {
- unmapped_initrd = (dom->parms.unmapped_initrd &&
- !dom->modules[mod].seg.vstart);
-
- if ( dom->modules[mod].blob && !unmapped_initrd )
- {
- if ( xc_dom_build_module(dom, mod) != 0 )
- goto err;
-
- if ( mod == 0 )
- {
- dom->initrd_start = dom->modules[mod].seg.vstart;
- dom->initrd_len =
- dom->modules[mod].seg.vend - dom->modules[mod].seg.vstart;
- }
- }
- }
-
- /* load devicetree */
- if ( dom->devicetree_blob )
- {
- void *devicetreemap;
-
- if ( xc_dom_alloc_segment(dom, &dom->devicetree_seg, "devicetree",
- dom->devicetree_seg.vstart,
- dom->devicetree_size) != 0 )
- goto err;
- devicetreemap = xc_dom_seg_to_ptr(dom, &dom->devicetree_seg);
- if ( devicetreemap == NULL )
- {
- DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &dom->devicetree_seg) => NULL",
- __FUNCTION__);
- goto err;
- }
- memcpy(devicetreemap, dom->devicetree_blob, dom->devicetree_size);
- }
-
- /* load ACPI tables */
- if ( xc_dom_load_acpi(dom) != 0 )
- goto err;
-
- /* allocate other pages */
- if ( !dom->arch_hooks->p2m_base_supported ||
- dom->parms.p2m_base >= dom->parms.virt_base ||
- (dom->parms.p2m_base & (XC_DOM_PAGE_SIZE(dom) - 1)) )
- dom->parms.p2m_base = UNSET_ADDR;
- if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base == UNSET_ADDR &&
- dom->arch_hooks->alloc_p2m_list(dom) != 0 )
- goto err;
- if ( dom->arch_hooks->alloc_magic_pages(dom) != 0 )
- goto err;
- if ( dom->arch_hooks->alloc_pgtables &&
- dom->arch_hooks->alloc_pgtables(dom) != 0 )
- goto err;
- if ( dom->alloc_bootstack )
- {
- dom->bootstack_pfn = xc_dom_alloc_page(dom, "boot stack");
- if ( dom->bootstack_pfn == INVALID_PFN )
- goto err;
- }
-
- DOMPRINTF("%-20s: virt_alloc_end : 0x%" PRIx64 "",
- __FUNCTION__, dom->virt_alloc_end);
- DOMPRINTF("%-20s: virt_pgtab_end : 0x%" PRIx64 "",
- __FUNCTION__, dom->virt_pgtab_end);
-
- /* Make sure all memory mapped by initial page tables is available */
- if ( dom->virt_pgtab_end && xc_dom_alloc_pad(dom, dom->virt_pgtab_end) )
- return -1;
-
- for ( mod = 0; mod < dom->num_modules; mod++ )
- {
- unmapped_initrd = (dom->parms.unmapped_initrd &&
- !dom->modules[mod].seg.vstart);
-
- /* Load ramdisk / other modules if no initial mapping required. */
- if ( dom->modules[mod].blob && unmapped_initrd )
- {
- if ( xc_dom_build_module(dom, mod) != 0 )
- goto err;
-
- if ( mod == 0 )
- {
- dom->flags |= SIF_MOD_START_PFN;
- dom->initrd_start = dom->modules[mod].seg.pfn;
- dom->initrd_len = page_size * dom->modules[mod].seg.pages;
- }
- }
- }
-
- /* Allocate p2m list if outside of initial kernel mapping. */
- if ( dom->arch_hooks->alloc_p2m_list && dom->parms.p2m_base != UNSET_ADDR )
- {
- if ( dom->arch_hooks->alloc_p2m_list(dom) != 0 )
- goto err;
- dom->p2m_seg.vstart = dom->parms.p2m_base;
- }
-
- return 0;
-
- err:
- return -1;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#ifndef __MINIOS__
-# include "xenctrl_dom.h"
-#else
-# include "xg_dom_decompress_unsafe.h"
-#endif
-
-int xc_try_lz4_decode(struct xc_dom_image *dom, void **blob, size_t *size);
-
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <stdint.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress.h"
-
-#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-#define likely(a) a
-#define unlikely(a) a
-
-static inline uint_fast16_t le16_to_cpup(const unsigned char *buf)
-{
- return buf[0] | (buf[1] << 8);
-}
-
-static inline uint_fast32_t le32_to_cpup(const unsigned char *buf)
-{
- return le16_to_cpup(buf) | ((uint32_t)le16_to_cpup(buf + 2) << 16);
-}
-
-#include "../../xen/include/xen/lz4.h"
-#include "../../xen/common/decompress.h"
-
-#ifndef __MINIOS__
-
-#include "../../xen/common/lz4/decompress.c"
-
-#define ARCHIVE_MAGICNUMBER 0x184C2102
-
-int xc_try_lz4_decode(
- struct xc_dom_image *dom, void **blob, size_t *psize)
-{
- int ret = -1;
- unsigned char *inp = *blob, *output, *outp;
- ssize_t size = *psize - 4;
- size_t out_len, dest_len, chunksize;
- const char *msg;
-
- if (size < 4) {
- msg = "input too small";
- goto exit_0;
- }
-
- out_len = get_unaligned_le32(inp + size);
- if (xc_dom_kernel_check_size(dom, out_len)) {
- msg = "Decompressed image too large";
- goto exit_0;
- }
-
- output = malloc(out_len);
- if (!output) {
- msg = "Could not allocate output buffer";
- goto exit_0;
- }
- outp = output;
-
- chunksize = get_unaligned_le32(inp);
- if (chunksize == ARCHIVE_MAGICNUMBER) {
- inp += 4;
- size -= 4;
- } else {
- msg = "invalid header";
- goto exit_2;
- }
-
- for (;;) {
- if (size < 4) {
- msg = "missing data";
- goto exit_2;
- }
- chunksize = get_unaligned_le32(inp);
- if (chunksize == ARCHIVE_MAGICNUMBER) {
- inp += 4;
- size -= 4;
- continue;
- }
- inp += 4;
- size -= 4;
- if (chunksize > size) {
- msg = "insufficient input data";
- goto exit_2;
- }
-
- dest_len = out_len - (outp - output);
- ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp,
- &dest_len);
- if (ret < 0) {
- msg = "decoding failed";
- goto exit_2;
- }
-
- ret = -1;
- outp += dest_len;
- size -= chunksize;
-
- if (size == 0)
- {
- if ( xc_dom_register_external(dom, output, out_len) )
- {
- msg = "Error registering stream output";
- goto exit_2;
- }
- *blob = output;
- *psize = out_len;
- return 0;
- }
-
- if (size < 0) {
- msg = "data corrupted";
- goto exit_2;
- }
-
- inp += chunksize;
- }
-
-exit_2:
- free(output);
-exit_0:
- DOMPRINTF("LZ4 decompression error: %s\n", msg);
- return ret;
-}
-
-#else /* __MINIOS__ */
-
-#include "../../xen/common/unlz4.c"
-
-int xc_try_lz4_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- return xc_dom_decompress_unsafe(unlz4, dom, blob, size);
-}
-
-#endif
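
For reference, the decoder above consumes the legacy LZ4 container format: the magic word 0x184C2102, then chunks each prefixed by a 32-bit little-endian compressed length, with the expected total uncompressed size read from the final four bytes of the input (as appended by kernel builds). A minimal walker that validates only the framing, without decompressing, might look like this (a sketch, not part of the library):

    #include <stddef.h>
    #include <stdint.h>

    #define LZ4_LEGACY_MAGIC 0x184C2102u

    static uint32_t rd_le32(const unsigned char *p)
    {
        return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) |
               ((uint32_t)p[3] << 24);
    }

    /* Returns the number of chunks, or -1 if the framing is inconsistent. */
    static int lz4_walk_chunks(const unsigned char *in, size_t len)
    {
        size_t pos, left;
        int chunks = 0;

        if ( len < 8 || rd_le32(in) != LZ4_LEGACY_MAGIC )
            return -1;
        left = len - 4;                /* last 4 bytes: uncompressed size */
        for ( pos = 4; pos + 4 <= left; )
        {
            uint32_t csize = rd_le32(in + pos);

            pos += 4;
            if ( csize == LZ4_LEGACY_MAGIC )
                continue;              /* concatenated archives repeat it */
            if ( csize > left - pos )
                return -1;             /* chunk overruns the input */
            pos += csize;
            chunks++;
        }
        return pos == left ? chunks : -1;
    }
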
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress_unsafe.h"
-
-static struct xc_dom_image *unsafe_dom;
-static unsigned char *output_blob;
-static unsigned int output_size;
-
-static void unsafe_error(const char *msg)
-{
- xc_dom_panic(unsafe_dom->xch, XC_INVALID_KERNEL, "%s", msg);
-}
-
-static int unsafe_flush(void *src, unsigned int size)
-{
- void *n = realloc(output_blob, output_size + size);
- if (!n)
- return -1;
- output_blob = n;
-
- memcpy(&output_blob[output_size], src, size);
- output_size += size;
- return size;
-}
-
-int xc_dom_decompress_unsafe(
- decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size)
-{
- int ret;
-
- unsafe_dom = dom;
- output_blob = NULL;
- output_size = 0;
-
- ret = fn(dom->kernel_blob, dom->kernel_size, NULL, unsafe_flush, NULL, NULL, unsafe_error);
-
- if (ret)
- free(output_blob);
- else {
- *blob = output_blob;
- *size = output_size;
- }
-
- return ret;
-}
+++ /dev/null
-#include "xenctrl_dom.h"
-
-typedef int decompress_fn(unsigned char *inbuf, unsigned int len,
- int (*fill)(void*, unsigned int),
- int (*flush)(void*, unsigned int),
- unsigned char *outbuf, unsigned int *posp,
- void (*error)(const char *x));
-
-int xc_dom_decompress_unsafe(
- decompress_fn fn, struct xc_dom_image *dom, void **blob, size_t *size)
- __attribute__((visibility("internal")));
-
-int xc_try_bzip2_decode(struct xc_dom_image *dom, void **blob, size_t *size)
- __attribute__((visibility("internal")));
-int xc_try_lzma_decode(struct xc_dom_image *dom, void **blob, size_t *size)
- __attribute__((visibility("internal")));
-int xc_try_lzo1x_decode(struct xc_dom_image *dom, void **blob, size_t *size)
- __attribute__((visibility("internal")));
-int xc_try_xz_decode(struct xc_dom_image *dom, void **blob, size_t *size)
- __attribute__((visibility("internal")));
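
All of the "unsafe" decompressors plug into the decompress_fn shape above, which mirrors the hypervisor's decompression stubs: an input buffer, optional fill/flush callbacks for streaming, and an error callback. A toy decompress_fn that merely streams its input through flush illustrates the calling convention (hypothetical sketch, not part of the library):

    /* Matches the decompress_fn typedef above; a no-op "decompressor"
     * that streams its input straight through the flush callback. */
    static int copy_through(unsigned char *inbuf, unsigned int len,
                            int (*fill)(void *, unsigned int),
                            int (*flush)(void *, unsigned int),
                            unsigned char *outbuf, unsigned int *posp,
                            void (*error)(const char *x))
    {
        unsigned int done = 0;

        (void)fill;   /* streaming input not needed here */
        (void)outbuf; /* output is produced via flush() only */

        while ( done < len )
        {
            unsigned int chunk = len - done > 4096 ? 4096 : len - done;

            if ( flush(inbuf + done, chunk) != (int)chunk )
            {
                error("short flush");
                return -1;
            }
            done += chunk;
        }
        if ( posp )
            *posp = done;
        return 0;
    }

Passed as xc_dom_decompress_unsafe(copy_through, dom, &blob, &size), this would simply duplicate the kernel blob via the realloc-based unsafe_flush() above, which is exactly how the real bunzip2/unlzma/unlzo/unxz entry points are wired up.
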
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress_unsafe.h"
-
-#include "../../xen/common/bunzip2.c"
-
-int xc_try_bzip2_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- return xc_dom_decompress_unsafe(bunzip2, dom, blob, size);
-}
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress_unsafe.h"
-
-#include "../../xen/common/unlzma.c"
-
-int xc_try_lzma_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- return xc_dom_decompress_unsafe(unlzma, dom, blob, size);
-}
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <endian.h>
-#include <stdint.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress_unsafe.h"
-
-typedef uint8_t u8;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint64_t u64;
-
-#define likely(a) a
-#define noinline
-#define unlikely(a) a
-
-static inline u16 be16_to_cpup(const u16 *p)
-{
- u16 v = *p;
-#if BYTE_ORDER == LITTLE_ENDIAN
- return (((v & 0x00ffU) << 8) |
- ((v & 0xff00U) >> 8));
-#else
- return v;
-#endif
-}
-
-static inline u32 be32_to_cpup(const u32 *p)
-{
- u32 v = *p;
-#if BYTE_ORDER == LITTLE_ENDIAN
- return (((v & 0x000000ffUL) << 24) |
- ((v & 0x0000ff00UL) << 8) |
- ((v & 0x00ff0000UL) >> 8) |
- ((v & 0xff000000UL) >> 24));
-#else
- return v;
-#endif
-}
-
-#include "../../xen/common/lzo.c"
-#include "../../xen/common/unlzo.c"
-
-int xc_try_lzo1x_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- return xc_dom_decompress_unsafe(unlzo, dom, blob, size);
-}
+++ /dev/null
-#include <stdio.h>
-#include <endian.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_dom_decompress_unsafe.h"
-
-// TODO
-#define XZ_DEC_X86
-
-typedef char bool_t;
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint32_t __le32;
-
-static inline u32 cpu_to_le32(const u32 v)
-{
-#if BYTE_ORDER == BIG_ENDIAN
- return (((v & 0x000000ffUL) << 24) |
- ((v & 0x0000ff00UL) << 8) |
- ((v & 0x00ff0000UL) >> 8) |
- ((v & 0xff000000UL) >> 24));
-#else
- return v;
-#endif
-}
-
-static inline u32 le32_to_cpup(const u32 *p)
-{
- return cpu_to_le32(*p);
-}
-
-#define __force
-#define always_inline
-
-#include "../../xen/common/unxz.c"
-
-int xc_try_xz_decode(
- struct xc_dom_image *dom, void **blob, size_t *size)
-{
- return xc_dom_decompress_unsafe(unxz, dom, blob, size);
-}
+++ /dev/null
-/*
- * Xen domain builder -- ELF bits.
- *
- * Parse and load ELF kernel images.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "xc_bitops.h"
-
-#define XEN_VER "xen-3.0"
-
-/* ------------------------------------------------------------------------ */
-
-static void log_callback(struct elf_binary *elf, void *caller_data,
- bool iserr, const char *fmt, va_list al)
-{
- xc_interface *xch = caller_data;
-
- xc_reportv(xch,
- xch->dombuild_logger ? xch->dombuild_logger : xch->error_handler,
- iserr ? XTL_ERROR : XTL_DETAIL,
- iserr ? XC_INVALID_KERNEL : XC_ERROR_NONE,
- fmt, al);
-}
-
-void xc_elf_set_logfile(xc_interface *xch, struct elf_binary *elf,
- int verbose)
-{
- elf_set_log(elf, log_callback, xch, verbose /* convert to bool */);
-}
-
-/* ------------------------------------------------------------------------ */
-
-static char *xc_dom_guest_type(struct xc_dom_image *dom,
- struct elf_binary *elf)
-{
- uint64_t machine = elf_uval(elf, elf->ehdr, e_machine);
-
- if ( dom->container_type == XC_DOM_HVM_CONTAINER &&
- dom->parms.phys_entry != UNSET_ADDR32 )
- return "hvm-3.0-x86_32";
- if ( dom->container_type == XC_DOM_HVM_CONTAINER )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: image not capable of booting inside a HVM container",
- __FUNCTION__);
- return NULL;
- }
-
- switch ( machine )
- {
- case EM_386:
- switch ( dom->parms.pae )
- {
- case XEN_PAE_BIMODAL:
- if ( strstr(dom->xen_caps, "xen-3.0-x86_32p") )
- return "xen-3.0-x86_32p";
- return "xen-3.0-x86_32";
- case XEN_PAE_EXTCR3:
- case XEN_PAE_YES:
- return "xen-3.0-x86_32p";
- case XEN_PAE_NO:
- default:
- return "xen-3.0-x86_32";
- }
- case EM_X86_64:
- return "xen-3.0-x86_64";
- default:
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: unknown image type %"PRIu64,
- __FUNCTION__, machine);
- return NULL;
- }
-}
-
-/* ------------------------------------------------------------------------ */
-/* parse elf binary */
-
-static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose)
-{
- if ( dom->kernel_blob == NULL )
- {
- if ( verbose )
- xc_dom_panic(dom->xch,
- XC_INTERNAL_ERROR, "%s: no kernel image loaded",
- __FUNCTION__);
- return -EINVAL;
- }
-
- if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) )
- {
- if ( verbose )
- xc_dom_panic(dom->xch,
- XC_INVALID_KERNEL, "%s: kernel is not an ELF image",
- __FUNCTION__);
- return -EINVAL;
- }
- return 0;
-}
-
-static elf_negerrnoval xc_dom_probe_elf_kernel(struct xc_dom_image *dom)
-{
- struct elf_binary elf;
- int rc;
-
- rc = check_elf_kernel(dom, 0);
- if ( rc != 0 )
- return rc;
-
- rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size);
- if ( rc != 0 )
- return rc;
-
- /*
- * We need to check that it contains Xen ELFNOTES,
- * or else we might be trying to load a plain ELF.
- */
- elf_parse_binary(&elf);
- rc = elf_xen_parse(&elf, &dom->parms);
- if ( rc != 0 )
- return rc;
-
- return 0;
-}
-
-static elf_negerrnoval xc_dom_parse_elf_kernel(struct xc_dom_image *dom)
-{
- struct elf_binary *elf;
- elf_negerrnoval rc;
-
- rc = check_elf_kernel(dom, 1);
- if ( rc != 0 )
- return rc;
-
- elf = xc_dom_malloc(dom, sizeof(*elf));
- if ( elf == NULL )
- return -ENOMEM;
- dom->private_loader = elf;
- rc = elf_init(elf, dom->kernel_blob, dom->kernel_size) != 0 ? -EINVAL : 0;
- xc_elf_set_logfile(dom->xch, elf, 1);
- if ( rc != 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image",
- __FUNCTION__);
- return rc;
- }
-
- /* parse binary and get xen meta info */
- elf_parse_binary(elf);
- if ( elf_xen_parse(elf, &dom->parms) != 0 )
- {
- rc = -EINVAL;
- goto out;
- }
-
- if ( elf_xen_feature_get(XENFEAT_dom0, dom->parms.f_required) )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: Kernel does not"
- " support unprivileged (DomU) operation", __FUNCTION__);
- rc = -EINVAL;
- goto out;
- }
-
- /* find kernel segment */
- dom->kernel_seg.vstart = dom->parms.virt_kstart;
- dom->kernel_seg.vend = dom->parms.virt_kend;
-
- dom->guest_type = xc_dom_guest_type(dom, elf);
- if ( dom->guest_type == NULL )
- return -EINVAL;
- DOMPRINTF("%s: %s: 0x%" PRIx64 " -> 0x%" PRIx64 "",
- __FUNCTION__, dom->guest_type,
- dom->kernel_seg.vstart, dom->kernel_seg.vend);
- rc = 0;
-out:
- if ( elf_check_broken(elf) )
- DOMPRINTF("%s: ELF broken: %s", __FUNCTION__,
- elf_check_broken(elf));
-
- return rc;
-}
-
-static elf_errorstatus xc_dom_load_elf_kernel(struct xc_dom_image *dom)
-{
- struct elf_binary *elf = dom->private_loader;
- elf_errorstatus rc;
- xen_pfn_t pages;
-
- elf->dest_base = xc_dom_seg_to_ptr_pages(dom, &dom->kernel_seg, &pages);
- if ( elf->dest_base == NULL )
- {
- DOMPRINTF("%s: xc_dom_vaddr_to_ptr(dom,dom->kernel_seg)"
- " => NULL", __FUNCTION__);
- return -1;
- }
- elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom);
-
- rc = elf_load_binary(elf);
- if ( rc < 0 )
- {
- DOMPRINTF("%s: failed to load elf binary", __FUNCTION__);
- return rc;
- }
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-struct xc_dom_loader elf_loader = {
- .name = "ELF-generic",
- .probe = xc_dom_probe_elf_kernel,
- .parser = xc_dom_parse_elf_kernel,
- .loader = xc_dom_load_elf_kernel,
-};
-
-static void __init register_loader(void)
-{
- xc_dom_register_loader(&elf_loader);
-}
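
Kernel-format support is pluggable: a backend fills in struct xc_dom_loader with probe/parser/loader callbacks and chains it in from an __init constructor, as elf_loader does above. A hypothetical additional loader would follow the same pattern (sketch only; "flat_loader" and its callbacks do not exist in the tree):

    static elf_negerrnoval xc_dom_probe_flat_kernel(struct xc_dom_image *dom)
    {
        /* A real probe would inspect dom->kernel_blob here. */
        return -EINVAL; /* decline: let another loader claim the image */
    }

    static elf_negerrnoval xc_dom_parse_flat_kernel(struct xc_dom_image *dom)
    {
        return -EINVAL;
    }

    static elf_errorstatus xc_dom_load_flat_kernel(struct xc_dom_image *dom)
    {
        return -1;
    }

    static struct xc_dom_loader flat_loader = {
        .name   = "flat-binary (example)",
        .probe  = xc_dom_probe_flat_kernel,
        .parser = xc_dom_parse_flat_kernel,
        .loader = xc_dom_load_flat_kernel,
    };

    static void __init register_flat_loader(void)
    {
        xc_dom_register_loader(&flat_loader);
    }
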
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * Xen domain builder -- HVM specific bits.
- *
- * Parse and load ELF firmware images for HVM domains.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <inttypes.h>
-#include <assert.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "xc_bitops.h"
-
-/* ------------------------------------------------------------------------ */
-/* parse elf binary */
-
-static elf_negerrnoval check_elf_kernel(struct xc_dom_image *dom, bool verbose)
-{
- if ( dom->kernel_blob == NULL )
- {
- if ( verbose )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: no kernel image loaded", __func__);
- return -EINVAL;
- }
-
- if ( !elf_is_elfbinary(dom->kernel_blob, dom->kernel_size) )
- {
- if ( verbose )
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL,
- "%s: kernel is not an ELF image", __func__);
- return -EINVAL;
- }
- return 0;
-}
-
-static elf_negerrnoval xc_dom_probe_hvm_kernel(struct xc_dom_image *dom)
-{
- struct elf_binary elf;
- int rc;
-
- /* This loader is designed for HVM guest firmware. */
- if ( dom->container_type != XC_DOM_HVM_CONTAINER )
- return -EINVAL;
-
- rc = check_elf_kernel(dom, 0);
- if ( rc != 0 )
- return rc;
-
- rc = elf_init(&elf, dom->kernel_blob, dom->kernel_size);
- if ( rc != 0 )
- return rc;
-
- /*
- * We need to check that there are no Xen ELFNOTES, or
- * else we might be trying to load a PV kernel.
- */
- elf_parse_binary(&elf);
- rc = elf_xen_parse(&elf, &dom->parms);
- if ( rc == 0 )
- return -EINVAL;
-
- return 0;
-}
-
-static elf_errorstatus xc_dom_parse_hvm_kernel(struct xc_dom_image *dom)
- /*
- * This function sometimes returns -1 for error and sometimes
- * an errno value. ?!?!
- */
-{
- struct elf_binary *elf;
- elf_errorstatus rc;
-
- rc = check_elf_kernel(dom, 1);
- if ( rc != 0 )
- return rc;
-
- elf = xc_dom_malloc(dom, sizeof(*elf));
- if ( elf == NULL )
- return -1;
- dom->private_loader = elf;
- rc = elf_init(elf, dom->kernel_blob, dom->kernel_size);
- xc_elf_set_logfile(dom->xch, elf, 1);
- if ( rc != 0 )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: corrupted ELF image",
- __func__);
- return rc;
- }
-
- if ( !elf_32bit(elf) )
- {
- xc_dom_panic(dom->xch, XC_INVALID_KERNEL, "%s: ELF image is not 32bit",
- __func__);
- return -EINVAL;
- }
-
- /* parse binary and get xen meta info */
- elf_parse_binary(elf);
-
- /* find kernel segment */
- dom->kernel_seg.vstart = elf->pstart;
- dom->kernel_seg.vend = elf->pend;
-
- dom->guest_type = "hvm-3.0-x86_32";
-
- if ( elf_check_broken(elf) )
- DOMPRINTF("%s: ELF broken: %s", __func__, elf_check_broken(elf));
-
- return rc;
-}
-
-static int module_init_one(struct xc_dom_image *dom,
- struct xc_hvm_firmware_module *module,
- char *name)
-{
- struct xc_dom_seg seg;
- void *dest;
-
- if ( module->length && !module->guest_addr_out )
- {
- if ( xc_dom_alloc_segment(dom, &seg, name, 0, module->length) )
- goto err;
- dest = xc_dom_seg_to_ptr(dom, &seg);
- if ( dest == NULL )
- {
- DOMPRINTF("%s: xc_dom_seg_to_ptr(dom, &seg) => NULL",
- __FUNCTION__);
- goto err;
- }
- memcpy(dest, module->data, module->length);
- module->guest_addr_out = seg.vstart;
-
- assert(dom->mmio_start > 0 && dom->mmio_start < UINT32_MAX);
- if ( module->guest_addr_out > dom->mmio_start ||
- module->guest_addr_out + module->length > dom->mmio_start )
- {
- DOMPRINTF("%s: Module %s would be loaded abrove 4GB",
- __FUNCTION__, name);
- goto err;
- }
- }
-
- return 0;
-err:
- return -1;
-}
-
-static int modules_init(struct xc_dom_image *dom)
-{
- int rc;
-
- rc = module_init_one(dom, &dom->system_firmware_module,
- "System Firmware module");
- if ( rc ) goto err;
- /* Only one module can be added */
- rc = module_init_one(dom, &dom->acpi_modules[0], "ACPI module");
- if ( rc ) goto err;
- rc = module_init_one(dom, &dom->smbios_module, "SMBIOS module");
- if ( rc ) goto err;
-
- return 0;
-err:
- return -1;
-}
-
-static elf_errorstatus xc_dom_load_hvm_kernel(struct xc_dom_image *dom)
-{
- struct elf_binary *elf = dom->private_loader;
- privcmd_mmap_entry_t *entries = NULL;
- size_t pages = (elf->pend - elf->pstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
- elf_errorstatus rc;
- int i;
-
- /* Map address space for initial elf image. */
- entries = calloc(pages, sizeof(privcmd_mmap_entry_t));
- if ( entries == NULL )
- return -ENOMEM;
-
- for ( i = 0; i < pages; i++ )
- entries[i].mfn = (elf->pstart >> PAGE_SHIFT) + i;
-
- elf->dest_base = xc_map_foreign_ranges(
- dom->xch, dom->guest_domid, pages << PAGE_SHIFT,
- PROT_READ | PROT_WRITE, 1 << PAGE_SHIFT,
- entries, pages);
- if ( elf->dest_base == NULL )
- {
- DOMPRINTF("%s: unable to map guest memory space", __func__);
- rc = -EFAULT;
- goto error;
- }
-
- elf->dest_size = pages * XC_DOM_PAGE_SIZE(dom);
-
- rc = elf_load_binary(elf);
- if ( rc < 0 )
- {
- DOMPRINTF("%s: failed to load elf binary", __func__);
- goto error;
- }
-
- munmap(elf->dest_base, elf->dest_size);
-
- rc = modules_init(dom);
- if ( rc != 0 )
- {
- DOMPRINTF("%s: unable to load modules.", __func__);
- goto error;
- }
-
- dom->parms.phys_entry = elf_uval(elf, elf->ehdr, e_entry);
-
- free(entries);
- return 0;
-
- error:
- assert(rc != 0);
- free(entries);
- return rc;
-}
-
-/* ------------------------------------------------------------------------ */
-
-struct xc_dom_loader hvm_loader = {
- .name = "HVM-generic",
- .probe = xc_dom_probe_hvm_kernel,
- .parser = xc_dom_parse_hvm_kernel,
- .loader = xc_dom_load_hvm_kernel,
-};
-
-static void __init register_loader(void)
-{
- xc_dom_register_loader(&hvm_loader);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * Xen domain builder -- i386 and x86_64 bits.
- *
- * Most architecture-specific code for x86 goes here.
- * - prepare page tables.
- * - fill architecture-specific structs.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <assert.h>
-
-#include <xen/xen.h>
-#include <xen/foreign/x86_32.h>
-#include <xen/foreign/x86_64.h>
-#include <xen/hvm/hvm_info_table.h>
-#include <xen/arch-x86/hvm/start_info.h>
-#include <xen/io/protocols.h>
-
-#include <xen-tools/libs.h>
-
-#include "xg_private.h"
-#include "xenctrl_dom.h"
-#include "xenctrl.h"
-
-/* ------------------------------------------------------------------------ */
-
-#define SUPERPAGE_BATCH_SIZE 512
-
-#define SUPERPAGE_2MB_SHIFT 9
-#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
-#define SUPERPAGE_1GB_SHIFT 18
-#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
-
-#define X86_CR0_PE 0x01
-#define X86_CR0_ET 0x10
-
-#define X86_DR6_DEFAULT 0xffff0ff0u
-#define X86_DR7_DEFAULT 0x00000400u
-
-#define MTRR_TYPE_WRBACK 6
-#define MTRR_DEF_TYPE_ENABLE (1u << 11)
-
-#define SPECIALPAGE_PAGING 0
-#define SPECIALPAGE_ACCESS 1
-#define SPECIALPAGE_SHARING 2
-#define SPECIALPAGE_BUFIOREQ 3
-#define SPECIALPAGE_XENSTORE 4
-#define SPECIALPAGE_IOREQ 5
-#define SPECIALPAGE_IDENT_PT 6
-#define SPECIALPAGE_CONSOLE 7
-#define special_pfn(x) \
- (X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x))
-
-#define NR_IOREQ_SERVER_PAGES 8
-#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x))
-
-#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1)
-#define round_down(addr, mask) ((addr) & ~(mask))
-#define round_up(addr, mask) ((addr) | (mask))
-#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1))
-
-#define HVMLOADER_MODULE_MAX_COUNT 2
-#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE
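
special_pfn() counts down from the end of the reserved special region, ioreq_server_pfn() sits immediately below it, and the mask helpers implement power-of-two rounding. A worked sketch, assuming the values X86_HVM_END_SPECIAL_REGION = 0xff000 and X86_HVM_NR_SPECIAL_PAGES = 8 from the public headers at the time of writing:

    #include <assert.h>

    #define END_SPECIAL  0xff000UL  /* assumed X86_HVM_END_SPECIAL_REGION */
    #define NR_SPECIAL   8UL        /* assumed X86_HVM_NR_SPECIAL_PAGES */
    #define NR_IOREQ_SRV 8UL

    #define special(x)   (END_SPECIAL - NR_SPECIAL + (x))
    #define ioreq_srv(x) (special(0) - NR_IOREQ_SRV + (x))
    #define mask(bits)   ((1UL << (bits)) - 1)

    int main(void)
    {
        assert(special(0) == 0xfeff8UL);   /* SPECIALPAGE_PAGING */
        assert(special(7) == 0xfefffUL);   /* SPECIALPAGE_CONSOLE, the last */
        assert(ioreq_srv(0) == 0xfeff0UL); /* just below the special pages */

        /* round_up()/round_down() with a 22-bit mask give 4MB alignment. */
        assert((0x00500000UL | mask(22)) == 0x007fffffUL);
        assert((0x00500000UL & ~mask(22)) == 0x00400000UL);
        return 0;
    }
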
-
-struct xc_dom_params {
- unsigned levels;
- xen_vaddr_t vaddr_mask;
- x86_pgentry_t lvl_prot[4];
-};
-
-struct xc_dom_x86_mapping_lvl {
- xen_vaddr_t from;
- xen_vaddr_t to;
- xen_pfn_t pfn;
- unsigned int pgtables;
-};
-
-struct xc_dom_x86_mapping {
- struct xc_dom_x86_mapping_lvl area;
- struct xc_dom_x86_mapping_lvl lvls[4];
-};
-
-struct xc_dom_image_x86 {
- unsigned n_mappings;
-#define MAPPING_MAX 2
- struct xc_dom_x86_mapping maps[MAPPING_MAX];
- const struct xc_dom_params *params;
-
- /* PV: Pointer to the in-guest P2M. */
- void *p2m_guest;
-};
-
-/* get guest IO ABI protocol */
-const char *xc_domain_get_native_protocol(xc_interface *xch,
- uint32_t domid)
-{
- int ret;
- uint32_t guest_width;
- const char *protocol;
-
- ret = xc_domain_get_guest_width(xch, domid, &guest_width);
-
- if ( ret )
- return NULL;
-
- switch (guest_width) {
- case 4: /* 32 bit guest */
- protocol = XEN_IO_PROTO_ABI_X86_32;
- break;
- case 8: /* 64 bit guest */
- protocol = XEN_IO_PROTO_ABI_X86_64;
- break;
- default:
- protocol = NULL;
- }
-
- return protocol;
-}
-
-static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from,
- xen_vaddr_t to, xen_pfn_t pfn)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map, *map_cmp;
- xen_pfn_t pfn_end;
- xen_vaddr_t mask;
- unsigned bits;
- int l, m;
-
- if ( domx86->n_mappings == MAPPING_MAX )
- {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "%s: too many mappings\n", __FUNCTION__);
- return -ENOMEM;
- }
- map = domx86->maps + domx86->n_mappings;
-
- pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86);
- if ( pfn_end >= dom->p2m_size )
- {
- xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
- "%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")",
- __FUNCTION__, pfn_end, dom->p2m_size);
- return -ENOMEM;
- }
- for ( m = 0; m < domx86->n_mappings; m++ )
- {
- map_cmp = domx86->maps + m;
- if ( from < map_cmp->area.to && to > map_cmp->area.from )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: overlapping mappings\n", __FUNCTION__);
- return -EINVAL;
- }
- }
-
- memset(map, 0, sizeof(*map));
- map->area.from = from & domx86->params->vaddr_mask;
- map->area.to = to & domx86->params->vaddr_mask;
-
- for ( l = domx86->params->levels - 1; l >= 0; l-- )
- {
- map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables;
- if ( l == domx86->params->levels - 1 )
- {
- /* Top level page table in first mapping only. */
- if ( domx86->n_mappings == 0 )
- {
- map->lvls[l].from = 0;
- map->lvls[l].to = domx86->params->vaddr_mask;
- map->lvls[l].pgtables = 1;
- map->area.pgtables++;
- }
- continue;
- }
-
- bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86;
- mask = bits_to_mask(bits);
- map->lvls[l].from = map->area.from & ~mask;
- map->lvls[l].to = map->area.to | mask;
-
- if ( domx86->params->levels == PGTBL_LEVELS_I386 &&
- domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 )
- {
- DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__);
- map->lvls[l].to = domx86->params->vaddr_mask;
- }
-
- for ( m = 0; m < domx86->n_mappings; m++ )
- {
- map_cmp = domx86->maps + m;
- if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to )
- continue;
- if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
- map->lvls[l].to <= map_cmp->lvls[l].to )
- {
- map->lvls[l].from = 0;
- map->lvls[l].to = 0;
- break;
- }
- assert(map->lvls[l].from >= map_cmp->lvls[l].from ||
- map->lvls[l].to <= map_cmp->lvls[l].to);
- if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
- map->lvls[l].from <= map_cmp->lvls[l].to )
- map->lvls[l].from = map_cmp->lvls[l].to + 1;
- if ( map->lvls[l].to >= map_cmp->lvls[l].from &&
- map->lvls[l].to <= map_cmp->lvls[l].to )
- map->lvls[l].to = map_cmp->lvls[l].from - 1;
- }
- if ( map->lvls[l].from < map->lvls[l].to )
- map->lvls[l].pgtables =
- ((map->lvls[l].to - map->lvls[l].from) >> bits) + 1;
- DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64
- ", %d table(s)", __FUNCTION__, mask, bits,
- map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables);
- map->area.pgtables += map->lvls[l].pgtables;
- }
-
- return 0;
-}
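
At each level count_pgtables() widens the range to table-granular boundaries and then needs ((to - from) >> bits) + 1 tables, where bits is the span of virtual address space one table at that level covers. A standalone sketch of the per-level count for a single mapping (assuming the 4-level x86_64 layout with 9 bits per level and no sharing with earlier mappings):

    #include <stdint.h>
    #include <stdio.h>

    #define PG_SHIFT  12
    #define LVL_SHIFT 9   /* 512 entries per table */

    /* Tables needed at level l (0 = leaf page tables) to map [from, to]. */
    static unsigned int tables_at_level(uint64_t from, uint64_t to, int l)
    {
        unsigned int bits = PG_SHIFT + (l + 1) * LVL_SHIFT;
        uint64_t mask = ((uint64_t)1 << bits) - 1;

        from &= ~mask;   /* widen to table-granular boundaries */
        to |= mask;
        return ((to - from) >> bits) + 1;
    }

    int main(void)
    {
        uint64_t from = 0, to = (128ULL << 20) - 1; /* first 128MB */
        int l;

        for ( l = 0; l < 3; l++ )
            printf("level %d: %u table(s)\n", l, tables_at_level(from, to, l));
        /* Prints 64 / 1 / 1: 64 leaf tables covering 2MB each, one L2,
         * one L3, plus the single top-level table added separately. */
        return 0;
    }
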
-
-static int alloc_pgtables_pv(struct xc_dom_image *dom)
-{
- int pages, extra_pages;
- xen_vaddr_t try_virt_end;
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
-
- extra_pages = dom->alloc_bootstack ? 1 : 0;
- extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */
- pages = extra_pages;
- for ( ; ; )
- {
- try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86,
- bits_to_mask(22)); /* 4MB alignment */
-
- if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) )
- return -1;
-
- pages = map->area.pgtables + extra_pages;
- if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 )
- break;
- }
- map->area.pfn = 0;
- domx86->n_mappings++;
- dom->virt_pgtab_end = try_virt_end + 1;
-
- return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0,
- map->area.pgtables * PAGE_SIZE_X86);
-}
-
-/* ------------------------------------------------------------------------ */
-/* i386 pagetables */
-
-static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom)
-{
- static const struct xc_dom_params x86_32_params = {
- .levels = PGTBL_LEVELS_I386,
- .vaddr_mask = bits_to_mask(VIRT_BITS_I386),
- .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
- /*
- * 64bit Xen runs 32bit PV guests with the PAE entries in an L3
- * pagetable. They don't behave exactly like native PAE paging.
- */
- .lvl_prot[1 ... 2] =
- _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
- };
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
-
- domx86->params = &x86_32_params;
-
- return alloc_pgtables_pv(dom);
-}
-
-#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
-#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86))
-
-/*
- * Move the l3 page table page below 4G for guests which do not
- * support the extended-cr3 format. The l3 is currently empty so we
- * do not need to preserve the current contents.
- */
-static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom,
- xen_pfn_t l3pfn,
- xen_pfn_t l3mfn)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- uint32_t *p2m_guest = domx86->p2m_guest;
- xen_pfn_t new_l3mfn;
- struct xc_mmu *mmu;
- void *l3tab;
-
- mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid);
- if ( mmu == NULL )
- {
- DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__);
- return l3mfn;
- }
-
- xc_dom_unmap_one(dom, l3pfn);
-
- new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn);
- if ( !new_l3mfn )
- goto out;
-
- p2m_guest[l3pfn] = dom->pv_p2m[l3pfn] = new_l3mfn;
-
- if ( xc_add_mmu_update(dom->xch, mmu,
- (((unsigned long long)new_l3mfn)
- << XC_DOM_PAGE_SHIFT(dom)) |
- MMU_MACHPHYS_UPDATE, l3pfn) )
- goto out;
-
- if ( xc_flush_mmu_updates(dom->xch, mmu) )
- goto out;
-
- /*
- * This ensures that the entire pgtables_seg is mapped by a single
- * mmap region. arch_setup_bootlate() relies on this to be able to
- * unmap and pin the pagetables.
- */
- if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL )
- goto out;
-
- l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1);
- if ( l3tab == NULL )
- {
- DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL",
- __FUNCTION__);
- goto out; /* our one call site will call xc_dom_panic and fail */
- }
- memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom));
-
- DOMPRINTF("%s: successfully relocated L3 below 4G. "
- "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")",
- __FUNCTION__, l3pfn, l3mfn, new_l3mfn);
-
- l3mfn = new_l3mfn;
-
- out:
- free(mmu);
-
- return l3mfn;
-}
-
-static x86_pgentry_t *get_pg_table(struct xc_dom_image *dom, int m, int l)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map;
- x86_pgentry_t *pg;
-
- map = domx86->maps + m;
- pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0);
- if ( pg )
- return pg;
-
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: xc_dom_pfn_to_ptr failed", __FUNCTION__);
- return NULL;
-}
-
-static x86_pgentry_t get_pg_prot(struct xc_dom_image *dom, int l, xen_pfn_t pfn)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map;
- xen_pfn_t pfn_s, pfn_e;
- x86_pgentry_t prot;
- unsigned m;
-
- prot = domx86->params->lvl_prot[l];
- if ( l > 0 )
- return prot;
-
- for ( m = 0; m < domx86->n_mappings; m++ )
- {
- map = domx86->maps + m;
- pfn_s = map->lvls[domx86->params->levels - 1].pfn;
- pfn_e = map->area.pgtables + pfn_s;
- if ( pfn >= pfn_s && pfn < pfn_e )
- return prot & ~_PAGE_RW;
- }
-
- return prot;
-}
-
-static int setup_pgtables_pv(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map1, *map2;
- struct xc_dom_x86_mapping_lvl *lvl;
- xen_vaddr_t from, to;
- xen_pfn_t pfn, p, p_s, p_e;
- x86_pgentry_t *pg;
- unsigned m1, m2;
- int l;
-
- for ( l = domx86->params->levels - 1; l >= 0; l-- )
- for ( m1 = 0; m1 < domx86->n_mappings; m1++ )
- {
- map1 = domx86->maps + m1;
- from = map1->lvls[l].from;
- to = map1->lvls[l].to;
- pg = get_pg_table(dom, m1, l);
- if ( !pg )
- return -1;
- for ( m2 = 0; m2 < domx86->n_mappings; m2++ )
- {
- map2 = domx86->maps + m2;
- lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area;
- if ( l > 0 && lvl->pgtables == 0 )
- continue;
- if ( lvl->from >= to || lvl->to <= from )
- continue;
- p_s = (max(from, lvl->from) - from) >>
- (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
- p_e = (min(to, lvl->to) - from) >>
- (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
- pfn = ((max(from, lvl->from) - lvl->from) >>
- (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) + lvl->pfn;
- for ( p = p_s; p <= p_e; p++ )
- {
- pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) |
- get_pg_prot(dom, l, pfn);
- pfn++;
- }
- }
- }
-
- return 0;
-}
-
-static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- uint32_t *p2m_guest = domx86->p2m_guest;
- xen_pfn_t l3mfn, l3pfn, i;
-
- /* Copy dom->pv_p2m[] into the guest. */
- for ( i = 0; i < dom->p2m_size; ++i )
- {
- if ( dom->pv_p2m[i] != INVALID_PFN )
- p2m_guest[i] = dom->pv_p2m[i];
- else
- p2m_guest[i] = -1;
- }
-
- l3pfn = domx86->maps[0].lvls[2].pfn;
- l3mfn = xc_dom_p2m(dom, l3pfn);
- if ( dom->parms.pae == XEN_PAE_YES )
- {
- if ( l3mfn >= 0x100000 )
- l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn);
-
- if ( l3mfn >= 0x100000 )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,"%s: cannot move L3"
- " below 4G. extended-cr3 not supported by guest. "
- "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn")",
- __FUNCTION__, l3pfn, l3mfn);
- return -EINVAL;
- }
- }
-
- return setup_pgtables_pv(dom);
-}
-
-/* ------------------------------------------------------------------------ */
-/* x86_64 pagetables */
-
-static int alloc_pgtables_x86_64(struct xc_dom_image *dom)
-{
- static const struct xc_dom_params x86_64_params = {
- .levels = PGTBL_LEVELS_X86_64,
- .vaddr_mask = bits_to_mask(VIRT_BITS_X86_64),
- .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
- .lvl_prot[1 ... 3] =
- _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
- };
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
-
- domx86->params = &x86_64_params;
-
- return alloc_pgtables_pv(dom);
-}
-
-static int setup_pgtables_x86_64(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- uint64_t *p2m_guest = domx86->p2m_guest;
- xen_pfn_t i;
-
- /* Copy dom->pv_p2m[] into the guest. */
- for ( i = 0; i < dom->p2m_size; ++i )
- {
- if ( dom->pv_p2m[i] != INVALID_PFN )
- p2m_guest[i] = dom->pv_p2m[i];
- else
- p2m_guest[i] = -1;
- }
-
- return setup_pgtables_pv(dom);
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
-
- if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach",
- 0, p2m_alloc_size) )
- return -1;
-
- domx86->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
- if ( domx86->p2m_guest == NULL )
- return -1;
-
- return 0;
-}
-
-static int alloc_p2m_list_x86_32(struct xc_dom_image *dom)
-{
- size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
-
- p2m_alloc_size = round_pg_up(p2m_alloc_size);
- return alloc_p2m_list(dom, p2m_alloc_size);
-}
-
-static int alloc_p2m_list_x86_64(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
- size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
- xen_vaddr_t from, to;
- unsigned lvl;
-
- p2m_alloc_size = round_pg_up(p2m_alloc_size);
- if ( dom->parms.p2m_base != UNSET_ADDR )
- {
- from = dom->parms.p2m_base;
- to = from + p2m_alloc_size - 1;
- if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) )
- return -1;
-
- map->area.pfn = dom->pfn_alloc_end;
- for ( lvl = 0; lvl < 4; lvl++ )
- map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86;
- domx86->n_mappings++;
- p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86;
- }
-
- return alloc_p2m_list(dom, p2m_alloc_size);
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int alloc_magic_pages_pv(struct xc_dom_image *dom)
-{
- dom->start_info_pfn = xc_dom_alloc_page(dom, "start info");
- if ( dom->start_info_pfn == INVALID_PFN )
- return -1;
-
- dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore");
- if ( dom->xenstore_pfn == INVALID_PFN )
- return -1;
- xc_clear_domain_page(dom->xch, dom->guest_domid,
- xc_dom_p2m(dom, dom->xenstore_pfn));
-
- dom->console_pfn = xc_dom_alloc_page(dom, "console");
- if ( dom->console_pfn == INVALID_PFN )
- return -1;
- xc_clear_domain_page(dom->xch, dom->guest_domid,
- xc_dom_p2m(dom, dom->console_pfn));
-
- dom->alloc_bootstack = 1;
-
- return 0;
-}
-
-static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom)
-{
- struct hvm_info_table *hvm_info = (struct hvm_info_table *)
- (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
- uint8_t sum;
- int i;
-
- memset(hvm_info_page, 0, PAGE_SIZE);
-
- /* Fill in the header. */
- memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature));
- hvm_info->length = sizeof(struct hvm_info_table);
-
- /* Sensible defaults: these can be overridden by the caller. */
- hvm_info->apic_mode = 1;
- hvm_info->nr_vcpus = 1;
- memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online));
-
- /* Memory parameters. */
- hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT;
- hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT;
- hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);
-
- /* Finish with the checksum. */
- for ( i = 0, sum = 0; i < hvm_info->length; i++ )
- sum += ((uint8_t *)hvm_info)[i];
- hvm_info->checksum = -sum;
-}
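
The checksum field makes the byte-sum of the whole table zero modulo 256, so a consumer can validate the table by summing all of its bytes and comparing against zero. A minimal sketch of producing and checking that property over an arbitrary buffer (the offset used below is hypothetical):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Set buf[csum_off] so the byte-sum of buf[0..len) is 0 (mod 256). */
    static void set_checksum(uint8_t *buf, size_t len, size_t csum_off)
    {
        uint8_t sum = 0;
        size_t i;

        buf[csum_off] = 0;
        for ( i = 0; i < len; i++ )
            sum += buf[i];
        buf[csum_off] = -sum;
    }

    int main(void)
    {
        uint8_t tbl[32] = "HVM INFO";
        uint8_t sum = 0;
        size_t i;

        set_checksum(tbl, sizeof(tbl), 9); /* hypothetical checksum offset */
        for ( i = 0; i < sizeof(tbl); i++ )
            sum += tbl[i];
        assert(sum == 0);
        return 0;
    }
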
-
-static int alloc_magic_pages_hvm(struct xc_dom_image *dom)
-{
- unsigned long i;
- uint32_t *ident_pt, domid = dom->guest_domid;
- int rc;
- xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES];
- xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
- xc_interface *xch = dom->xch;
- size_t start_info_size = sizeof(struct hvm_start_info);
-
- /* Allocate and clear special pages. */
- for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ )
- special_array[i] = special_pfn(i);
-
- rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES,
- 0, 0, special_array);
- if ( rc != 0 )
- {
- DOMPRINTF("Could not allocate special pages.");
- goto error_out;
- }
-
- if ( xc_clear_domain_pages(xch, domid, special_pfn(0),
- X86_HVM_NR_SPECIAL_PAGES) )
- goto error_out;
-
- xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN,
- special_pfn(SPECIALPAGE_XENSTORE));
- xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN,
- special_pfn(SPECIALPAGE_BUFIOREQ));
- xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN,
- special_pfn(SPECIALPAGE_IOREQ));
- xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN,
- special_pfn(SPECIALPAGE_CONSOLE));
- xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN,
- special_pfn(SPECIALPAGE_PAGING));
- xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN,
- special_pfn(SPECIALPAGE_ACCESS));
- xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN,
- special_pfn(SPECIALPAGE_SHARING));
-
- start_info_size +=
- sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
-
- start_info_size +=
- HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT;
-
- start_info_size +=
- dom->e820_entries * sizeof(struct hvm_memmap_table_entry);
-
- if ( !dom->device_model )
- {
- if ( dom->cmdline )
- {
- dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8);
- start_info_size += dom->cmdline_size;
- }
- }
- else
- {
- /*
- * Allocate and clear additional ioreq server pages. The default
- * server will use the IOREQ and BUFIOREQ special pages above.
- */
- for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ )
- ioreq_server_array[i] = ioreq_server_pfn(i);
-
- rc = xc_domain_populate_physmap_exact(xch, domid, NR_IOREQ_SERVER_PAGES, 0,
- 0, ioreq_server_array);
- if ( rc != 0 )
- {
- DOMPRINTF("Could not allocate ioreq server pages.");
- goto error_out;
- }
-
- if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0),
- NR_IOREQ_SERVER_PAGES) )
- goto error_out;
-
- /* Tell the domain where the pages are and how many there are */
- xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN,
- ioreq_server_pfn(0));
- xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
- NR_IOREQ_SERVER_PAGES);
- }
-
- rc = xc_dom_alloc_segment(dom, &dom->start_info_seg,
- "HVM start info", 0, start_info_size);
- if ( rc != 0 )
- {
- DOMPRINTF("Unable to reserve memory for the start info");
- goto out;
- }
-
- /*
- * Identity-map page table is required for running with CR0.PG=0 when
- * using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
- */
- if ( (ident_pt = xc_map_foreign_range(
- xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
- special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
- goto error_out;
- for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
- ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- munmap(ident_pt, PAGE_SIZE);
- xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT,
- special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
-
- dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE);
- xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
-
- dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE);
- xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
-
- dom->parms.virt_hypercall = -1;
-
- rc = 0;
- goto out;
- error_out:
- rc = -1;
- out:
-
- return rc;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int start_info_x86_32(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- start_info_x86_32_t *start_info =
- xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
- xen_pfn_t shinfo =
- xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
-
- DOMPRINTF_CALLED(dom->xch);
-
- if ( start_info == NULL )
- {
- DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
- return -1; /* our caller throws away our return value :-/ */
- }
-
- memset(start_info, 0, sizeof(*start_info));
- strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
- start_info->magic[sizeof(start_info->magic) - 1] = '\0';
- start_info->nr_pages = dom->total_pages;
- start_info->shared_info = shinfo << PAGE_SHIFT_X86;
- start_info->pt_base = dom->pgtables_seg.vstart;
- start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
- start_info->mfn_list = dom->p2m_seg.vstart;
-
- start_info->flags = dom->flags;
- start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
- start_info->store_evtchn = dom->xenstore_evtchn;
- start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
- start_info->console.domU.evtchn = dom->console_evtchn;
-
- if ( dom->modules[0].blob )
- {
- start_info->mod_start = dom->initrd_start;
- start_info->mod_len = dom->initrd_len;
- }
-
- if ( dom->cmdline )
- {
- strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
- start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
- }
-
- return 0;
-}
-
-static int start_info_x86_64(struct xc_dom_image *dom)
-{
- struct xc_dom_image_x86 *domx86 = dom->arch_private;
- start_info_x86_64_t *start_info =
- xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
- xen_pfn_t shinfo =
- xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
-
- DOMPRINTF_CALLED(dom->xch);
-
- if ( start_info == NULL )
- {
- DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
- return -1; /* our caller throws away our return value :-/ */
- }
-
- memset(start_info, 0, sizeof(*start_info));
- strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
- start_info->magic[sizeof(start_info->magic) - 1] = '\0';
- start_info->nr_pages = dom->total_pages;
- start_info->shared_info = shinfo << PAGE_SHIFT_X86;
- start_info->pt_base = dom->pgtables_seg.vstart;
- start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
- start_info->mfn_list = dom->p2m_seg.vstart;
- if ( dom->parms.p2m_base != UNSET_ADDR )
- {
- start_info->first_p2m_pfn = dom->p2m_seg.pfn;
- start_info->nr_p2m_frames = dom->p2m_seg.pages;
- }
-
- start_info->flags = dom->flags;
- start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
- start_info->store_evtchn = dom->xenstore_evtchn;
- start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
- start_info->console.domU.evtchn = dom->console_evtchn;
-
- if ( dom->modules[0].blob )
- {
- start_info->mod_start = dom->initrd_start;
- start_info->mod_len = dom->initrd_len;
- }
-
- if ( dom->cmdline )
- {
- strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
- start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
- }
-
- return 0;
-}
-
-static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr)
-{
- shared_info_x86_32_t *shared_info = ptr;
- int i;
-
- DOMPRINTF_CALLED(dom->xch);
-
- memset(shared_info, 0, sizeof(*shared_info));
- for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
- shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
- return 0;
-}
-
-static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr)
-{
- shared_info_x86_64_t *shared_info = ptr;
- int i;
-
- DOMPRINTF_CALLED(dom->xch);
-
- memset(shared_info, 0, sizeof(*shared_info));
- for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
- shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
- return 0;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int vcpu_x86_32(struct xc_dom_image *dom)
-{
- vcpu_guest_context_any_t any_ctx;
- vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32;
- xen_pfn_t cr3_pfn;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* clear everything */
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->user_regs.eip = dom->parms.virt_entry;
- ctxt->user_regs.esp =
- dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
- ctxt->user_regs.esi =
- dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
- ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */
-
- ctxt->debugreg[6] = X86_DR6_DEFAULT;
- ctxt->debugreg[7] = X86_DR7_DEFAULT;
-
- ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32;
- if ( dom->parms.pae == XEN_PAE_EXTCR3 ||
- dom->parms.pae == XEN_PAE_BIMODAL )
- ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
-
- cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
- ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn);
- DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
- __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
-
- ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32;
- ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32;
- ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32;
- ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32;
- ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32;
- ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32;
-
- ctxt->kernel_ss = ctxt->user_regs.ss;
- ctxt->kernel_sp = ctxt->user_regs.esp;
-
- rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
-
- return rc;
-}
-
-static int vcpu_x86_64(struct xc_dom_image *dom)
-{
- vcpu_guest_context_any_t any_ctx;
- vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64;
- xen_pfn_t cr3_pfn;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- /* clear everything */
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->user_regs.rip = dom->parms.virt_entry;
- ctxt->user_regs.rsp =
- dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
- ctxt->user_regs.rsi =
- dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
- ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */
-
- ctxt->debugreg[6] = X86_DR6_DEFAULT;
- ctxt->debugreg[7] = X86_DR7_DEFAULT;
-
- ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64;
- cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
- ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn);
- DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
- __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
-
- ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64;
- ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64;
- ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64;
- ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64;
- ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64;
- ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64;
-
- ctxt->kernel_ss = ctxt->user_regs.ss;
-    ctxt->kernel_sp = ctxt->user_regs.rsp;
-
- rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
-
- return rc;
-}
-
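-/*
- * Walk an HVM context blob: a sequence of (hvm_save_descriptor, payload)
- * pairs terminated by an END record. Return a pointer to the payload of
- * the first record matching (type, instance), or NULL if none is found.
- */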
-static const void *hvm_get_save_record(const void *ctx, unsigned int type,
- unsigned int instance)
-{
- const struct hvm_save_descriptor *header;
-
- for ( header = ctx;
- header->typecode != HVM_SAVE_CODE(END);
- ctx += sizeof(*header) + header->length, header = ctx )
- if ( header->typecode == type && header->instance == instance )
- return ctx + sizeof(*header);
-
- return NULL;
-}
-
-static int vcpu_hvm(struct xc_dom_image *dom)
-{
- struct {
- struct hvm_save_descriptor header_d;
- HVM_SAVE_TYPE(HEADER) header;
- struct hvm_save_descriptor cpu_d;
- HVM_SAVE_TYPE(CPU) cpu;
- struct hvm_save_descriptor end_d;
- HVM_SAVE_TYPE(END) end;
- } bsp_ctx;
- uint8_t *full_ctx = NULL;
- int rc;
-
- DOMPRINTF_CALLED(dom->xch);
-
- assert(dom->max_vcpus);
-
-    /*
-     * Get the full HVM context in order to have the header. It is not
-     * possible to get the header with getcontext_partial, and crafting one
-     * from userspace is not an option either, since cpuid is trapped and
-     * modified by Xen.
-     */
-
- rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0);
- if ( rc <= 0 )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: unable to fetch HVM context size (rc=%d)",
- __func__, rc);
- goto out;
- }
-
- full_ctx = calloc(1, rc);
- if ( full_ctx == NULL )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: unable to allocate memory for HVM context (rc=%d)",
- __func__, rc);
- rc = -ENOMEM;
- goto out;
- }
-
- rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc);
- if ( rc <= 0 )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: unable to fetch HVM context (rc=%d)",
- __func__, rc);
- goto out;
- }
-
- /* Copy the header to our partial context. */
- memset(&bsp_ctx, 0, sizeof(bsp_ctx));
- memcpy(&bsp_ctx, full_ctx,
- sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER));
-
- /* Set the CPU descriptor. */
- bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU);
- bsp_ctx.cpu_d.instance = 0;
- bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU);
-
- /* Set the cached part of the relevant segment registers. */
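-    /*
-     * 0xc9b/0xc93 encode present, DPL0, accessed 32-bit code/data segments
-     * with flat limits; 0x8b is a present, busy 32-bit TSS.
-     */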
- bsp_ctx.cpu.cs_base = 0;
- bsp_ctx.cpu.ds_base = 0;
- bsp_ctx.cpu.es_base = 0;
- bsp_ctx.cpu.ss_base = 0;
- bsp_ctx.cpu.tr_base = 0;
- bsp_ctx.cpu.cs_limit = ~0u;
- bsp_ctx.cpu.ds_limit = ~0u;
- bsp_ctx.cpu.es_limit = ~0u;
- bsp_ctx.cpu.ss_limit = ~0u;
- bsp_ctx.cpu.tr_limit = 0x67;
- bsp_ctx.cpu.cs_arbytes = 0xc9b;
- bsp_ctx.cpu.ds_arbytes = 0xc93;
- bsp_ctx.cpu.es_arbytes = 0xc93;
- bsp_ctx.cpu.ss_arbytes = 0xc93;
- bsp_ctx.cpu.tr_arbytes = 0x8b;
-
- /* Set the control registers. */
- bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET;
-
- /* Set the IP. */
- bsp_ctx.cpu.rip = dom->parms.phys_entry;
-
- bsp_ctx.cpu.dr6 = X86_DR6_DEFAULT;
- bsp_ctx.cpu.dr7 = X86_DR7_DEFAULT;
-
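-    /*
-     * Per the HVM direct boot (PVH) ABI, %ebx carries the physical
-     * address of the hvm_start_info block at entry.
-     */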
- if ( dom->start_info_seg.pfn )
- bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT;
-
- /* Set the end descriptor. */
- bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END);
- bsp_ctx.end_d.instance = 0;
- bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END);
-
- /* TODO: maybe this should be a firmware option instead? */
- if ( !dom->device_model )
- {
- struct {
- struct hvm_save_descriptor header_d;
- HVM_SAVE_TYPE(HEADER) header;
- struct hvm_save_descriptor mtrr_d;
- HVM_SAVE_TYPE(MTRR) mtrr;
- struct hvm_save_descriptor end_d;
- HVM_SAVE_TYPE(END) end;
- } mtrr = {
- .header_d = bsp_ctx.header_d,
- .header = bsp_ctx.header,
- .mtrr_d.typecode = HVM_SAVE_CODE(MTRR),
- .mtrr_d.length = HVM_SAVE_LENGTH(MTRR),
- .end_d = bsp_ctx.end_d,
- .end = bsp_ctx.end,
- };
- const HVM_SAVE_TYPE(MTRR) *mtrr_record =
- hvm_get_save_record(full_ctx, HVM_SAVE_CODE(MTRR), 0);
- unsigned int i;
-
- if ( !mtrr_record )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: unable to get MTRR save record", __func__);
- goto out;
- }
-
- memcpy(&mtrr.mtrr, mtrr_record, sizeof(mtrr.mtrr));
-
- /*
- * Enable MTRR, set default type to WB.
- * TODO: add MMIO areas as UC when passthrough is supported.
- */
- mtrr.mtrr.msr_mtrr_def_type = MTRR_TYPE_WRBACK | MTRR_DEF_TYPE_ENABLE;
-
- for ( i = 0; i < dom->max_vcpus; i++ )
- {
- mtrr.mtrr_d.instance = i;
- rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
- (uint8_t *)&mtrr, sizeof(mtrr));
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);
- }
- }
-
- /*
- * Loading the BSP context should be done in the last call to setcontext,
- * since each setcontext call will put all vCPUs down.
- */
- rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
- (uint8_t *)&bsp_ctx, sizeof(bsp_ctx));
- if ( rc != 0 )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);
-
- out:
- free(full_ctx);
- return rc;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type)
-{
- static const struct {
- char *guest;
- uint32_t size;
- } types[] = {
- { "xen-3.0-x86_32p", 32 },
- { "xen-3.0-x86_64", 64 },
- };
- DECLARE_DOMCTL;
-    int i, rc;
-
- memset(&domctl, 0, sizeof(domctl));
- domctl.domain = domid;
- domctl.cmd = XEN_DOMCTL_set_address_size;
- for ( i = 0; i < ARRAY_SIZE(types); i++ )
- if ( !strcmp(types[i].guest, guest_type) )
- domctl.u.address_size.size = types[i].size;
- if ( domctl.u.address_size.size == 0 )
- /* nothing to do */
- return 0;
-
- xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
- guest_type, domctl.u.address_size.size);
- rc = do_domctl(xch, &domctl);
- if ( rc != 0 )
- xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
- __FUNCTION__, rc);
- return rc;
-}
-
-static int meminit_pv(struct xc_dom_image *dom)
-{
- int rc;
- xen_pfn_t pfn, allocsz, mfn, total, pfn_base;
- int i, j, k;
- xen_vmemrange_t dummy_vmemrange[1];
- unsigned int dummy_vnode_to_pnode[1];
- xen_vmemrange_t *vmemranges;
- unsigned int *vnode_to_pnode;
- unsigned int nr_vmemranges, nr_vnodes;
-
- rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
- if ( rc )
- return rc;
-
-    /* Try to claim pages, for early warning if insufficient memory is available. */
- if ( dom->claim_enabled )
- {
- rc = xc_domain_claim_pages(dom->xch, dom->guest_domid,
- dom->total_pages);
- if ( rc )
- return rc;
- }
-
-    /* Set up dummy vNUMA information if it's not provided. Note
- * that this is a valid state if libxl doesn't provide any
- * vNUMA information.
- *
- * The dummy values make libxc allocate all pages from
- * arbitrary physical nodes. This is the expected behaviour if
- * no vNUMA configuration is provided to libxc.
- *
- * Note that the following hunk is just for the convenience of
- * allocation code. No defaulting happens in libxc.
- */
- if ( dom->nr_vmemranges == 0 )
- {
- nr_vmemranges = 1;
- vmemranges = dummy_vmemrange;
- vmemranges[0].start = 0;
- vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT;
- vmemranges[0].flags = 0;
- vmemranges[0].nid = 0;
-
- nr_vnodes = 1;
- vnode_to_pnode = dummy_vnode_to_pnode;
- vnode_to_pnode[0] = XC_NUMA_NO_NODE;
- }
- else
- {
- nr_vmemranges = dom->nr_vmemranges;
- nr_vnodes = dom->nr_vnodes;
- vmemranges = dom->vmemranges;
- vnode_to_pnode = dom->vnode_to_pnode;
- }
-
- total = dom->p2m_size = 0;
- for ( i = 0; i < nr_vmemranges; i++ )
- {
- total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT);
- dom->p2m_size = max(dom->p2m_size,
- (xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT));
- }
- if ( total != dom->total_pages )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")",
- __func__, total, dom->total_pages);
- return -EINVAL;
- }
-
- dom->pv_p2m = xc_dom_malloc(dom, sizeof(*dom->pv_p2m) * dom->p2m_size);
- if ( dom->pv_p2m == NULL )
- return -EINVAL;
- for ( pfn = 0; pfn < dom->p2m_size; pfn++ )
- dom->pv_p2m[pfn] = INVALID_PFN;
-
- /* allocate guest memory */
- for ( i = 0; i < nr_vmemranges; i++ )
- {
- unsigned int memflags;
- uint64_t pages, super_pages;
- unsigned int pnode = vnode_to_pnode[vmemranges[i].nid];
- xen_pfn_t extents[SUPERPAGE_BATCH_SIZE];
- xen_pfn_t pfn_base_idx;
-
- memflags = 0;
- if ( pnode != XC_NUMA_NO_NODE )
- memflags |= XENMEMF_exact_node(pnode);
-
- pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT;
- super_pages = pages >> SUPERPAGE_2MB_SHIFT;
- pfn_base = vmemranges[i].start >> PAGE_SHIFT;
-
- for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ )
- dom->pv_p2m[pfn] = pfn;
-
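-        /*
-         * Populate as much of the range as possible with 2MB superpages,
-         * in batches of at most SUPERPAGE_BATCH_SIZE extents; whatever
-         * remains is filled with 4kB pages below.
-         */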
- pfn_base_idx = pfn_base;
-        while ( super_pages )
-        {
- uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE);
- super_pages -= count;
-
- for ( pfn = pfn_base_idx, j = 0;
- pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT);
- pfn += SUPERPAGE_2MB_NR_PFNS, j++ )
- extents[j] = dom->pv_p2m[pfn];
- rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
- SUPERPAGE_2MB_SHIFT, memflags,
- extents);
- if ( rc < 0 )
- return rc;
-
- /* Expand the returned mfns into the p2m array. */
- pfn = pfn_base_idx;
- for ( j = 0; j < rc; j++ )
- {
- mfn = extents[j];
- for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ )
- dom->pv_p2m[pfn] = mfn + k;
- }
- pfn_base_idx = pfn;
- }
-
- for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz )
- {
- allocsz = min_t(uint64_t, 1024 * 1024, pages - j);
- rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
- allocsz, 0, memflags, &dom->pv_p2m[pfn_base + j]);
-
- if ( rc )
- {
- if ( pnode != XC_NUMA_NO_NODE )
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)",
- __func__, pages, i, pnode);
- else
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: failed to allocate 0x%"PRIx64" pages",
- __func__, pages);
- return rc;
- }
- }
- rc = 0;
- }
-
-    /* Ensure no unclaimed pages are left unused.
-     * OK to call even if the earlier claim call wasn't made. */
- xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */);
-
- return rc;
-}
-
-/*
- * Check whether an MMIO hole exists in the specified memory range.
- * Returns 1 if one exists, else 0.
- */
-static int check_mmio_hole(uint64_t start, uint64_t memsize,
- uint64_t mmio_start, uint64_t mmio_size)
-{
- if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size )
- return 0;
- else
- return 1;
-}
-
-static int meminit_hvm(struct xc_dom_image *dom)
-{
- unsigned long i, vmemid, nr_pages = dom->total_pages;
- unsigned long p2m_size;
- unsigned long target_pages = dom->target_pages;
- unsigned long cur_pages, cur_pfn;
- int rc;
- unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
- stat_1gb_pages = 0;
- unsigned int memflags = 0;
- int claim_enabled = dom->claim_enabled;
- uint64_t total_pages;
- xen_vmemrange_t dummy_vmemrange[2];
- unsigned int dummy_vnode_to_pnode[1];
- xen_vmemrange_t *vmemranges;
- unsigned int *vnode_to_pnode;
- unsigned int nr_vmemranges, nr_vnodes;
- xc_interface *xch = dom->xch;
- uint32_t domid = dom->guest_domid;
-
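-    /*
-     * More pages than the boot target means the guest starts ballooned
-     * down, which requires populate-on-demand backing.
-     */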
- if ( nr_pages > target_pages )
- memflags |= XENMEMF_populate_on_demand;
-
- if ( dom->nr_vmemranges == 0 )
- {
- /* Build dummy vnode information
- *
- * Guest physical address space layout:
- * [0, hole_start) [hole_start, 4G) [4G, highmem_end)
- *
- * Of course if there is no high memory, the second vmemrange
- * has no effect on the actual result.
- */
-
- dummy_vmemrange[0].start = 0;
- dummy_vmemrange[0].end = dom->lowmem_end;
- dummy_vmemrange[0].flags = 0;
- dummy_vmemrange[0].nid = 0;
- nr_vmemranges = 1;
-
- if ( dom->highmem_end > (1ULL << 32) )
- {
- dummy_vmemrange[1].start = 1ULL << 32;
- dummy_vmemrange[1].end = dom->highmem_end;
- dummy_vmemrange[1].flags = 0;
- dummy_vmemrange[1].nid = 0;
-
- nr_vmemranges++;
- }
-
- dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
- nr_vnodes = 1;
- vmemranges = dummy_vmemrange;
- vnode_to_pnode = dummy_vnode_to_pnode;
- }
- else
- {
- if ( nr_pages > target_pages )
- {
- DOMPRINTF("Cannot enable vNUMA and PoD at the same time");
- goto error_out;
- }
-
- nr_vmemranges = dom->nr_vmemranges;
- nr_vnodes = dom->nr_vnodes;
- vmemranges = dom->vmemranges;
- vnode_to_pnode = dom->vnode_to_pnode;
- }
-
- total_pages = 0;
- p2m_size = 0;
- for ( i = 0; i < nr_vmemranges; i++ )
- {
- DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64, vmemranges[i].start, vmemranges[i].end);
-
- total_pages += ((vmemranges[i].end - vmemranges[i].start)
- >> PAGE_SHIFT);
- p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
- p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
- }
-
- if ( total_pages != nr_pages )
- {
- DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)",
- total_pages, nr_pages);
- goto error_out;
- }
-
- dom->p2m_size = p2m_size;
-
- /*
- * Try to claim pages for early warning of insufficient memory available.
-     * This should go before xc_domain_set_pod_target, because that function
- * actually allocates memory for the guest. Claiming after memory has been
- * allocated is pointless.
- */
-    if ( claim_enabled )
-    {
-        rc = xc_domain_claim_pages(xch, domid,
- target_pages - dom->vga_hole_size);
- if ( rc != 0 )
- {
- DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!");
- goto error_out;
- }
- }
-
- if ( memflags & XENMEMF_populate_on_demand )
- {
- /*
- * Subtract VGA_HOLE_SIZE from target_pages for the VGA
- * "hole". Xen will adjust the PoD cache size so that domain
- * tot_pages will be target_pages - VGA_HOLE_SIZE after
- * this call.
- */
- rc = xc_domain_set_pod_target(xch, domid,
- target_pages - dom->vga_hole_size,
- NULL, NULL, NULL);
- if ( rc != 0 )
- {
- DOMPRINTF("Could not set PoD target for HVM guest.\n");
- goto error_out;
- }
- }
-
- /*
- * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
- *
- * We attempt to allocate 1GB pages if possible. It falls back on 2MB
- * pages if 1GB allocation fails. 4KB pages will be used eventually if
- * both fail.
- */
- if ( dom->device_model )
- {
- xen_pfn_t extents[0xa0];
-
- for ( i = 0; i < ARRAY_SIZE(extents); ++i )
- extents[i] = i;
-
- rc = xc_domain_populate_physmap_exact(
- xch, domid, 0xa0, 0, memflags, extents);
- if ( rc != 0 )
- {
- DOMPRINTF("Could not populate low memory (< 0xA0).\n");
- goto error_out;
- }
- }
-
- stat_normal_pages = 0;
- for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
- {
- unsigned int new_memflags = memflags;
- uint64_t end_pages;
- unsigned int vnode = vmemranges[vmemid].nid;
- unsigned int pnode = vnode_to_pnode[vnode];
-
- if ( pnode != XC_NUMA_NO_NODE )
- new_memflags |= XENMEMF_exact_node(pnode);
-
- end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
- /*
-         * Consider the VGA hole as belonging to the vmemrange that covers
- * 0xA0000-0xC0000. Note that 0x00000-0xA0000 is populated just
- * before this loop.
- */
- if ( vmemranges[vmemid].start == 0 && dom->device_model )
- {
- cur_pages = 0xc0;
- stat_normal_pages += 0xc0;
- }
- else
- cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;
-
- rc = 0;
- while ( (rc == 0) && (end_pages > cur_pages) )
- {
- /* Clip count to maximum 1GB extent. */
- unsigned long count = end_pages - cur_pages;
- unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
-
- if ( count > max_pages )
- count = max_pages;
-
- cur_pfn = cur_pages;
-
-            /*
-             * Take care of the corner cases of superpage tails:
-             * "-cur_pfn & (SUPERPAGE_1GB_NR_PFNS - 1)" is the number of
-             * pages up to the next 1GB boundary, so an unaligned head is
-             * clipped to reach it and an unaligned tail is rounded down.
-             */
- if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
- (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
- count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
- else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
- (count > SUPERPAGE_1GB_NR_PFNS) )
- count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
-
-            /* Attempt to allocate a 1GB superpage. Because each pass
-             * allocates at most 1GB, we don't have to clip superpage
-             * boundaries.
-             */
- if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
-                 /* Check whether an MMIO hole lies in this 1GB range. */
- !check_mmio_hole(cur_pfn << PAGE_SHIFT,
- SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
- dom->mmio_start, dom->mmio_size) )
- {
- long done;
- unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
- xen_pfn_t sp_extents[nr_extents];
-
- for ( i = 0; i < nr_extents; i++ )
- sp_extents[i] = cur_pages + (i << SUPERPAGE_1GB_SHIFT);
-
- done = xc_domain_populate_physmap(xch, domid, nr_extents,
- SUPERPAGE_1GB_SHIFT,
- new_memflags, sp_extents);
-
- if ( done > 0 )
- {
- stat_1gb_pages += done;
- done <<= SUPERPAGE_1GB_SHIFT;
- cur_pages += done;
- count -= done;
- }
- }
-
- if ( count != 0 )
- {
- /* Clip count to maximum 8MB extent. */
- max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
- if ( count > max_pages )
- count = max_pages;
-
- /* Clip partial superpage extents to superpage
- * boundaries. */
- if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
- (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
- count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
- else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
- (count > SUPERPAGE_2MB_NR_PFNS) )
- count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
-
- /* Attempt to allocate superpage extents. */
- if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
- {
- long done;
- unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
- xen_pfn_t sp_extents[nr_extents];
-
- for ( i = 0; i < nr_extents; i++ )
- sp_extents[i] = cur_pages + (i << SUPERPAGE_2MB_SHIFT);
-
- done = xc_domain_populate_physmap(xch, domid, nr_extents,
- SUPERPAGE_2MB_SHIFT,
- new_memflags, sp_extents);
-
- if ( done > 0 )
- {
- stat_2mb_pages += done;
- done <<= SUPERPAGE_2MB_SHIFT;
- cur_pages += done;
- count -= done;
- }
- }
- }
-
- /* Fall back to 4kB extents. */
- if ( count != 0 )
- {
- xen_pfn_t extents[count];
-
- for ( i = 0; i < count; ++i )
- extents[i] = cur_pages + i;
-
- rc = xc_domain_populate_physmap_exact(
- xch, domid, count, 0, new_memflags, extents);
- cur_pages += count;
- stat_normal_pages += count;
- }
- }
-
- if ( rc != 0 )
- {
- DOMPRINTF("Could not allocate memory for HVM guest.");
- goto error_out;
- }
- }
-
- DPRINTF("PHYSICAL MEMORY ALLOCATION:\n");
- DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages);
- DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages);
- DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages);
-
- rc = 0;
- goto out;
- error_out:
- rc = -1;
- out:
-
- /* ensure no unclaimed pages are left unused */
- xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */);
-
- return rc;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static int bootearly(struct xc_dom_image *dom)
-{
- if ( dom->container_type == XC_DOM_PV_CONTAINER &&
- elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active) )
- {
- DOMPRINTF("PV Autotranslate guests no longer supported");
- errno = EOPNOTSUPP;
- return -1;
- }
-
- return 0;
-}
-
-static int bootlate_pv(struct xc_dom_image *dom)
-{
- static const struct {
- char *guest;
- unsigned long pgd_type;
- } types[] = {
- { "xen-3.0-x86_32", MMUEXT_PIN_L2_TABLE},
- { "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE},
- { "xen-3.0-x86_64", MMUEXT_PIN_L4_TABLE},
- };
- unsigned long pgd_type = 0;
- shared_info_t *shared_info;
- xen_pfn_t shinfo;
- int i, rc;
-
- for ( i = 0; i < ARRAY_SIZE(types); i++ )
- if ( !strcmp(types[i].guest, dom->guest_type) )
- pgd_type = types[i].pgd_type;
-
- /* Drop references to all initial page tables before pinning. */
- xc_dom_unmap_one(dom, dom->pgtables_seg.pfn);
- xc_dom_unmap_one(dom, dom->p2m_seg.pfn);
- rc = pin_table(dom->xch, pgd_type,
- xc_dom_p2m(dom, dom->pgtables_seg.pfn),
- dom->guest_domid);
- if ( rc != 0 )
- {
- xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
- "%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)",
- __FUNCTION__, dom->pgtables_seg.pfn, rc);
- return rc;
- }
- shinfo = dom->shared_info_mfn;
-
- /* setup shared_info page */
- DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "",
- __FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn);
- shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid,
- PAGE_SIZE_X86,
- PROT_READ | PROT_WRITE,
- shinfo);
- if ( shared_info == NULL )
- return -1;
- dom->arch_hooks->shared_info(dom, shared_info);
- munmap(shared_info, PAGE_SIZE_X86);
-
- return 0;
-}
-
-/*
- * The memory layout of the start_info page and the modules, and where the
- * addresses are stored:
- *
- * /----------------------------------\
- * | struct hvm_start_info |
- * +----------------------------------+ <- start_info->modlist_paddr
- * | struct hvm_modlist_entry[0] |
- * +----------------------------------+
- * | struct hvm_modlist_entry[1] |
- * +----------------------------------+ <- modlist[0].cmdline_paddr
- * | cmdline of module 0 |
- * | char[HVMLOADER_MODULE_NAME_SIZE] |
- * +----------------------------------+ <- modlist[1].cmdline_paddr
- * | cmdline of module 1 |
- * +----------------------------------+
- */
-static void add_module_to_list(struct xc_dom_image *dom,
- struct xc_hvm_firmware_module *module,
- const char *cmdline,
- struct hvm_modlist_entry *modlist,
- struct hvm_start_info *start_info)
-{
- uint32_t index = start_info->nr_modules;
- void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT;
- uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
- ((uintptr_t)modlist - (uintptr_t)start_info);
- uint64_t modules_cmdline_paddr = modlist_paddr +
- sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
-
- if ( module->length == 0 )
- return;
-
- assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT);
-
- modlist[index].paddr = module->guest_addr_out;
- modlist[index].size = module->length;
-
- if ( cmdline )
- {
- assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE)
- < HVMLOADER_MODULE_CMDLINE_SIZE);
- strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index,
- cmdline, HVMLOADER_MODULE_CMDLINE_SIZE);
- modlist[index].cmdline_paddr = modules_cmdline_paddr +
- HVMLOADER_MODULE_CMDLINE_SIZE * index;
- }
-
- start_info->nr_modules++;
-}
-
-static int bootlate_hvm(struct xc_dom_image *dom)
-{
- uint32_t domid = dom->guest_domid;
- xc_interface *xch = dom->xch;
- struct hvm_start_info *start_info;
- size_t modsize;
- struct hvm_modlist_entry *modlist;
- struct hvm_memmap_table_entry *memmap;
- unsigned int i;
-
- start_info = xc_map_foreign_range(xch, domid, dom->start_info_seg.pages <<
- XC_DOM_PAGE_SHIFT(dom),
- PROT_READ | PROT_WRITE,
- dom->start_info_seg.pfn);
- if ( start_info == NULL )
- {
- DOMPRINTF("Unable to map HVM start info page");
- return -1;
- }
-
- modlist = (void*)(start_info + 1) + dom->cmdline_size;
-
- if ( !dom->device_model )
- {
- if ( dom->cmdline )
- {
- char *cmdline = (void*)(start_info + 1);
-
- strncpy(cmdline, dom->cmdline, dom->cmdline_size);
- start_info->cmdline_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
- ((uintptr_t)cmdline - (uintptr_t)start_info);
- }
-
- /* ACPI module 0 is the RSDP */
- start_info->rsdp_paddr = dom->acpi_modules[0].guest_addr_out ? : 0;
- }
- else
- {
- add_module_to_list(dom, &dom->system_firmware_module, "firmware",
- modlist, start_info);
- }
-
- for ( i = 0; i < dom->num_modules; i++ )
- {
- struct xc_hvm_firmware_module mod;
- uint64_t base = dom->parms.virt_base != UNSET_ADDR ?
- dom->parms.virt_base : 0;
-
- mod.guest_addr_out =
- dom->modules[i].seg.vstart - base;
- mod.length =
- dom->modules[i].seg.vend - dom->modules[i].seg.vstart;
-
- DOMPRINTF("Adding module %u guest_addr %"PRIx64" len %u",
- i, mod.guest_addr_out, mod.length);
-
- add_module_to_list(dom, &mod, dom->modules[i].cmdline,
- modlist, start_info);
- }
-
- if ( start_info->nr_modules )
- {
- start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
- ((uintptr_t)modlist - (uintptr_t)start_info);
- }
-
- /*
- * Check a couple of XEN_HVM_MEMMAP_TYPEs to verify consistency with
- * their corresponding e820 numerical values.
- */
- BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_RAM != E820_RAM);
- BUILD_BUG_ON(XEN_HVM_MEMMAP_TYPE_ACPI != E820_ACPI);
-
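-    /*
-     * The module list and its command lines occupy a fixed-size area of
-     * HVMLOADER_MODULE_MAX_COUNT entries; the memory map is placed
-     * directly after it (see the layout above add_module_to_list()).
-     */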
- modsize = HVMLOADER_MODULE_MAX_COUNT *
- (sizeof(*modlist) + HVMLOADER_MODULE_CMDLINE_SIZE);
- memmap = (void*)modlist + modsize;
-
- start_info->memmap_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
- ((uintptr_t)modlist - (uintptr_t)start_info) + modsize;
- start_info->memmap_entries = dom->e820_entries;
- for ( i = 0; i < dom->e820_entries; i++ )
- {
- memmap[i].addr = dom->e820[i].addr;
- memmap[i].size = dom->e820[i].size;
- memmap[i].type = dom->e820[i].type;
- }
-
- start_info->magic = XEN_HVM_START_MAGIC_VALUE;
- start_info->version = 1;
-
- munmap(start_info, dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom));
-
- if ( dom->device_model )
- {
- void *hvm_info_page;
-
- if ( (hvm_info_page = xc_map_foreign_range(
- xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
- HVM_INFO_PFN)) == NULL )
- return -1;
- build_hvm_info(hvm_info_page, dom);
- munmap(hvm_info_page, PAGE_SIZE);
- }
-
- return 0;
-}
-
-bool xc_dom_translated(const struct xc_dom_image *dom)
-{
- /* HVM guests are translated. PV guests are not. */
- return dom->container_type == XC_DOM_HVM_CONTAINER;
-}
-
-/* ------------------------------------------------------------------------ */
-
-static struct xc_dom_arch xc_dom_32_pae = {
- .guest_type = "xen-3.0-x86_32p",
- .native_protocol = XEN_IO_PROTO_ABI_X86_32,
- .page_shift = PAGE_SHIFT_X86,
- .sizeof_pfn = 4,
- .p2m_base_supported = 0,
- .arch_private_size = sizeof(struct xc_dom_image_x86),
- .alloc_magic_pages = alloc_magic_pages_pv,
- .alloc_pgtables = alloc_pgtables_x86_32_pae,
- .alloc_p2m_list = alloc_p2m_list_x86_32,
- .setup_pgtables = setup_pgtables_x86_32_pae,
- .start_info = start_info_x86_32,
- .shared_info = shared_info_x86_32,
- .vcpu = vcpu_x86_32,
- .meminit = meminit_pv,
- .bootearly = bootearly,
- .bootlate = bootlate_pv,
-};
-
-static struct xc_dom_arch xc_dom_64 = {
- .guest_type = "xen-3.0-x86_64",
- .native_protocol = XEN_IO_PROTO_ABI_X86_64,
- .page_shift = PAGE_SHIFT_X86,
- .sizeof_pfn = 8,
- .p2m_base_supported = 1,
- .arch_private_size = sizeof(struct xc_dom_image_x86),
- .alloc_magic_pages = alloc_magic_pages_pv,
- .alloc_pgtables = alloc_pgtables_x86_64,
- .alloc_p2m_list = alloc_p2m_list_x86_64,
- .setup_pgtables = setup_pgtables_x86_64,
- .start_info = start_info_x86_64,
- .shared_info = shared_info_x86_64,
- .vcpu = vcpu_x86_64,
- .meminit = meminit_pv,
- .bootearly = bootearly,
- .bootlate = bootlate_pv,
-};
-
-static struct xc_dom_arch xc_hvm_32 = {
- .guest_type = "hvm-3.0-x86_32",
- .native_protocol = XEN_IO_PROTO_ABI_X86_32,
- .page_shift = PAGE_SHIFT_X86,
- .sizeof_pfn = 4,
- .alloc_magic_pages = alloc_magic_pages_hvm,
- .vcpu = vcpu_hvm,
- .meminit = meminit_hvm,
- .bootearly = bootearly,
- .bootlate = bootlate_hvm,
-};
-
-static void __init register_arch_hooks(void)
-{
- xc_dom_register_arch_hooks(&xc_dom_32_pae);
- xc_dom_register_arch_hooks(&xc_dom_64);
- xc_dom_register_arch_hooks(&xc_hvm_32);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * xg_domain.c
- *
- * API for manipulating and obtaining information on domains.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include "xg_private.h"
-#include "xc_core.h"
-
-int xc_unmap_domain_meminfo(xc_interface *xch, struct xc_domain_meminfo *minfo)
-{
- struct domain_info_context _di = { .guest_width = minfo->guest_width,
- .p2m_size = minfo->p2m_size};
- struct domain_info_context *dinfo = &_di;
-
- free(minfo->pfn_type);
- if ( minfo->p2m_table )
- munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
- minfo->p2m_table = NULL;
-
- return 0;
-}
-
-int xc_map_domain_meminfo(xc_interface *xch, uint32_t domid,
- struct xc_domain_meminfo *minfo)
-{
- struct domain_info_context _di;
- struct domain_info_context *dinfo = &_di;
-
- xc_dominfo_t info;
- shared_info_any_t *live_shinfo;
- xen_capabilities_info_t xen_caps = "";
- int i;
-
-    /* Must only be initialized once */
- if ( minfo->pfn_type || minfo->p2m_table )
- {
- errno = EINVAL;
- return -1;
- }
-
- if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
- {
- PERROR("Could not get domain info");
- return -1;
- }
-
- if ( xc_domain_get_guest_width(xch, domid, &minfo->guest_width) )
- {
- PERROR("Could not get domain address size");
- return -1;
- }
- _di.guest_width = minfo->guest_width;
-
-    /* Get page table levels (see get_platform_info() in xg_save_restore.h) */
- if ( xc_version(xch, XENVER_capabilities, &xen_caps) )
- {
- PERROR("Could not get Xen capabilities (for page table levels)");
- return -1;
- }
- if ( strstr(xen_caps, "xen-3.0-x86_64") )
- /* Depends on whether it's a compat 32-on-64 guest */
- minfo->pt_levels = ( (minfo->guest_width == 8) ? 4 : 3 );
- else if ( strstr(xen_caps, "xen-3.0-x86_32p") )
- minfo->pt_levels = 3;
- else if ( strstr(xen_caps, "xen-3.0-x86_32") )
- minfo->pt_levels = 2;
- else
- {
- errno = EFAULT;
- return -1;
- }
-
- /* We need the shared info page for mapping the P2M */
- live_shinfo = xc_map_foreign_range(xch, domid, PAGE_SIZE, PROT_READ,
- info.shared_info_frame);
- if ( !live_shinfo )
- {
- PERROR("Could not map the shared info frame (MFN 0x%lx)",
- info.shared_info_frame);
- return -1;
- }
-
- if ( xc_core_arch_map_p2m_writable(xch, minfo->guest_width, &info,
- live_shinfo, &minfo->p2m_table,
- &minfo->p2m_size) )
- {
- PERROR("Could not map the P2M table");
- munmap(live_shinfo, PAGE_SIZE);
- return -1;
- }
- munmap(live_shinfo, PAGE_SIZE);
- _di.p2m_size = minfo->p2m_size;
-
- /* Make space and prepare for getting the PFN types */
- minfo->pfn_type = calloc(sizeof(*minfo->pfn_type), minfo->p2m_size);
- if ( !minfo->pfn_type )
- {
- PERROR("Could not allocate memory for the PFN types");
- goto failed;
- }
- for ( i = 0; i < minfo->p2m_size; i++ )
- minfo->pfn_type[i] = xc_pfn_to_mfn(i, minfo->p2m_table,
- minfo->guest_width);
-
- /* Retrieve PFN types in batches */
-    for ( i = 0; i < minfo->p2m_size; i += 1024 )
-    {
-        int count = ((minfo->p2m_size - i) > 1024) ?
-                    1024 : (minfo->p2m_size - i);
-
-        if ( xc_get_pfn_type_batch(xch, domid, count, minfo->pfn_type + i) )
-        {
-            PERROR("Could not get batch %d of PFN types", (i + 1) / 1024);
- goto failed;
- }
- }
-
- return 0;
-
-failed:
- if ( minfo->pfn_type )
- {
- free(minfo->pfn_type);
- minfo->pfn_type = NULL;
- }
- if ( minfo->p2m_table )
- {
- munmap(minfo->p2m_table, P2M_FL_ENTRIES * PAGE_SIZE);
- minfo->p2m_table = NULL;
- }
-
- return -1;
-}
+++ /dev/null
-/******************************************************************************
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- *
- * Copyright (c) 2011, Citrix Systems
- */
-
-#include <inttypes.h>
-#include <errno.h>
-#include <xenctrl.h>
-#include <xenguest.h>
-
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t flags,
- struct save_callbacks *callbacks,
- xc_stream_type_t stream_type, int recv_fd)
-{
- errno = ENOSYS;
- return -1;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
- unsigned int store_evtchn, unsigned long *store_mfn,
- uint32_t store_domid, unsigned int console_evtchn,
- unsigned long *console_mfn, uint32_t console_domid,
- xc_stream_type_t stream_type,
- struct restore_callbacks *callbacks, int send_back_fd)
-{
- errno = ENOSYS;
- return -1;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * xc_offline_page.c
- *
- * Helper functions to offline/online one page
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2009, Intel Corporation.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <xc_core.h>
-
-#include "xc_private.h"
-#include "xenctrl_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-struct pte_backup_entry
-{
- xen_pfn_t table_mfn;
- int offset;
-};
-
-#define DEFAULT_BACKUP_COUNT 1024
-struct pte_backup
-{
- struct pte_backup_entry *entries;
- int max;
- int cur;
-};
-
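-/*
- * The PTE<->PFN translation macros (MFN_MASK_X86 etc. in xg_private.h)
- * expect a struct domain_info_context named "dinfo" to be in scope.
- */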
-static struct domain_info_context _dinfo;
-static struct domain_info_context *dinfo = &_dinfo;
-
-int xc_mark_page_online(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status)
-{
- DECLARE_SYSCTL;
- DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
- int ret = -1;
-
- if ( !status || (end < start) )
- {
- errno = EINVAL;
- return -1;
- }
- if ( xc_hypercall_bounce_pre(xch, status) )
- {
- ERROR("Could not bounce memory for xc_mark_page_online\n");
- return -1;
- }
-
- sysctl.cmd = XEN_SYSCTL_page_offline_op;
- sysctl.u.page_offline.start = start;
- sysctl.u.page_offline.cmd = sysctl_page_online;
- sysctl.u.page_offline.end = end;
- set_xen_guest_handle(sysctl.u.page_offline.status, status);
- ret = xc_sysctl(xch, &sysctl);
-
- xc_hypercall_bounce_post(xch, status);
-
- return ret;
-}
-
-int xc_mark_page_offline(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status)
-{
- DECLARE_SYSCTL;
- DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
- int ret = -1;
-
- if ( !status || (end < start) )
- {
- errno = EINVAL;
- return -1;
- }
- if ( xc_hypercall_bounce_pre(xch, status) )
- {
- ERROR("Could not bounce memory for xc_mark_page_offline");
- return -1;
- }
-
- sysctl.cmd = XEN_SYSCTL_page_offline_op;
- sysctl.u.page_offline.start = start;
- sysctl.u.page_offline.cmd = sysctl_page_offline;
- sysctl.u.page_offline.end = end;
- set_xen_guest_handle(sysctl.u.page_offline.status, status);
- ret = xc_sysctl(xch, &sysctl);
-
- xc_hypercall_bounce_post(xch, status);
-
- return ret;
-}
-
-int xc_query_page_offline_status(xc_interface *xch, unsigned long start,
- unsigned long end, uint32_t *status)
-{
- DECLARE_SYSCTL;
- DECLARE_HYPERCALL_BOUNCE(status, sizeof(uint32_t)*(end - start + 1), XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
- int ret = -1;
-
- if ( !status || (end < start) )
- {
- errno = EINVAL;
- return -1;
- }
- if ( xc_hypercall_bounce_pre(xch, status) )
- {
- ERROR("Could not bounce memory for xc_query_page_offline_status\n");
- return -1;
- }
-
- sysctl.cmd = XEN_SYSCTL_page_offline_op;
- sysctl.u.page_offline.start = start;
- sysctl.u.page_offline.cmd = sysctl_query_page_offline;
- sysctl.u.page_offline.end = end;
- set_xen_guest_handle(sysctl.u.page_offline.status, status);
- ret = xc_sysctl(xch, &sysctl);
-
- xc_hypercall_bounce_post(xch, status);
-
- return ret;
-}
-
-/*
- * There should be no updates to the grant table while the domain is paused.
- */
-static int xc_is_page_granted_v1(xc_interface *xch, xen_pfn_t gpfn,
- grant_entry_v1_t *gnttab, int gnt_num)
-{
- int i = 0;
-
- if (!gnttab)
- return 0;
-
- for (i = 0; i < gnt_num; i++)
- if ( ((gnttab[i].flags & GTF_type_mask) != GTF_invalid) &&
- (gnttab[i].frame == gpfn) )
- break;
-
- return (i != gnt_num);
-}
-
-static int xc_is_page_granted_v2(xc_interface *xch, xen_pfn_t gpfn,
- grant_entry_v2_t *gnttab, int gnt_num)
-{
- int i = 0;
-
- if (!gnttab)
- return 0;
-
- for (i = 0; i < gnt_num; i++)
- if ( ((gnttab[i].hdr.flags & GTF_type_mask) != GTF_invalid) &&
- (gnttab[i].full_page.frame == gpfn) )
- break;
-
- return (i != gnt_num);
-}
-
-static int backup_ptes(xen_pfn_t table_mfn, int offset,
- struct pte_backup *backup)
-{
- if (!backup)
- return -EINVAL;
-
-    if (backup->max == backup->cur)
-    {
-        /* Use a temporary so the old buffer isn't leaked if realloc fails. */
-        struct pte_backup_entry *entries = realloc(backup->entries,
-                backup->max * 2 * sizeof(struct pte_backup_entry));
-
-        if (entries == NULL)
-            return -1;
-
-        backup->entries = entries;
-        backup->max *= 2;
-    }
-
- backup->entries[backup->cur].table_mfn = table_mfn;
- backup->entries[backup->cur++].offset = offset;
-
- return 0;
-}
-
-/*
- * Return:
- * 1 when an MMU update is required
- * 0 when no changes are needed
- * <0 on error
- */
-typedef int (*pte_func)(xc_interface *xch,
- uint64_t pte, uint64_t *new_pte,
- unsigned long table_mfn, int table_offset,
- struct pte_backup *backup,
- unsigned long no_use);
-
-static int __clear_pte(xc_interface *xch,
- uint64_t pte, uint64_t *new_pte,
- unsigned long table_mfn, int table_offset,
- struct pte_backup *backup,
- unsigned long mfn)
-{
-    /* Both the new_pte pointer and the backup list are required. */
- if (!new_pte || !backup)
- return -EINVAL;
-
-    if ( !(pte & _PAGE_PRESENT) )
- return 0;
-
- /* XXX Check for PSE bit here */
- /* Hit one entry */
- if ( ((pte >> PAGE_SHIFT_X86) & MFN_MASK_X86) == mfn)
- {
- *new_pte = pte & ~_PAGE_PRESENT;
- if (!backup_ptes(table_mfn, table_offset, backup))
- return 1;
- }
-
- return 0;
-}
-
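-/*
- * Counterpart of __clear_pte: re-point PTEs recorded in the backup list
- * at new_mfn and mark them present again.
- */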
-static int __update_pte(xc_interface *xch,
- uint64_t pte, uint64_t *new_pte,
- unsigned long table_mfn, int table_offset,
- struct pte_backup *backup,
- unsigned long new_mfn)
-{
- int index;
-
- if (!new_pte)
- return 0;
-
-    for (index = 0; index < backup->cur; index++)
- if ( (backup->entries[index].table_mfn == table_mfn) &&
- (backup->entries[index].offset == table_offset) )
- break;
-
- if (index != backup->cur)
- {
- if (pte & _PAGE_PRESENT)
- ERROR("Page present while in backup ptes\n");
- pte &= ~MFN_MASK_X86;
- pte |= (new_mfn << PAGE_SHIFT_X86) | _PAGE_PRESENT;
- *new_pte = pte;
- return 1;
- }
-
- return 0;
-}
-
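-/*
- * Walk every page-table page of the domain, apply func to each PTE, and
- * queue the resulting MMU updates, flushing them at the end.
- */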
-static int change_pte(xc_interface *xch, uint32_t domid,
- struct xc_domain_meminfo *minfo,
- struct pte_backup *backup,
- struct xc_mmu *mmu,
- pte_func func,
- unsigned long data)
-{
- int pte_num, rc;
- uint64_t i;
- void *content = NULL;
-
- pte_num = PAGE_SIZE / ((minfo->pt_levels == 2) ? 4 : 8);
-
- for (i = 0; i < minfo->p2m_size; i++)
- {
- xen_pfn_t table_mfn = xc_pfn_to_mfn(i, minfo->p2m_table,
- minfo->guest_width);
- uint64_t pte, new_pte;
- int j;
-
- if ( (table_mfn == INVALID_PFN) ||
- ((minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
- XEN_DOMCTL_PFINFO_XTAB) )
- continue;
-
- if ( minfo->pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
- {
- content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
- PROT_READ, table_mfn);
- if (!content)
- goto failed;
-
- for (j = 0; j < pte_num; j++)
- {
- if ( minfo->pt_levels == 2 )
- pte = ((const uint32_t*)content)[j];
- else
- pte = ((const uint64_t*)content)[j];
-
- rc = func(xch, pte, &new_pte, table_mfn, j, backup, data);
-
- switch (rc)
- {
- case 1:
- if ( xc_add_mmu_update(xch, mmu,
- table_mfn << PAGE_SHIFT |
- j * ( (minfo->pt_levels == 2) ?
- sizeof(uint32_t): sizeof(uint64_t)) |
- MMU_PT_UPDATE_PRESERVE_AD,
- new_pte) )
- goto failed;
- break;
-
- case 0:
- break;
-
- default:
- goto failed;
- }
- }
-
- munmap(content, PAGE_SIZE);
- content = NULL;
- }
- }
-
- if ( xc_flush_mmu_updates(xch, mmu) )
- goto failed;
-
- return 0;
-failed:
-    /* XXX Shall we take action if we have failed to swap? */
- if (content)
- munmap(content, PAGE_SIZE);
-
- return -1;
-}
-
-static int update_pte(xc_interface *xch, uint32_t domid,
- struct xc_domain_meminfo *minfo,
- struct pte_backup *backup,
- struct xc_mmu *mmu,
- unsigned long new_mfn)
-{
- return change_pte(xch, domid, minfo, backup, mmu,
- __update_pte, new_mfn);
-}
-
-static int clear_pte(xc_interface *xch, uint32_t domid,
- struct xc_domain_meminfo *minfo,
- struct pte_backup *backup,
- struct xc_mmu *mmu,
- xen_pfn_t mfn)
-{
- return change_pte(xch, domid, minfo, backup, mmu,
- __clear_pte, mfn);
-}
-
-/*
- * Check if a page can be exchanged successfully
- */
-
-static int is_page_exchangeable(xc_interface *xch, uint32_t domid, xen_pfn_t mfn,
- xc_dominfo_t *info)
-{
- uint32_t status;
- int rc;
-
-    /* Domain checking */
-    if ( !domid || (domid > DOMID_FIRST_RESERVED) )
-    {
-        DPRINTF("Pages of dom0 or reserved domains can't be exchanged live");
-        return 0;
-    }
-    if (info->hvm)
-    {
-        DPRINTF("Currently only a PV guest's pages can be exchanged live\n");
-        return 0;
-    }
-
- /* Check if pages are offline pending or not */
- rc = xc_query_page_offline_status(xch, mfn, mfn, &status);
-
- if ( rc || !(status & PG_OFFLINE_STATUS_OFFLINE_PENDING) )
- {
- ERROR("Page %lx is not offline pending %x\n",
- mfn, status);
- return 0;
- }
-
- return 1;
-}
-
-xen_pfn_t *xc_map_m2p(xc_interface *xch,
- unsigned long max_mfn,
- int prot,
- unsigned long *mfn0)
-{
- privcmd_mmap_entry_t *entries;
- unsigned long m2p_chunks, m2p_size;
- xen_pfn_t *m2p;
- xen_pfn_t *extent_start;
- int i;
-
- m2p = NULL;
- m2p_size = M2P_SIZE(max_mfn);
- m2p_chunks = M2P_CHUNKS(max_mfn);
-
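-    /*
-     * The M2P table is published by Xen in fixed-size chunks owned by
-     * DOMID_XEN: look up the MFN of each chunk, then map them as one
-     * contiguous range.
-     */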
- extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
- if ( !extent_start )
- {
- ERROR("failed to allocate space for m2p mfns");
- goto err0;
- }
-
- if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) )
- {
- PERROR("xc_get_m2p_mfns");
- goto err1;
- }
-
- entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
- if (entries == NULL)
- {
- ERROR("failed to allocate space for mmap entries");
- goto err1;
- }
-
- for ( i = 0; i < m2p_chunks; i++ )
- entries[i].mfn = extent_start[i];
-
- m2p = xc_map_foreign_ranges(xch, DOMID_XEN,
- m2p_size, prot, M2P_CHUNK_SIZE,
- entries, m2p_chunks);
- if (m2p == NULL)
- {
- PERROR("xc_mmap_foreign_ranges failed");
- goto err2;
- }
-
- if (mfn0)
- *mfn0 = entries[0].mfn;
-
-err2:
- free(entries);
-err1:
- free(extent_start);
-
-err0:
- return m2p;
-}
-
-/* The domain should be suspended when called here */
-int xc_exchange_page(xc_interface *xch, uint32_t domid, xen_pfn_t mfn)
-{
- xc_dominfo_t info;
- struct xc_domain_meminfo minfo;
- struct xc_mmu *mmu = NULL;
- struct pte_backup old_ptes = {NULL, 0, 0};
- grant_entry_v1_t *gnttab_v1 = NULL;
- grant_entry_v2_t *gnttab_v2 = NULL;
- struct mmuext_op mops;
-    int gnt_num, unpinned = 0;
- void *old_p, *backup = NULL;
- int rc, result = -1;
- uint32_t status;
- xen_pfn_t new_mfn, gpfn;
- xen_pfn_t *m2p_table;
- unsigned long max_mfn;
-
- if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
- {
- ERROR("Could not get domain info");
- return -1;
- }
-
- if (!info.shutdown || info.shutdown_reason != SHUTDOWN_suspend)
- {
- errno = EINVAL;
- ERROR("Can't exchange page unless domain is suspended\n");
- return -1;
- }
-    if (!is_page_exchangeable(xch, domid, mfn, &info))
- {
- ERROR("Could not exchange page\n");
- return -1;
- }
-
- /* Map M2P and obtain gpfn */
- rc = xc_maximum_ram_page(xch, &max_mfn);
- if ( rc || !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, NULL)) )
- {
- PERROR("Failed to map live M2P table");
- return -1;
- }
- gpfn = m2p_table[mfn];
-
- /* Map domain's memory information */
- memset(&minfo, 0, sizeof(minfo));
- if ( xc_map_domain_meminfo(xch, domid, &minfo) )
- {
- PERROR("Could not map domain's memory information\n");
- goto failed;
- }
-
- /* For translation macros */
- dinfo->guest_width = minfo.guest_width;
- dinfo->p2m_size = minfo.p2m_size;
-
- /* Don't exchange CR3 for PAE guest in PAE host environment */
- if (minfo.guest_width > sizeof(long))
- {
- if ( (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
- XEN_DOMCTL_PFINFO_L3TAB )
- goto failed;
- }
-
- gnttab_v2 = xc_gnttab_map_table_v2(xch, domid, &gnt_num);
- if (!gnttab_v2)
- {
- gnttab_v1 = xc_gnttab_map_table_v1(xch, domid, &gnt_num);
- if (!gnttab_v1)
- {
- ERROR("Failed to map grant table\n");
- goto failed;
- }
- }
-
- if (gnttab_v1
- ? xc_is_page_granted_v1(xch, mfn, gnttab_v1, gnt_num)
- : xc_is_page_granted_v2(xch, mfn, gnttab_v2, gnt_num))
- {
- ERROR("Page %lx is granted now\n", mfn);
- goto failed;
- }
-
- /* allocate required data structure */
- backup = malloc(PAGE_SIZE);
- if (!backup)
- {
- ERROR("Failed to allocate backup pages pointer\n");
- goto failed;
- }
-
- old_ptes.max = DEFAULT_BACKUP_COUNT;
- old_ptes.entries = malloc(sizeof(struct pte_backup_entry) *
- DEFAULT_BACKUP_COUNT);
-
- if (!old_ptes.entries)
- {
- ERROR("Faield to allocate backup\n");
- goto failed;
- }
- old_ptes.cur = 0;
-
-    /* Unpin the page if it is pinned */
- if (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB)
- {
- mops.cmd = MMUEXT_UNPIN_TABLE;
- mops.arg1.mfn = mfn;
-
- if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
- {
- ERROR("Failed to unpin page %lx", mfn);
- goto failed;
- }
- mops.arg1.mfn = mfn;
-        unpinned = 1;
- }
-
- /* backup the content */
- old_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
- PROT_READ, mfn);
- if (!old_p)
- {
- ERROR("Failed to map foreign page %lx\n", mfn);
- goto failed;
- }
-
- memcpy(backup, old_p, PAGE_SIZE);
- munmap(old_p, PAGE_SIZE);
-
- mmu = xc_alloc_mmu_updates(xch, domid);
- if ( mmu == NULL )
- {
- ERROR("%s: failed at %d\n", __FUNCTION__, __LINE__);
- goto failed;
- }
-
-    /* First invalidate every PTE that references the page. */
- rc = clear_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
-
- if (rc)
- {
- ERROR("clear pte failed\n");
- goto failed;
- }
-
- rc = xc_domain_memory_exchange_pages(xch, domid,
- 1, 0, &mfn,
- 1, 0, &new_mfn);
-
- if (rc)
- {
- ERROR("Exchange the page failed\n");
-        /* Exchange failure means there are still references to the page */
- rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, mfn);
- if (rc)
- result = -2;
- goto failed;
- }
-
- rc = update_pte(xch, domid, &minfo, &old_ptes, mmu, new_mfn);
-
- if (rc)
- {
- ERROR("update pte failed guest may be broken now\n");
- /* No recover action now for swap fail */
- result = -2;
- goto failed;
- }
-
- /* Check if pages are offlined already */
- rc = xc_query_page_offline_status(xch, mfn, mfn,
- &status);
-
-    if (rc)
-    {
-        ERROR("Failed to query offline status\n");
-    }
-    else if ( !(status & PG_OFFLINE_STATUS_OFFLINED) )
-    {
- ERROR("page is still online or pending\n");
- goto failed;
- }
- else
- {
- void *new_p;
- IPRINTF("Now page is offlined %lx\n", mfn);
- /* Update the p2m table */
- minfo.p2m_table[gpfn] = new_mfn;
-
- new_p = xc_map_foreign_range(xch, domid, PAGE_SIZE,
- PROT_READ|PROT_WRITE, new_mfn);
- if ( new_p == NULL )
- {
- ERROR("failed to map new_p for copy, guest may be broken?");
- goto failed;
- }
- memcpy(new_p, backup, PAGE_SIZE);
- munmap(new_p, PAGE_SIZE);
- mops.arg1.mfn = new_mfn;
- result = 0;
- }
-
-failed:
-
-    if (unpinned && (minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LPINTAB))
-    {
-        switch ( minfo.pfn_type[gpfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
- {
- case XEN_DOMCTL_PFINFO_L1TAB:
- mops.cmd = MMUEXT_PIN_L1_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L2TAB:
- mops.cmd = MMUEXT_PIN_L2_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L3TAB:
- mops.cmd = MMUEXT_PIN_L3_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L4TAB:
- mops.cmd = MMUEXT_PIN_L4_TABLE;
- break;
-
- default:
- ERROR("Unpined for non pate table page\n");
- break;
- }
-
- if ( xc_mmuext_op(xch, &mops, 1, domid) < 0 )
- {
- ERROR("failed to pin the mfn again\n");
- result = -2;
- }
- }
-
- free(mmu);
-
- free(old_ptes.entries);
-
- free(backup);
-
- if (gnttab_v1)
- munmap(gnttab_v1, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v1_t)));
- if (gnttab_v2)
- munmap(gnttab_v2, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v2_t)));
-
- xc_unmap_domain_meminfo(xch, &minfo);
- munmap(m2p_table, M2P_SIZE(max_mfn));
-
- return result;
-}
+++ /dev/null
-/******************************************************************************
- * xg_private.c
- *
- * Helper functions for the rest of the library.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <zlib.h>
-
-#include "xg_private.h"
-
-char *xc_read_image(xc_interface *xch,
- const char *filename, unsigned long *size)
-{
- int kernel_fd = -1;
- gzFile kernel_gfd = NULL;
- char *image = NULL, *tmp;
-    int bytes;
-
- if ( (filename == NULL) || (size == NULL) )
- return NULL;
-
- if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
- {
- PERROR("Could not open kernel image '%s'", filename);
- goto out;
- }
-
- if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
- {
- PERROR("Could not allocate decompression state for state file");
- goto out;
- }
-
- *size = 0;
-
-#define CHUNK 1*1024*1024
-    while ( 1 )
- {
- if ( (tmp = realloc(image, *size + CHUNK)) == NULL )
- {
- PERROR("Could not allocate memory for kernel image");
- free(image);
- image = NULL;
- goto out;
- }
- image = tmp;
-
- bytes = gzread(kernel_gfd, image + *size, CHUNK);
- switch (bytes)
- {
- case -1:
- PERROR("Error reading kernel image");
- free(image);
- image = NULL;
- goto out;
- case 0: /* EOF */
- if ( *size == 0 )
- {
- PERROR("Could not read kernel image");
- free(image);
- image = NULL;
- }
- goto out;
- default:
- *size += bytes;
- break;
- }
- }
-#undef CHUNK
-
- out:
- if ( image )
- {
- /* Shrink allocation to fit image. */
- tmp = realloc(image, *size);
- if ( tmp )
- image = tmp;
- }
-
- if ( kernel_gfd != NULL )
- gzclose(kernel_gfd);
- else if ( kernel_fd >= 0 )
- close(kernel_fd);
- return image;
-}
-
-char *xc_inflate_buffer(xc_interface *xch,
- const char *in_buf, unsigned long in_size,
- unsigned long *out_size)
-{
- int sts;
- z_stream zStream;
- unsigned long out_len;
- char *out_buf;
-
- /* Not compressed? Then return the original buffer. */
- if ( ((unsigned char)in_buf[0] != 0x1F) ||
- ((unsigned char)in_buf[1] != 0x8B) )
- {
- if ( out_size != NULL )
- *out_size = in_size;
- return (char *)in_buf;
- }
-
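-    /*
-     * The last four bytes of a gzip stream hold the uncompressed length
-     * (ISIZE) modulo 2^32, in little-endian order.
-     */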
- out_len = (unsigned char)in_buf[in_size-4] +
- (256 * ((unsigned char)in_buf[in_size-3] +
- (256 * ((unsigned char)in_buf[in_size-2] +
- (256 * (unsigned char)in_buf[in_size-1])))));
-
- memset(&zStream, 0, sizeof(zStream));
- out_buf = malloc(out_len + 16); /* Leave a little extra space */
- if ( out_buf == NULL )
- {
- ERROR("Error mallocing buffer\n");
- return NULL;
- }
-
- zStream.next_in = (unsigned char *)in_buf;
- zStream.avail_in = in_size;
- zStream.next_out = (unsigned char *)out_buf;
- zStream.avail_out = out_len+16;
- sts = inflateInit2(&zStream, (MAX_WBITS+32)); /* +32 means "handle gzip" */
- if ( sts != Z_OK )
- {
- ERROR("inflateInit failed, sts %d\n", sts);
- free(out_buf);
- return NULL;
- }
-
- /* Inflate in one pass/call */
- sts = inflate(&zStream, Z_FINISH);
- inflateEnd(&zStream);
- if ( sts != Z_STREAM_END )
- {
- ERROR("inflate failed, sts %d\n", sts);
- free(out_buf);
- return NULL;
- }
-
- if ( out_size != NULL )
- *out_size = out_len;
-
- return out_buf;
-}
-
-/*******************/
-
-int pin_table(
- xc_interface *xch, unsigned int type, unsigned long mfn, uint32_t dom)
-{
- struct mmuext_op op;
-
- op.cmd = type;
- op.arg1.mfn = mfn;
-
- if ( xc_mmuext_op(xch, &op, 1, dom) < 0 )
- return 1;
-
- return 0;
-}
-
-/* This is shared between save and restore, and may generally be useful. */
-unsigned long csum_page(void *page)
-{
- int i;
- unsigned long *p = page;
- unsigned long long sum=0;
-
- for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ )
- sum += p[i];
-
- return sum ^ (sum>>32);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef XG_PRIVATE_H
-#define XG_PRIVATE_H
-
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "xc_private.h"
-#include "xenguest.h"
-
-#include <xen/memory.h>
-#include <xen/elfnote.h>
-
-#ifndef ELFSIZE
-#include <limits.h>
-#if UINT_MAX == ULONG_MAX
-#define ELFSIZE 32
-#else
-#define ELFSIZE 64
-#endif
-#endif
-
-char *xc_read_image(xc_interface *xch,
- const char *filename, unsigned long *size);
-char *xc_inflate_buffer(xc_interface *xch,
- const char *in_buf,
- unsigned long in_size,
- unsigned long *out_size);
-
-unsigned long csum_page (void * page);
-
-#define _PAGE_PRESENT 0x001
-#define _PAGE_RW 0x002
-#define _PAGE_USER 0x004
-#define _PAGE_PWT 0x008
-#define _PAGE_PCD 0x010
-#define _PAGE_ACCESSED 0x020
-#define _PAGE_DIRTY 0x040
-#define _PAGE_PAT 0x080
-#define _PAGE_PSE 0x080
-#define _PAGE_GLOBAL 0x100
-
-#define VIRT_BITS_I386 32
-#define VIRT_BITS_X86_64 48
-
-#define PGTBL_LEVELS_I386 3
-#define PGTBL_LEVELS_X86_64 4
-
-#define PGTBL_LEVEL_SHIFT_X86 9
-
-#define L1_PAGETABLE_SHIFT_PAE 12
-#define L2_PAGETABLE_SHIFT_PAE 21
-#define L3_PAGETABLE_SHIFT_PAE 30
-#define L1_PAGETABLE_ENTRIES_PAE 512
-#define L2_PAGETABLE_ENTRIES_PAE 512
-#define L3_PAGETABLE_ENTRIES_PAE 4
-
-#define L1_PAGETABLE_SHIFT_X86_64 12
-#define L2_PAGETABLE_SHIFT_X86_64 21
-#define L3_PAGETABLE_SHIFT_X86_64 30
-#define L4_PAGETABLE_SHIFT_X86_64 39
-#define L1_PAGETABLE_ENTRIES_X86_64 512
-#define L2_PAGETABLE_ENTRIES_X86_64 512
-#define L3_PAGETABLE_ENTRIES_X86_64 512
-#define L4_PAGETABLE_ENTRIES_X86_64 512
-
-typedef uint64_t x86_pgentry_t;
-
-#define PAGE_SHIFT_ARM 12
-#define PAGE_SIZE_ARM (1UL << PAGE_SHIFT_ARM)
-#define PAGE_MASK_ARM (~(PAGE_SIZE_ARM-1))
-
-#define PAGE_SHIFT_X86 12
-#define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86)
-#define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1))
-
-#define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
-
-static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
- unsigned gwidth)
-{
- if ( gwidth == sizeof(uint64_t) )
- /* 64 bit guest. Need to truncate their pfns for 32 bit toolstacks. */
- return ((uint64_t *)p2m)[pfn];
- else
- {
- /* 32 bit guest. Need to expand INVALID_MFN for 64 bit toolstacks. */
- uint32_t mfn = ((uint32_t *)p2m)[pfn];
-
- return mfn == ~0U ? INVALID_MFN : mfn;
- }
-}
-
-
-/* Masks for PTE<->PFN conversions */
-#define MADDR_BITS_X86 ((dinfo->guest_width == 8) ? 52 : 44)
-#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1)
-#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86)
-
-int pin_table(xc_interface *xch, unsigned int type, unsigned long mfn,
- uint32_t dom);
-
-#endif /* XG_PRIVATE_H */
+++ /dev/null
-/*
- * Definitions and utilities for save / restore.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "xc_private.h"
-
-#include <xen/foreign/x86_32.h>
-#include <xen/foreign/x86_64.h>
-
-/*
-** We process save/restore/migrate in batches of pages; the below
-** determines how many pages we (at maximum) deal with in each batch.
-*/
-#define MAX_BATCH_SIZE 1024 /* up to 1024 pages (4MB) at a time */
-
-/* When pinning page tables at the end of restore, we also use batching. */
-#define MAX_PIN_BATCH 1024
-
-/*
-** Determine various platform information required for save/restore, in
-** particular:
-**
-** - the maximum MFN on this machine, used to compute the size of
-** the M2P table;
-**
-** - the starting virtual address of the hypervisor; we use this
-**   to determine which parts of guest address space(s) do and don't
-**   require canonicalization during save/restore; and
-**
-** - the number of page-table levels for save/restore. This should
-** be a property of the domain, but for the moment we just read it
-** from the hypervisor.
-**
-** - The width of a guest word (unsigned long), in bytes.
-**
-** Returns 1 on success, 0 on failure.
-*/
-static inline int get_platform_info(xc_interface *xch, uint32_t dom,
- /* OUT */ unsigned long *max_mfn,
- /* OUT */ unsigned long *hvirt_start,
- /* OUT */ unsigned int *pt_levels,
- /* OUT */ unsigned int *guest_width)
-{
- xen_capabilities_info_t xen_caps = "";
- xen_platform_parameters_t xen_params;
-
- if (xc_version(xch, XENVER_platform_parameters, &xen_params) != 0)
- return 0;
-
- if (xc_version(xch, XENVER_capabilities, &xen_caps) != 0)
- return 0;
-
- if (xc_maximum_ram_page(xch, max_mfn))
- return 0;
-
- *hvirt_start = xen_params.virt_start;
-
- if ( xc_domain_get_guest_width(xch, dom, guest_width) != 0)
- return 0;
-
- /* 64-bit tools will see the 64-bit hvirt_start, but 32-bit guests
- * will be using the compat one. */
- if ( *guest_width < sizeof (unsigned long) )
- /* XXX need to fix up a way of extracting this value from Xen if
- * XXX it becomes variable for domU */
- *hvirt_start = 0xf5800000;
-
- if (strstr(xen_caps, "xen-3.0-x86_64"))
- /* Depends on whether it's a compat 32-on-64 guest */
- *pt_levels = ( (*guest_width == 8) ? 4 : 3 );
- else if (strstr(xen_caps, "xen-3.0-x86_32p"))
- *pt_levels = 3;
- else
- return 0;
-
- return 1;
-}
-
-
-/*
-** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
-** The M2P simply holds the corresponding PFN, while the top bit of a P2M
-** entry tells us whether or not the PFN is currently mapped.
-*/
-
-#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))
-
-
-/*
-** The M2P is made up of some number of 'chunks' of at least 2MB in size.
-** The below definitions and utility function(s) deal with mapping the M2P
-** regardless of the underlying machine memory size or architecture.
-*/
-#define M2P_SHIFT L2_PAGETABLE_SHIFT_PAE
-#define M2P_CHUNK_SIZE (1 << M2P_SHIFT)
-#define M2P_SIZE(_m) ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
-#define M2P_CHUNKS(_m) (M2P_SIZE((_m)) >> M2P_SHIFT)
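-
-/*
- * Worked example (editor's illustration, not in the original file): with a
- * 64bit toolstack (sizeof(xen_pfn_t) == 8) and a maximum MFN of 0x100000
- * (i.e. 4GB of RAM), the raw table is 8MB; M2P_SIZE(0x100000) rounds that
- * up to the 2MB chunk granularity, still 8MB, and M2P_CHUNKS(0x100000)
- * yields 4 mappable 2MB chunks.
- */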
-
-#define UNFOLD_CR3(_c) \
- ((uint64_t)((dinfo->guest_width == 8) \
- ? ((_c) >> 12) \
- : (((uint32_t)(_c) >> 12) | ((uint32_t)(_c) << 20))))
-
-#define FOLD_CR3(_c) \
- ((uint64_t)((dinfo->guest_width == 8) \
- ? ((uint64_t)(_c)) << 12 \
- : (((uint32_t)(_c) << 12) | ((uint32_t)(_c) >> 20))))
-
-#define MEMCPY_FIELD(_d, _s, _f, _w) do { \
- if ((_w) == 8) \
- memcpy(&(_d)->x64._f, &(_s)->x64._f,sizeof((_d)->x64._f)); \
- else \
- memcpy(&(_d)->x32._f, &(_s)->x32._f,sizeof((_d)->x32._f)); \
-} while (0)
-
-#define MEMSET_ARRAY_FIELD(_p, _f, _v, _w) do { \
- if ((_w) == 8) \
- memset(&(_p)->x64._f[0], (_v), sizeof((_p)->x64._f)); \
- else \
- memset(&(_p)->x32._f[0], (_v), sizeof((_p)->x32._f)); \
-} while (0)
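-
-/*
- * Illustrative usage (editor's sketch; 'dst', 'src' and the 'user_regs'
- * field are assumptions based on the width-dependent x32/x64 unions these
- * macros were written for):
- *
- *     MEMCPY_FIELD(dst, src, user_regs, dinfo->guest_width);
- *
- * expands to a memcpy() of either the x64 or the x32 variant of the
- * 'user_regs' field, depending on the guest width.
- */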
+++ /dev/null
-#include <assert.h>
-
-#include "xg_sr_common.h"
-
-#include <xen-tools/libs.h>
-
-static const char *const dhdr_types[] =
-{
- [DHDR_TYPE_X86_PV] = "x86 PV",
- [DHDR_TYPE_X86_HVM] = "x86 HVM",
-};
-
-const char *dhdr_type_to_str(uint32_t type)
-{
- if ( type < ARRAY_SIZE(dhdr_types) && dhdr_types[type] )
- return dhdr_types[type];
-
- return "Reserved";
-}
-
-static const char *const mandatory_rec_types[] =
-{
- [REC_TYPE_END] = "End",
- [REC_TYPE_PAGE_DATA] = "Page data",
- [REC_TYPE_X86_PV_INFO] = "x86 PV info",
- [REC_TYPE_X86_PV_P2M_FRAMES] = "x86 PV P2M frames",
- [REC_TYPE_X86_PV_VCPU_BASIC] = "x86 PV vcpu basic",
- [REC_TYPE_X86_PV_VCPU_EXTENDED] = "x86 PV vcpu extended",
- [REC_TYPE_X86_PV_VCPU_XSAVE] = "x86 PV vcpu xsave",
- [REC_TYPE_SHARED_INFO] = "Shared info",
- [REC_TYPE_X86_TSC_INFO] = "x86 TSC info",
- [REC_TYPE_HVM_CONTEXT] = "HVM context",
- [REC_TYPE_HVM_PARAMS] = "HVM params",
- [REC_TYPE_TOOLSTACK] = "Toolstack",
- [REC_TYPE_X86_PV_VCPU_MSRS] = "x86 PV vcpu msrs",
- [REC_TYPE_VERIFY] = "Verify",
- [REC_TYPE_CHECKPOINT] = "Checkpoint",
- [REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST] = "Checkpoint dirty pfn list",
- [REC_TYPE_STATIC_DATA_END] = "Static data end",
- [REC_TYPE_X86_CPUID_POLICY] = "x86 CPUID policy",
- [REC_TYPE_X86_MSR_POLICY] = "x86 MSR policy",
-};
-
-const char *rec_type_to_str(uint32_t type)
-{
- if ( !(type & REC_TYPE_OPTIONAL) )
- {
- if ( (type < ARRAY_SIZE(mandatory_rec_types)) &&
- (mandatory_rec_types[type]) )
- return mandatory_rec_types[type];
- }
-
- return "Reserved";
-}
-
-int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
- void *buf, size_t sz)
-{
- static const char zeroes[(1u << REC_ALIGN_ORDER) - 1] = { 0 };
-
- xc_interface *xch = ctx->xch;
- typeof(rec->length) combined_length = rec->length + sz;
- size_t record_length = ROUNDUP(combined_length, REC_ALIGN_ORDER);
- struct iovec parts[] = {
- { &rec->type, sizeof(rec->type) },
- { &combined_length, sizeof(combined_length) },
- { rec->data, rec->length },
- { buf, sz },
- { (void *)zeroes, record_length - combined_length },
- };
-
- if ( record_length > REC_LENGTH_MAX )
- {
- ERROR("Record (0x%08x, %s) length %#zx exceeds max (%#x)", rec->type,
- rec_type_to_str(rec->type), record_length, REC_LENGTH_MAX);
- return -1;
- }
-
- if ( rec->length )
- assert(rec->data);
- if ( sz )
- assert(buf);
-
- if ( writev_exact(ctx->fd, parts, ARRAY_SIZE(parts)) )
- goto err;
-
- return 0;
-
- err:
- PERROR("Unable to write record to stream");
- return -1;
-}
-
-int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rhdr rhdr;
- size_t datasz;
-
- if ( read_exact(fd, &rhdr, sizeof(rhdr)) )
- {
- PERROR("Failed to read Record Header from stream");
- return -1;
- }
-
- if ( rhdr.length > REC_LENGTH_MAX )
- {
- ERROR("Record (0x%08x, %s) length %#x exceeds max (%#x)", rhdr.type,
- rec_type_to_str(rhdr.type), rhdr.length, REC_LENGTH_MAX);
- return -1;
- }
-
- datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER);
-
- if ( datasz )
- {
- rec->data = malloc(datasz);
-
- if ( !rec->data )
- {
- ERROR("Unable to allocate %zu bytes for record data (0x%08x, %s)",
- datasz, rhdr.type, rec_type_to_str(rhdr.type));
- return -1;
- }
-
- if ( read_exact(fd, rec->data, datasz) )
- {
- free(rec->data);
- rec->data = NULL;
- PERROR("Failed to read %zu bytes of data for record (0x%08x, %s)",
- datasz, rhdr.type, rec_type_to_str(rhdr.type));
- return -1;
- }
- }
- else
- rec->data = NULL;
-
- rec->type = rhdr.type;
- rec->length = rhdr.length;
-
- return 0;
-};
-
-static void __attribute__((unused)) build_assertions(void)
-{
- BUILD_BUG_ON(sizeof(struct xc_sr_ihdr) != 24);
- BUILD_BUG_ON(sizeof(struct xc_sr_dhdr) != 16);
- BUILD_BUG_ON(sizeof(struct xc_sr_rhdr) != 8);
-
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_page_data_header) != 8);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_info) != 8);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_p2m_frames) != 8);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_pv_vcpu_hdr) != 8);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_x86_tsc_info) != 24);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params_entry) != 16);
- BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) != 8);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#ifndef __COMMON__H
-#define __COMMON__H
-
-#include <stdbool.h>
-
-#include "xg_private.h"
-#include "xg_save_restore.h"
-#include "xenctrl_dom.h"
-#include "xc_bitops.h"
-
-#include "xg_sr_stream_format.h"
-
-/* String representation of Domain Header types. */
-const char *dhdr_type_to_str(uint32_t type);
-
-/* String representation of Record types. */
-const char *rec_type_to_str(uint32_t type);
-
-struct xc_sr_context;
-struct xc_sr_record;
-
-/**
- * Save operations. To be implemented for each type of guest, for use by the
- * common save algorithm.
- *
- * Every function must be implemented, even if only with a no-op stub.
- */
-struct xc_sr_save_ops
-{
- /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
- xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
-
- /**
- * Optionally transform the contents of a page from being specific to the
- * sending environment, to being generic for the stream.
- *
- * The page of data at the end of 'page' may be a read-only mapping of a
- * running guest; it must not be modified. If no transformation is
-     * required, the callee should leave '*page' untouched.
- *
- * If a transformation is required, the callee should allocate themselves
- * a local page using malloc() and return it via '*page'.
- *
- * The caller shall free() '*page' in all cases. In the case that the
- * callee encounters an error, it should *NOT* free() the memory it
- * allocated for '*page'.
- *
- * It is valid to fail with EAGAIN if the transformation is not able to be
- * completed at this point. The page shall be retried later.
- *
- * @returns 0 for success, -1 for failure, with errno appropriately set.
- */
- int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
- void **page);
-
- /**
- * Set up local environment to save a domain. (Typically querying
- * running domain state, setting up mappings etc.)
- *
- * This is called once before any common setup has occurred, allowing for
- * guest-specific adjustments to be made to common state.
- */
- int (*setup)(struct xc_sr_context *ctx);
-
- /**
- * Send static records at the head of the stream. This is called once,
- * after the Image and Domain headers are written.
- */
- int (*static_data)(struct xc_sr_context *ctx);
-
- /**
- * Send dynamic records which need to be at the start of the stream. This
- * is called after the STATIC_DATA_END record is written.
- */
- int (*start_of_stream)(struct xc_sr_context *ctx);
-
- /**
- * Send records which need to be at the start of a checkpoint. This is
- * called once, or once per checkpoint in a checkpointed stream, and is
- * ahead of memory data.
- */
- int (*start_of_checkpoint)(struct xc_sr_context *ctx);
-
- /**
- * Send records which need to be at the end of the checkpoint. This is
- * called once, or once per checkpoint in a checkpointed stream, and is
- * after the memory data.
- */
- int (*end_of_checkpoint)(struct xc_sr_context *ctx);
-
- /**
- * Check state of guest to decide whether it makes sense to continue
- * migration. This is called in each iteration or checkpoint to check
-     * whether all criteria for the migration are still met. If that's not
-     * the case, either the migration is cancelled via a bad rc or the
-     * situation is handled, e.g. by sending appropriate records.
- */
- int (*check_vm_state)(struct xc_sr_context *ctx);
-
- /**
- * Clean up the local environment. Will be called exactly once, either
- * after a successful save, or upon encountering an error.
- */
- int (*cleanup)(struct xc_sr_context *ctx);
-};
-
-
-/**
- * Restore operations. To be implemented for each type of guest, for use by
- * the common restore algorithm.
- *
- * Every function must be implemented, even if only with a no-op stub.
- */
-struct xc_sr_restore_ops
-{
- /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
- xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
-
- /* Check to see whether a PFN is valid. */
- bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
-
- /* Set the GFN of a PFN. */
- void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
-
- /* Set the type of a PFN. */
- void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
- xen_pfn_t type);
-
- /**
- * Optionally transform the contents of a page from being generic in the
- * stream, to being specific to the restoring environment.
- *
- * 'page' is expected to be modified in-place if a transformation is
- * required.
- *
- * @returns 0 for success, -1 for failure, with errno appropriately set.
- */
- int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);
-
- /**
- * Set up local environment to restore a domain.
- *
- * This is called once before any common setup has occurred, allowing for
- * guest-specific adjustments to be made to common state.
- */
- int (*setup)(struct xc_sr_context *ctx);
-
- /**
- * Process an individual record from the stream. The caller shall take
- * care of processing common records (e.g. END, PAGE_DATA).
- *
- * @return 0 for success, -1 for failure, or the following sentinels:
- * - RECORD_NOT_PROCESSED
- * - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
- * a failover is needed.
- */
-#define RECORD_NOT_PROCESSED 1
-#define BROKEN_CHANNEL 2
- int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
-
- /**
- * Perform any actions required after the static data has arrived. Called
-     * when the STATIC_DATA_END record has been received/inferred.
-     * 'missing' should be filled in for any data item the higher level
-     * toolstack needs to provide compatibility for.
- */
- int (*static_data_complete)(struct xc_sr_context *ctx,
- unsigned int *missing);
-
- /**
- * Perform any actions required after the stream has been finished. Called
- * after the END record has been received.
- */
- int (*stream_complete)(struct xc_sr_context *ctx);
-
- /**
- * Clean up the local environment. Will be called exactly once, either
- * after a successful restore, or upon encountering an error.
- */
- int (*cleanup)(struct xc_sr_context *ctx);
-};
-
-/* Wrapper for blobs of data heading Xen-wards. */
-struct xc_sr_blob
-{
- void *ptr;
- size_t size;
-};
-
-/*
- * Update a blob. Duplicate src/size, freeing the old blob if necessary. May
- * fail due to memory allocation.
- */
-static inline int update_blob(struct xc_sr_blob *blob,
- const void *src, size_t size)
-{
- void *ptr;
-
- if ( !src || !size )
- {
- errno = EINVAL;
- return -1;
- }
-
- if ( (ptr = malloc(size)) == NULL )
- return -1;
-
- free(blob->ptr);
- blob->ptr = memcpy(ptr, src, size);
- blob->size = size;
-
- return 0;
-}
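-
-/*
- * Illustrative sketch (editor's note, not part of the original header): a
- * typical caller duplicates an incoming record's payload into a blob and
- * later frees it.  'rec' is assumed to be a fully read xc_sr_record:
- *
- *     struct xc_sr_blob blob = { NULL, 0 };
- *
- *     if ( update_blob(&blob, rec->data, rec->length) )
- *         return -1;      // errno set by update_blob()
- *     ...
- *     free(blob.ptr);     // the caller owns the duplicated buffer
- */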
-
-struct xc_sr_context
-{
- xc_interface *xch;
- uint32_t domid;
- int fd;
-
- /* Plain VM, or checkpoints over time. */
- xc_stream_type_t stream_type;
-
- xc_dominfo_t dominfo;
-
- union /* Common save or restore data. */
- {
- struct /* Save data. */
- {
- int recv_fd;
-
- struct xc_sr_save_ops ops;
- struct save_callbacks *callbacks;
-
- /* Live migrate vs non live suspend. */
- bool live;
-
- /* Further debugging information in the stream. */
- bool debug;
-
- unsigned long p2m_size;
-
- struct precopy_stats stats;
-
- xen_pfn_t *batch_pfns;
- unsigned int nr_batch_pfns;
- unsigned long *deferred_pages;
- unsigned long nr_deferred_pages;
- xc_hypercall_buffer_t dirty_bitmap_hbuf;
- } save;
-
- struct /* Restore data. */
- {
- struct xc_sr_restore_ops ops;
- struct restore_callbacks *callbacks;
-
- int send_back_fd;
- unsigned long p2m_size;
- xc_hypercall_buffer_t dirty_bitmap_hbuf;
-
- /* From Image Header. */
- uint32_t format_version;
-
- /* From Domain Header. */
- uint32_t guest_type;
- uint32_t guest_page_size;
-
-        /* Currently buffering records between checkpoints. */
- bool buffer_all_records;
-
- /* Whether a STATIC_DATA_END record has been seen/inferred. */
- bool seen_static_data_end;
-
-/*
- * With Remus/COLO, we buffer the records sent by the primary at a checkpoint
- * so that, should the primary fail, we can recover from the last complete
- * checkpoint state.
- * This should be enough for most cases, because the primary only sends
- * dirty pages at each checkpoint.
- */
-#define DEFAULT_BUF_RECORDS 1024
- struct xc_sr_record *buffered_records;
- unsigned int allocated_rec_num;
- unsigned int buffered_rec_num;
-
- /*
- * Xenstore and Console parameters.
- * INPUT: evtchn & domid
- * OUTPUT: gfn
- */
- xen_pfn_t xenstore_gfn, console_gfn;
- unsigned int xenstore_evtchn, console_evtchn;
- uint32_t xenstore_domid, console_domid;
-
- /* Bitmap of currently populated PFNs during restore. */
- unsigned long *populated_pfns;
- xen_pfn_t max_populated_pfn;
-
- /* Sender has invoked verify mode on the stream. */
- bool verify;
- } restore;
- };
-
- union /* Guest-arch specific data. */
- {
- struct /* x86 */
- {
- /* Common save/restore data. */
- union
- {
- struct
- {
- /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */
- struct xc_sr_blob cpuid, msr;
- } restore;
- };
-
- struct /* x86 PV guest. */
- {
- /* 4 or 8; 32 or 64 bit domain */
- unsigned int width;
- /* 3 or 4 pagetable levels */
- unsigned int levels;
-
- /* Maximum Xen frame */
- xen_pfn_t max_mfn;
- /* Read-only machine to phys map */
- xen_pfn_t *m2p;
- /* first mfn of the compat m2p (Only needed for 32bit PV guests) */
- xen_pfn_t compat_m2p_mfn0;
- /* Number of m2p frames mapped */
- unsigned long nr_m2p_frames;
-
- /* Maximum guest frame */
- xen_pfn_t max_pfn;
-
- /* Number of frames making up the p2m */
- unsigned int p2m_frames;
- /* Guest's phys to machine map. Mapped read-only (save) or
- * allocated locally (restore). Uses guest unsigned longs. */
- void *p2m;
- /* The guest pfns containing the p2m leaves */
- xen_pfn_t *p2m_pfns;
-
-            /* Read-only mapping of guest's shared info page */
- shared_info_any_t *shinfo;
-
- /* p2m generation count for verifying validity of local p2m. */
- uint64_t p2m_generation;
-
- union
- {
- struct
- {
- /* State machine for the order of received records. */
- bool seen_pv_info;
-
- /* Types for each page (bounded by max_pfn). */
- uint32_t *pfn_types;
-
- /* x86 PV per-vcpu storage structure for blobs. */
- struct xc_sr_x86_pv_restore_vcpu
- {
- struct xc_sr_blob basic, extd, xsave, msr;
- } *vcpus;
- unsigned int nr_vcpus;
- } restore;
- };
- } pv;
-
- struct /* x86 HVM guest. */
- {
- union
- {
- struct
- {
- /* Whether qemu enabled logdirty mode, and we should
- * disable on cleanup. */
- bool qemu_enabled_logdirty;
- } save;
-
- struct
- {
- /* HVM context blob. */
- struct xc_sr_blob context;
- } restore;
- };
- } hvm;
-
- } x86;
- };
-};
-
-extern struct xc_sr_save_ops save_ops_x86_pv;
-extern struct xc_sr_save_ops save_ops_x86_hvm;
-
-extern struct xc_sr_restore_ops restore_ops_x86_pv;
-extern struct xc_sr_restore_ops restore_ops_x86_hvm;
-
-struct xc_sr_record
-{
- uint32_t type;
- uint32_t length;
- void *data;
-};
-
-/*
- * Writes a split record to the stream, applying correct padding where
- * appropriate. It is common when sending records containing blobs from Xen
- * that the header and blob data are separate. This function accepts a second
- * buffer and length, and will merge it with the main record when sending.
- *
- * Records with a non-zero length must provide a valid data field; records
- * with a 0 length shall have their data field ignored.
- *
- * Returns 0 on success and non-0 on failure.
- */
-int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
- void *buf, size_t sz);
-
-/*
- * Writes a record to the stream, applying correct padding where appropriate.
- * Records with a non-zero length must provide a valid data field; records
- * with a 0 length shall have their data field ignored.
- *
- * Returns 0 on success and non-0 on failure.
- */
-static inline int write_record(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- return write_split_record(ctx, rec, NULL, 0);
-}
-
-/*
- * Reads a record from the stream, and fills in the record structure.
- *
- * Returns 0 on success and non-0 on failure.
- *
- * On success, the record's type and size shall be valid.
- * - If size is 0, data shall be NULL.
- * - If size is non-0, data shall be a buffer allocated by malloc() which must
- * be passed to free() by the caller.
- *
- * On failure, the contents of the record structure are undefined.
- */
-int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
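-
-/*
- * Illustrative sketch (editor's note): the minimal shape of a consumer loop
- * built on read_record(), mirroring the main loop in restore():
- *
- *     struct xc_sr_record rec;
- *     int rc;
- *
- *     do {
- *         if ( read_record(ctx, ctx->fd, &rec) )
- *             return -1;
- *         rc = process_record(ctx, &rec);  // consumes and free()s rec.data
- *     } while ( !rc && rec.type != REC_TYPE_END );
- */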
-
-/*
- * This would ideally be private in restore.c, but is needed by
- * x86_pv_localise_page() if we receive pagetable frames ahead of the
- * contents of the frames they point at.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
- const xen_pfn_t *original_pfns, const uint32_t *types);
-
-/* Handle a STATIC_DATA_END record. */
-int handle_static_data_end(struct xc_sr_context *ctx);
-
-#endif
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include "xg_sr_common_x86.h"
-
-int write_x86_tsc_info(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_tsc_info tsc = {};
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_TSC_INFO,
- .length = sizeof(tsc),
- .data = &tsc,
- };
-
- if ( xc_domain_get_tsc_info(xch, ctx->domid, &tsc.mode,
- &tsc.nsec, &tsc.khz, &tsc.incarnation) < 0 )
- {
- PERROR("Unable to obtain TSC information");
- return -1;
- }
-
- return write_record(ctx, &rec);
-}
-
-int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_tsc_info *tsc = rec->data;
-
- if ( rec->length != sizeof(*tsc) )
- {
- ERROR("X86_TSC_INFO record wrong size: length %u, expected %zu",
- rec->length, sizeof(*tsc));
- return -1;
- }
-
- if ( xc_domain_set_tsc_info(xch, ctx->domid, tsc->mode,
- tsc->nsec, tsc->khz, tsc->incarnation) )
- {
- PERROR("Unable to set TSC information");
- return -1;
- }
-
- return 0;
-}
-
-int write_x86_cpu_policy_records(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_record cpuid = { .type = REC_TYPE_X86_CPUID_POLICY, };
- struct xc_sr_record msrs = { .type = REC_TYPE_X86_MSR_POLICY, };
- uint32_t nr_leaves = 0, nr_msrs = 0;
- int rc;
-
- if ( xc_get_cpu_policy_size(xch, &nr_leaves, &nr_msrs) < 0 )
- {
- PERROR("Unable to get CPU Policy size");
- return -1;
- }
-
- cpuid.data = malloc(nr_leaves * sizeof(xen_cpuid_leaf_t));
- msrs.data = malloc(nr_msrs * sizeof(xen_msr_entry_t));
- if ( !cpuid.data || !msrs.data )
- {
- ERROR("Cannot allocate memory for CPU Policy");
- rc = -1;
- goto out;
- }
-
- if ( xc_get_domain_cpu_policy(xch, ctx->domid, &nr_leaves, cpuid.data,
- &nr_msrs, msrs.data) )
- {
- PERROR("Unable to get d%d CPU Policy", ctx->domid);
- rc = -1;
- goto out;
- }
-
- cpuid.length = nr_leaves * sizeof(xen_cpuid_leaf_t);
- if ( cpuid.length )
- {
- rc = write_record(ctx, &cpuid);
- if ( rc )
- goto out;
- }
-
- msrs.length = nr_msrs * sizeof(xen_msr_entry_t);
- if ( msrs.length )
- rc = write_record(ctx, &msrs);
-
- out:
- free(cpuid.data);
- free(msrs.data);
-
- return rc;
-}
-
-int handle_x86_cpuid_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- if ( rec->length == 0 ||
- rec->length % sizeof(xen_cpuid_leaf_t) != 0 )
- {
- ERROR("X86_CPUID_POLICY size %u should be multiple of %zu",
- rec->length, sizeof(xen_cpuid_leaf_t));
- return -1;
- }
-
- rc = update_blob(&ctx->x86.restore.cpuid, rec->data, rec->length);
- if ( rc )
- ERROR("Unable to allocate %u bytes for X86_CPUID_POLICY", rec->length);
-
- return rc;
-}
-
-int handle_x86_msr_policy(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- if ( rec->length == 0 ||
- rec->length % sizeof(xen_msr_entry_t) != 0 )
- {
- ERROR("X86_MSR_POLICY size %u should be multiple of %zu",
-              rec->length, sizeof(xen_msr_entry_t));
- return -1;
- }
-
- rc = update_blob(&ctx->x86.restore.msr, rec->data, rec->length);
- if ( rc )
- ERROR("Unable to allocate %u bytes for X86_MSR_POLICY", rec->length);
-
- return rc;
-}
-
-int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing)
-{
- xc_interface *xch = ctx->xch;
- uint32_t nr_leaves = 0, nr_msrs = 0;
- uint32_t err_l = ~0, err_s = ~0, err_m = ~0;
-
- if ( ctx->x86.restore.cpuid.ptr )
- nr_leaves = ctx->x86.restore.cpuid.size / sizeof(xen_cpuid_leaf_t);
- else
- *missing |= XGR_SDD_MISSING_CPUID;
-
- if ( ctx->x86.restore.msr.ptr )
- nr_msrs = ctx->x86.restore.msr.size / sizeof(xen_msr_entry_t);
- else
- *missing |= XGR_SDD_MISSING_MSR;
-
- if ( (nr_leaves || nr_msrs) &&
- xc_set_domain_cpu_policy(xch, ctx->domid,
- nr_leaves, ctx->x86.restore.cpuid.ptr,
- nr_msrs, ctx->x86.restore.msr.ptr,
- &err_l, &err_s, &err_m) )
- {
- PERROR("Failed to set CPUID policy: leaf %08x, subleaf %08x, msr %08x",
- err_l, err_s, err_m);
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#ifndef __COMMON_X86__H
-#define __COMMON_X86__H
-
-#include "xg_sr_common.h"
-
-/*
- * Obtains a domain's TSC information from Xen and writes an X86_TSC_INFO record
- * into the stream.
- */
-int write_x86_tsc_info(struct xc_sr_context *ctx);
-
-/*
- * Parses an X86_TSC_INFO record and applies the result to the domain.
- */
-int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec);
-
-/*
- * Obtains a domain's CPU Policy from Xen, and writes X86_{CPUID,MSR}_POLICY
- * records into the stream.
- */
-int write_x86_cpu_policy_records(struct xc_sr_context *ctx);
-
-/*
- * Parses an X86_CPUID_POLICY record and stashes the content for application
- * when a STATIC_DATA_END record is encountered.
- */
-int handle_x86_cpuid_policy(struct xc_sr_context *ctx,
- struct xc_sr_record *rec);
-
-/*
- * Parses an X86_MSR_POLICY record and stashes the content for application
- * when a STATIC_DATA_END record is encountered.
- */
-int handle_x86_msr_policy(struct xc_sr_context *ctx,
- struct xc_sr_record *rec);
-
-/*
- * Perform common x86 actions required after the static data has arrived.
- */
-int x86_static_data_complete(struct xc_sr_context *ctx, unsigned int *missing);
-
-#endif
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-
-#include "xg_sr_common_x86_pv.h"
-
-xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn)
-{
- assert(mfn <= ctx->x86.pv.max_mfn);
- return ctx->x86.pv.m2p[mfn];
-}
-
-bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn)
-{
- return ((mfn <= ctx->x86.pv.max_mfn) &&
- (mfn_to_pfn(ctx, mfn) <= ctx->x86.pv.max_pfn) &&
- (xc_pfn_to_mfn(mfn_to_pfn(ctx, mfn), ctx->x86.pv.p2m,
- ctx->x86.pv.width) == mfn));
-}
-
-void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t pfn = ~0UL;
-
- ERROR("mfn %#lx, max %#lx", mfn, ctx->x86.pv.max_mfn);
-
- if ( (mfn != ~0UL) && (mfn <= ctx->x86.pv.max_mfn) )
- {
- pfn = ctx->x86.pv.m2p[mfn];
- ERROR(" m2p[%#lx] = %#lx, max_pfn %#lx",
- mfn, pfn, ctx->x86.pv.max_pfn);
- }
-
- if ( (pfn != ~0UL) && (pfn <= ctx->x86.pv.max_pfn) )
- ERROR(" p2m[%#lx] = %#lx",
- pfn, xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width));
-}
-
-xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3)
-{
- if ( ctx->x86.pv.width == 8 )
- return cr3 >> 12;
- else
- {
- /* 32bit guests can't represent mfns wider than 32 bits */
- if ( cr3 & 0xffffffff00000000UL )
- return ~0UL;
- else
- return (uint32_t)((cr3 >> 12) | (cr3 << 20));
- }
-}
-
-uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t _mfn)
-{
- uint64_t mfn = _mfn;
-
- if ( ctx->x86.pv.width == 8 )
- return mfn << 12;
- else
- {
- /* 32bit guests can't represent mfns wider than 32 bits */
- if ( mfn & 0xffffffff00000000UL )
- return ~0UL;
- else
- return (uint32_t)((mfn << 12) | (mfn >> 20));
- }
-}
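-
-/*
- * Worked example of the extended-cr3 rotation (editor's illustration): for
- * a 32bit guest, mfn_to_cr3() packs mfn 0x12345678, which does not fit in
- * cr3[31:12] directly, as (0x12345678 << 12) | (0x12345678 >> 20) truncated
- * to 32 bits, i.e. 0x45678123: the low 20 mfn bits land in cr3[31:12] and
- * the high 12 bits wrap into cr3[11:0].  cr3_to_mfn() applies the inverse
- * rotation.
- */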
-
-int x86_pv_domain_info(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- unsigned int guest_width, guest_levels;
-
- /* Get the domain width */
- if ( xc_domain_get_guest_width(xch, ctx->domid, &guest_width) )
- {
- PERROR("Unable to determine dom%d's width", ctx->domid);
- return -1;
- }
-
- if ( guest_width == 4 )
- guest_levels = 3;
- else if ( guest_width == 8 )
- guest_levels = 4;
- else
- {
- ERROR("Invalid guest width %d. Expected 32 or 64", guest_width * 8);
- return -1;
- }
- ctx->x86.pv.width = guest_width;
- ctx->x86.pv.levels = guest_levels;
-
- DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels);
-
- return 0;
-}
-
-int x86_pv_map_m2p(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t m2p_chunks, m2p_size, max_page;
- privcmd_mmap_entry_t *entries = NULL;
- xen_pfn_t *extents_start = NULL;
- int rc = -1, i;
-
- if ( xc_maximum_ram_page(xch, &max_page) < 0 )
- {
- PERROR("Failed to get maximum ram page");
- goto err;
- }
-
- ctx->x86.pv.max_mfn = max_page;
- m2p_size = M2P_SIZE(ctx->x86.pv.max_mfn);
- m2p_chunks = M2P_CHUNKS(ctx->x86.pv.max_mfn);
-
- extents_start = malloc(m2p_chunks * sizeof(xen_pfn_t));
- if ( !extents_start )
- {
- ERROR("Unable to allocate %lu bytes for m2p mfns",
- m2p_chunks * sizeof(xen_pfn_t));
- goto err;
- }
-
- if ( xc_machphys_mfn_list(xch, m2p_chunks, extents_start) )
- {
- PERROR("Failed to get m2p mfn list");
- goto err;
- }
-
- entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t));
- if ( !entries )
- {
- ERROR("Unable to allocate %lu bytes for m2p mapping mfns",
- m2p_chunks * sizeof(privcmd_mmap_entry_t));
- goto err;
- }
-
- for ( i = 0; i < m2p_chunks; ++i )
- entries[i].mfn = extents_start[i];
-
- ctx->x86.pv.m2p = xc_map_foreign_ranges(
- xch, DOMID_XEN, m2p_size, PROT_READ,
- M2P_CHUNK_SIZE, entries, m2p_chunks);
-
- if ( !ctx->x86.pv.m2p )
- {
- PERROR("Failed to mmap() m2p ranges");
- goto err;
- }
-
- ctx->x86.pv.nr_m2p_frames = (M2P_CHUNK_SIZE >> PAGE_SHIFT) * m2p_chunks;
-
-#ifdef __i386__
- /* 32 bit toolstacks automatically get the compat m2p */
- ctx->x86.pv.compat_m2p_mfn0 = entries[0].mfn;
-#else
- /* 64 bit toolstacks need to ask Xen specially for it */
- {
- struct xen_machphys_mfn_list xmml = {
- .max_extents = 1,
- .extent_start = { &ctx->x86.pv.compat_m2p_mfn0 },
- };
-
- rc = do_memory_op(xch, XENMEM_machphys_compat_mfn_list,
- &xmml, sizeof(xmml));
- if ( rc || xmml.nr_extents != 1 )
- {
- PERROR("Failed to get compat mfn list from Xen");
- rc = -1;
- goto err;
- }
- }
-#endif
-
- /* All Done */
- rc = 0;
- DPRINTF("max_mfn %#lx", ctx->x86.pv.max_mfn);
-
- err:
- free(entries);
- free(extents_start);
-
- return rc;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#ifndef __COMMON_X86_PV_H
-#define __COMMON_X86_PV_H
-
-#include "xg_sr_common_x86.h"
-
-/* Virtual address ranges reserved for hypervisor. */
-#define HYPERVISOR_VIRT_START_X86_64 0xFFFF800000000000ULL
-#define HYPERVISOR_VIRT_END_X86_64 0xFFFF87FFFFFFFFFFULL
-
-#define HYPERVISOR_VIRT_START_X86_32 0x00000000F5800000ULL
-#define HYPERVISOR_VIRT_END_X86_32 0x00000000FFFFFFFFULL
-
-/*
- * Convert an mfn to a pfn, given Xen's m2p table.
- *
- * Caller must ensure that the requested mfn is in range.
- */
-xen_pfn_t mfn_to_pfn(struct xc_sr_context *ctx, xen_pfn_t mfn);
-
-/*
- * Query whether a particular mfn is valid in the physmap of a guest.
- */
-bool mfn_in_pseudophysmap(struct xc_sr_context *ctx, xen_pfn_t mfn);
-
-/*
- * Debug a particular mfn by walking the p2m and m2p.
- */
-void dump_bad_pseudophysmap_entry(struct xc_sr_context *ctx, xen_pfn_t mfn);
-
-/*
- * Convert a PV cr3 field to an mfn.
- *
- * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
- * a 32bit architectural cr3.
- */
-xen_pfn_t cr3_to_mfn(struct xc_sr_context *ctx, uint64_t cr3);
-
-/*
- * Convert an mfn to a PV cr3 field.
- *
- * Adjusts for Xen's extended-cr3 format to pack a 44bit physical address into
- * a 32bit architectural cr3.
- */
-uint64_t mfn_to_cr3(struct xc_sr_context *ctx, xen_pfn_t mfn);
-
-/* Bits 12 through 51 of a PTE point at the frame */
-#define PTE_FRAME_MASK 0x000ffffffffff000ULL
-
-/*
- * Extract an mfn from a Pagetable Entry. May return INVALID_MFN if the pte
- * would overflow a 32bit xen_pfn_t.
- */
-static inline xen_pfn_t pte_to_frame(uint64_t pte)
-{
- uint64_t frame = (pte & PTE_FRAME_MASK) >> PAGE_SHIFT;
-
-#ifdef __i386__
- if ( frame >= INVALID_MFN )
- return INVALID_MFN;
-#endif
-
- return frame;
-}
-
-/*
- * Change the frame in a Pagetable Entry while leaving the flags alone.
- */
-static inline uint64_t merge_pte(uint64_t pte, xen_pfn_t mfn)
-{
- return (pte & ~PTE_FRAME_MASK) | ((uint64_t)mfn << PAGE_SHIFT);
-}
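-
-/*
- * Worked example (editor's illustration): for pte 0x000000012345f067
- * (frame 0x12345f, flags PRESENT|RW|USER|ACCESSED|DIRTY), pte_to_frame()
- * yields 0x12345f, and merge_pte(pte, 0xabcdef) yields 0x0000000abcdef067:
- * the frame is replaced while the low flag bits are preserved.
- */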
-
-/*
- * Get current domain information.
- *
- * Fills ctx->x86.pv
- * - .width
- * - .levels
- *
- * Used by the save side to create the X86_PV_INFO record, and by the restore
- * side to verify the incoming stream.
- *
- * Returns 0 on success and non-zero on error.
- */
-int x86_pv_domain_info(struct xc_sr_context *ctx);
-
-/*
- * Maps the Xen M2P.
- *
- * Fills ctx->x86.pv.
- * - .max_mfn
- * - .m2p
- *
- * Returns 0 on success and non-zero on error.
- */
-int x86_pv_map_m2p(struct xc_sr_context *ctx);
-
-#endif
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <arpa/inet.h>
-
-#include <assert.h>
-
-#include "xg_sr_common.h"
-
-/*
- * Read and validate the Image and Domain headers.
- */
-static int read_headers(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_ihdr ihdr;
- struct xc_sr_dhdr dhdr;
-
- if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
- {
- PERROR("Failed to read Image Header from stream");
- return -1;
- }
-
- ihdr.id = ntohl(ihdr.id);
- ihdr.version = ntohl(ihdr.version);
- ihdr.options = ntohs(ihdr.options);
-
- if ( ihdr.marker != IHDR_MARKER )
- {
- ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
- return -1;
- }
-
- if ( ihdr.id != IHDR_ID )
- {
- ERROR("Invalid ID: Expected 0x%08x, Got 0x%08x", IHDR_ID, ihdr.id);
- return -1;
- }
-
- if ( ihdr.version < 2 || ihdr.version > 3 )
- {
- ERROR("Invalid Version: Expected 2 <= ver <= 3, Got %d",
- ihdr.version);
- return -1;
- }
-
- if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
- {
- ERROR("Unable to handle big endian streams");
- return -1;
- }
-
- ctx->restore.format_version = ihdr.version;
-
- if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
- {
- PERROR("Failed to read Domain Header from stream");
- return -1;
- }
-
- ctx->restore.guest_type = dhdr.type;
- ctx->restore.guest_page_size = (1U << dhdr.page_shift);
-
- if ( dhdr.xen_major == 0 )
- {
- IPRINTF("Found %s domain, converted from legacy stream format",
- dhdr_type_to_str(dhdr.type));
- DPRINTF(" Legacy conversion script version %u", dhdr.xen_minor);
- }
- else
- IPRINTF("Found %s domain from Xen %u.%u",
- dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
- return 0;
-}
-
-/*
- * Is a pfn populated?
- */
-static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- if ( pfn > ctx->restore.max_populated_pfn )
- return false;
- return test_bit(pfn, ctx->restore.populated_pfns);
-}
-
-/*
- * Set a pfn as populated, expanding the tracking structures if needed. To
- * avoid realloc()ing excessively, the size is increased to the nearest power
- * of two large enough to contain the required pfn.
- */
-static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- xc_interface *xch = ctx->xch;
-
- if ( pfn > ctx->restore.max_populated_pfn )
- {
- xen_pfn_t new_max;
- size_t old_sz, new_sz;
- unsigned long *p;
-
- /* Round up to the nearest power of two larger than pfn, less 1. */
- new_max = pfn;
- new_max |= new_max >> 1;
- new_max |= new_max >> 2;
- new_max |= new_max >> 4;
- new_max |= new_max >> 8;
- new_max |= new_max >> 16;
-#ifdef __x86_64__
- new_max |= new_max >> 32;
-#endif
-
- old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
- new_sz = bitmap_size(new_max + 1);
- p = realloc(ctx->restore.populated_pfns, new_sz);
- if ( !p )
- {
- ERROR("Failed to realloc populated bitmap");
- errno = ENOMEM;
- return -1;
- }
-
- memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
-
- ctx->restore.populated_pfns = p;
- ctx->restore.max_populated_pfn = new_max;
- }
-
- assert(!test_bit(pfn, ctx->restore.populated_pfns));
- set_bit(pfn, ctx->restore.populated_pfns);
-
- return 0;
-}
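-
-/*
- * Worked example of the round-up above (editor's illustration): for
- * pfn 0x12345 the or-shift cascade smears the top set bit downwards,
- * giving new_max = 0x1ffff, so the bitmap is resized to track 0x20000
- * pfns -- the nearest power of two able to contain the requested pfn.
- */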
-
-/*
- * Given a set of pfns, obtain memory from Xen to fill the physmap for the
- * unpopulated subset. If types is NULL, no page type checking is performed
- * and all unpopulated pfns are populated.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
- const xen_pfn_t *original_pfns, const uint32_t *types)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
- *pfns = malloc(count * sizeof(*pfns));
- unsigned int i, nr_pfns = 0;
- int rc = -1;
-
- if ( !mfns || !pfns )
- {
- ERROR("Failed to allocate %zu bytes for populating the physmap",
- 2 * count * sizeof(*mfns));
- goto err;
- }
-
- for ( i = 0; i < count; ++i )
- {
-        if ( (!types ||
-              (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
-               types[i] != XEN_DOMCTL_PFINFO_BROKEN)) &&
-             !pfn_is_populated(ctx, original_pfns[i]) )
- {
- rc = pfn_set_populated(ctx, original_pfns[i]);
- if ( rc )
- goto err;
- pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
- ++nr_pfns;
- }
- }
-
- if ( nr_pfns )
- {
- rc = xc_domain_populate_physmap_exact(
- xch, ctx->domid, nr_pfns, 0, 0, mfns);
- if ( rc )
- {
- PERROR("Failed to populate physmap");
- goto err;
- }
-
- for ( i = 0; i < nr_pfns; ++i )
- {
- if ( mfns[i] == INVALID_MFN )
- {
- ERROR("Populate physmap failed for pfn %u", i);
- rc = -1;
- goto err;
- }
-
- ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
- }
- }
-
- rc = 0;
-
- err:
- free(pfns);
- free(mfns);
-
- return rc;
-}
-
-/*
- * Given a list of pfns, their types, and a block of page data from the
- * stream, populate and record their types, map the relevant subset and copy
- * the data into the guest.
- */
-static int process_page_data(struct xc_sr_context *ctx, unsigned int count,
- xen_pfn_t *pfns, uint32_t *types, void *page_data)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
- int *map_errs = malloc(count * sizeof(*map_errs));
- int rc;
- void *mapping = NULL, *guest_page = NULL;
- unsigned int i, /* i indexes the pfns from the record. */
- j, /* j indexes the subset of pfns we decide to map. */
- nr_pages = 0;
-
- if ( !mfns || !map_errs )
- {
- rc = -1;
- ERROR("Failed to allocate %zu bytes to process page data",
- count * (sizeof(*mfns) + sizeof(*map_errs)));
- goto err;
- }
-
- rc = populate_pfns(ctx, count, pfns, types);
- if ( rc )
- {
- ERROR("Failed to populate pfns for batch of %u pages", count);
- goto err;
- }
-
- for ( i = 0; i < count; ++i )
- {
- ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
-
- switch ( types[i] )
- {
- case XEN_DOMCTL_PFINFO_NOTAB:
-
- case XEN_DOMCTL_PFINFO_L1TAB:
- case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
- case XEN_DOMCTL_PFINFO_L2TAB:
- case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
- case XEN_DOMCTL_PFINFO_L3TAB:
- case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
- case XEN_DOMCTL_PFINFO_L4TAB:
- case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
-
- mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
- break;
- }
- }
-
- /* Nothing to do? */
- if ( nr_pages == 0 )
- goto done;
-
- mapping = guest_page = xenforeignmemory_map(
- xch->fmem, ctx->domid, PROT_READ | PROT_WRITE,
- nr_pages, mfns, map_errs);
- if ( !mapping )
- {
- rc = -1;
- PERROR("Unable to map %u mfns for %u pages of data",
- nr_pages, count);
- goto err;
- }
-
- for ( i = 0, j = 0; i < count; ++i )
- {
- switch ( types[i] )
- {
- case XEN_DOMCTL_PFINFO_XTAB:
- case XEN_DOMCTL_PFINFO_BROKEN:
- case XEN_DOMCTL_PFINFO_XALLOC:
- /* No page data to deal with. */
- continue;
- }
-
- if ( map_errs[j] )
- {
- rc = -1;
- ERROR("Mapping pfn %#"PRIpfn" (mfn %#"PRIpfn", type %#"PRIx32") failed with %d",
- pfns[i], mfns[j], types[i], map_errs[j]);
- goto err;
- }
-
- /* Undo page normalisation done by the saver. */
- rc = ctx->restore.ops.localise_page(ctx, types[i], page_data);
- if ( rc )
- {
- ERROR("Failed to localise pfn %#"PRIpfn" (type %#"PRIx32")",
- pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto err;
- }
-
- if ( ctx->restore.verify )
- {
- /* Verify mode - compare incoming data to what we already have. */
- if ( memcmp(guest_page, page_data, PAGE_SIZE) )
- ERROR("verify pfn %#"PRIpfn" failed (type %#"PRIx32")",
- pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- }
- else
- {
- /* Regular mode - copy incoming data into place. */
- memcpy(guest_page, page_data, PAGE_SIZE);
- }
-
- ++j;
- guest_page += PAGE_SIZE;
- page_data += PAGE_SIZE;
- }
-
- done:
- rc = 0;
-
- err:
- if ( mapping )
- xenforeignmemory_unmap(xch->fmem, mapping, nr_pages);
-
- free(map_errs);
- free(mfns);
-
- return rc;
-}
-
-/*
- * Validate a PAGE_DATA record from the stream, and pass the results to
- * process_page_data() to actually perform the legwork.
- */
-static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_page_data_header *pages = rec->data;
- unsigned int i, pages_of_data = 0;
- int rc = -1;
-
- xen_pfn_t *pfns = NULL, pfn;
- uint32_t *types = NULL, type;
-
- /*
- * v2 compatibility only exists for x86 streams. This is a bit of a
- * bodge, but it is less bad than duplicating handle_page_data() between
- * different architectures.
- */
-#if defined(__i386__) || defined(__x86_64__)
- /* v2 compat. Infer the position of STATIC_DATA_END. */
- if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
- {
- rc = handle_static_data_end(ctx);
- if ( rc )
- {
- ERROR("Inferred STATIC_DATA_END record failed");
- goto err;
- }
- rc = -1;
- }
-
- if ( !ctx->restore.seen_static_data_end )
- {
- ERROR("No STATIC_DATA_END seen");
- goto err;
- }
-#endif
-
- if ( rec->length < sizeof(*pages) )
- {
- ERROR("PAGE_DATA record truncated: length %u, min %zu",
- rec->length, sizeof(*pages));
- goto err;
- }
-
- if ( pages->count < 1 )
- {
- ERROR("Expected at least 1 pfn in PAGE_DATA record");
- goto err;
- }
-
- if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) )
- {
- ERROR("PAGE_DATA record (length %u) too short to contain %u"
- " pfns worth of information", rec->length, pages->count);
- goto err;
- }
-
- pfns = malloc(pages->count * sizeof(*pfns));
- types = malloc(pages->count * sizeof(*types));
- if ( !pfns || !types )
- {
- ERROR("Unable to allocate enough memory for %u pfns",
- pages->count);
- goto err;
- }
-
- for ( i = 0; i < pages->count; ++i )
- {
- pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
- if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) )
- {
- ERROR("pfn %#"PRIpfn" (index %u) outside domain maximum", pfn, i);
- goto err;
- }
-
- type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32;
- if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
- ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )
- {
- ERROR("Invalid type %#"PRIx32" for pfn %#"PRIpfn" (index %u)",
- type, pfn, i);
- goto err;
- }
-
- if ( type < XEN_DOMCTL_PFINFO_BROKEN )
- /* NOTAB and all L1 through L4 tables (including pinned) should
- * have a page worth of data in the record. */
- pages_of_data++;
-
- pfns[i] = pfn;
- types[i] = type;
- }
-
- if ( rec->length != (sizeof(*pages) +
- (sizeof(uint64_t) * pages->count) +
- (PAGE_SIZE * pages_of_data)) )
- {
- ERROR("PAGE_DATA record wrong size: length %u, expected "
- "%zu + %zu + %lu", rec->length, sizeof(*pages),
- (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
- goto err;
- }
-
- rc = process_page_data(ctx, pages->count, pfns, types,
- &pages->pfn[pages->count]);
- err:
- free(types);
- free(pfns);
-
- return rc;
-}
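-
-/*
- * Worked example of the length check above (editor's illustration): a
- * PAGE_DATA record carrying 3 NOTAB pfns on a 4k page system must be
- * exactly 8 (header) + 3 * 8 (pfn array) + 3 * 4096 (page data) = 12320
- * bytes long.
- */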
-
-/*
- * Send checkpoint dirty pfn list to primary.
- */
-static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc = -1;
- unsigned int count, written;
- uint64_t i, *pfns = NULL;
- struct iovec *iov = NULL;
- xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
- struct xc_sr_record rec = {
- .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
- };
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->restore.dirty_bitmap_hbuf);
-
- if ( xc_shadow_control(
- xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
- NULL, 0, &stats) != ctx->restore.p2m_size )
- {
- PERROR("Failed to retrieve logdirty bitmap");
- goto err;
- }
-
- for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
- {
- if ( test_bit(i, dirty_bitmap) )
- count++;
- }
-
- pfns = malloc(count * sizeof(*pfns));
- if ( !pfns )
- {
- ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
- count * sizeof(*pfns));
- goto err;
- }
-
- for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
- {
- if ( !test_bit(i, dirty_bitmap) )
- continue;
-
-        if ( written >= count )
-        {
-            ERROR("Dirty pfn list exceeds allocated space");
- goto err;
- }
-
- pfns[written++] = i;
- }
-
- /* iovec[] for writev(). */
- iov = malloc(3 * sizeof(*iov));
- if ( !iov )
- {
- ERROR("Unable to allocate memory for sending dirty bitmap");
- goto err;
- }
-
- rec.length = count * sizeof(*pfns);
-
- iov[0].iov_base = &rec.type;
- iov[0].iov_len = sizeof(rec.type);
-
- iov[1].iov_base = &rec.length;
- iov[1].iov_len = sizeof(rec.length);
-
- iov[2].iov_base = pfns;
- iov[2].iov_len = count * sizeof(*pfns);
-
- if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
- {
- PERROR("Failed to write dirty bitmap to stream");
- goto err;
- }
-
- rc = 0;
- err:
- free(pfns);
- free(iov);
- return rc;
-}
-
-static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
-static int handle_checkpoint(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc = 0, ret;
- unsigned int i;
-
- if ( ctx->stream_type == XC_STREAM_PLAIN )
- {
- ERROR("Found checkpoint in non-checkpointed stream");
- rc = -1;
- goto err;
- }
-
- ret = ctx->restore.callbacks->checkpoint(ctx->restore.callbacks->data);
- switch ( ret )
- {
- case XGR_CHECKPOINT_SUCCESS:
- break;
-
- case XGR_CHECKPOINT_FAILOVER:
- if ( ctx->restore.buffer_all_records )
- rc = BROKEN_CHANNEL;
- else
- /* We don't have a consistent state */
- rc = -1;
- goto err;
-
- default: /* Other fatal error */
- rc = -1;
- goto err;
- }
-
- if ( ctx->restore.buffer_all_records )
- {
- IPRINTF("All records buffered");
-
- for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
- {
- rc = process_record(ctx, &ctx->restore.buffered_records[i]);
- if ( rc )
- goto err;
- }
- ctx->restore.buffered_rec_num = 0;
- IPRINTF("All records processed");
- }
- else
- ctx->restore.buffer_all_records = true;
-
- if ( ctx->stream_type == XC_STREAM_COLO )
- {
-#define HANDLE_CALLBACK_RETURN_VALUE(ret) \
- do { \
- if ( ret == 1 ) \
- rc = 0; /* Success */ \
- else \
- { \
- if ( ret == 2 ) \
- rc = BROKEN_CHANNEL; \
- else \
- rc = -1; /* Some unspecified error */ \
- goto err; \
- } \
- } while (0)
-
- /* COLO */
-
- /* We need to resume guest */
- rc = ctx->restore.ops.stream_complete(ctx);
- if ( rc )
- goto err;
-
- ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
- ctx->restore.console_gfn,
- ctx->restore.callbacks->data);
-
- /* Resume secondary vm */
- ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
- HANDLE_CALLBACK_RETURN_VALUE(ret);
-
- /* Wait for a new checkpoint */
- ret = ctx->restore.callbacks->wait_checkpoint(
- ctx->restore.callbacks->data);
- HANDLE_CALLBACK_RETURN_VALUE(ret);
-
- /* suspend secondary vm */
- ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
- HANDLE_CALLBACK_RETURN_VALUE(ret);
-
-#undef HANDLE_CALLBACK_RETURN_VALUE
-
- rc = send_checkpoint_dirty_pfn_list(ctx);
- if ( rc )
- goto err;
- }
-
- err:
- return rc;
-}
-
-static int buffer_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- unsigned int new_alloc_num;
- struct xc_sr_record *p;
-
- if ( ctx->restore.buffered_rec_num >= ctx->restore.allocated_rec_num )
- {
- new_alloc_num = ctx->restore.allocated_rec_num + DEFAULT_BUF_RECORDS;
- p = realloc(ctx->restore.buffered_records,
- new_alloc_num * sizeof(struct xc_sr_record));
- if ( !p )
- {
- ERROR("Failed to realloc memory for buffered records");
- return -1;
- }
-
- ctx->restore.buffered_records = p;
- ctx->restore.allocated_rec_num = new_alloc_num;
- }
-
- memcpy(&ctx->restore.buffered_records[ctx->restore.buffered_rec_num++],
- rec, sizeof(*rec));
-
- return 0;
-}
-
-int handle_static_data_end(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- unsigned int missing = 0;
- int rc = 0;
-
- if ( ctx->restore.seen_static_data_end )
- {
- ERROR("Multiple STATIC_DATA_END records found");
- return -1;
- }
-
- ctx->restore.seen_static_data_end = true;
-
- rc = ctx->restore.ops.static_data_complete(ctx, &missing);
- if ( rc )
- return rc;
-
-    if ( ctx->restore.callbacks->static_data_done &&
-         (rc = ctx->restore.callbacks->static_data_done(
-             missing, ctx->restore.callbacks->data)) != 0 )
-        ERROR("static_data_done() callback failed: %d", rc);
-
- return rc;
-}
-
-static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- int rc = 0;
-
- switch ( rec->type )
- {
- case REC_TYPE_END:
- break;
-
- case REC_TYPE_PAGE_DATA:
- rc = handle_page_data(ctx, rec);
- break;
-
- case REC_TYPE_VERIFY:
- DPRINTF("Verify mode enabled");
- ctx->restore.verify = true;
- break;
-
- case REC_TYPE_CHECKPOINT:
- rc = handle_checkpoint(ctx);
- break;
-
- case REC_TYPE_STATIC_DATA_END:
- rc = handle_static_data_end(ctx);
- break;
-
- default:
- rc = ctx->restore.ops.process_record(ctx, rec);
- break;
- }
-
- free(rec->data);
- rec->data = NULL;
-
- return rc;
-}
-
-static int setup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->restore.dirty_bitmap_hbuf);
-
- if ( ctx->stream_type == XC_STREAM_COLO )
- {
- dirty_bitmap = xc_hypercall_buffer_alloc_pages(
- xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
-
- if ( !dirty_bitmap )
- {
- ERROR("Unable to allocate memory for dirty bitmap");
- rc = -1;
- goto err;
- }
- }
-
- rc = ctx->restore.ops.setup(ctx);
- if ( rc )
- goto err;
-
- ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
- ctx->restore.populated_pfns = bitmap_alloc(
- ctx->restore.max_populated_pfn + 1);
- if ( !ctx->restore.populated_pfns )
- {
- ERROR("Unable to allocate memory for populated_pfns bitmap");
- rc = -1;
- goto err;
- }
-
- ctx->restore.buffered_records = malloc(
- DEFAULT_BUF_RECORDS * sizeof(struct xc_sr_record));
- if ( !ctx->restore.buffered_records )
- {
- ERROR("Unable to allocate memory for buffered records");
- rc = -1;
- goto err;
- }
- ctx->restore.allocated_rec_num = DEFAULT_BUF_RECORDS;
-
- err:
- return rc;
-}
-
-static void cleanup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- unsigned int i;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->restore.dirty_bitmap_hbuf);
-
- for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
- free(ctx->restore.buffered_records[i].data);
-
- if ( ctx->stream_type == XC_STREAM_COLO )
- xc_hypercall_buffer_free_pages(
- xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_size)));
-
- free(ctx->restore.buffered_records);
- free(ctx->restore.populated_pfns);
-
- if ( ctx->restore.ops.cleanup(ctx) )
- PERROR("Failed to clean up");
-}
-
-/*
- * Restore a domain.
- */
-static int restore(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_record rec;
- int rc, saved_rc = 0, saved_errno = 0;
-
- IPRINTF("Restoring domain");
-
- rc = setup(ctx);
- if ( rc )
- goto err;
-
- do
- {
- rc = read_record(ctx, ctx->fd, &rec);
- if ( rc )
- {
- if ( ctx->restore.buffer_all_records )
- goto remus_failover;
- else
- goto err;
- }
-
- if ( ctx->restore.buffer_all_records &&
- rec.type != REC_TYPE_END &&
- rec.type != REC_TYPE_CHECKPOINT )
- {
- rc = buffer_record(ctx, &rec);
- if ( rc )
- goto err;
- }
- else
- {
- rc = process_record(ctx, &rec);
- if ( rc == RECORD_NOT_PROCESSED )
- {
- if ( rec.type & REC_TYPE_OPTIONAL )
- DPRINTF("Ignoring optional record %#x (%s)",
- rec.type, rec_type_to_str(rec.type));
- else
- {
- ERROR("Mandatory record %#x (%s) not handled",
- rec.type, rec_type_to_str(rec.type));
- rc = -1;
- goto err;
- }
- }
- else if ( rc == BROKEN_CHANNEL )
- goto remus_failover;
- else if ( rc )
- goto err;
- }
-
- } while ( rec.type != REC_TYPE_END );
-
- remus_failover:
- if ( ctx->stream_type == XC_STREAM_COLO )
- {
- /* With COLO, we have already called stream_complete */
- rc = 0;
- IPRINTF("COLO Failover");
- goto done;
- }
-
-    /*
-     * With Remus, if we reach here, there must have been an error on the
-     * primary, so fail over from the last checkpoint state.
-     */
- rc = ctx->restore.ops.stream_complete(ctx);
- if ( rc )
- goto err;
-
- IPRINTF("Restore successful");
- goto done;
-
- err:
- saved_errno = errno;
- saved_rc = rc;
- PERROR("Restore failed");
-
- done:
- cleanup(ctx);
-
- if ( saved_rc )
- {
- rc = saved_rc;
- errno = saved_errno;
- }
-
- return rc;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
- unsigned int store_evtchn, unsigned long *store_mfn,
- uint32_t store_domid, unsigned int console_evtchn,
- unsigned long *console_gfn, uint32_t console_domid,
- xc_stream_type_t stream_type,
- struct restore_callbacks *callbacks, int send_back_fd)
-{
- xen_pfn_t nr_pfns;
- struct xc_sr_context ctx = {
- .xch = xch,
- .fd = io_fd,
- .stream_type = stream_type,
- };
-
-    /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
- ctx.restore.console_evtchn = console_evtchn;
- ctx.restore.console_domid = console_domid;
- ctx.restore.xenstore_evtchn = store_evtchn;
- ctx.restore.xenstore_domid = store_domid;
- ctx.restore.callbacks = callbacks;
- ctx.restore.send_back_fd = send_back_fd;
-
- /* Sanity check stream_type-related parameters */
- switch ( stream_type )
- {
- case XC_STREAM_COLO:
- assert(callbacks->suspend &&
- callbacks->postcopy &&
- callbacks->wait_checkpoint &&
- callbacks->restore_results);
- /* Fallthrough */
- case XC_STREAM_REMUS:
- assert(callbacks->checkpoint);
- /* Fallthrough */
- case XC_STREAM_PLAIN:
- break;
-
- default:
- assert(!"Bad stream_type");
- break;
- }
-
- if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
- {
- PERROR("Failed to get domain info");
- return -1;
- }
-
- if ( ctx.dominfo.domid != dom )
- {
- ERROR("Domain %u does not exist", dom);
- return -1;
- }
-
- DPRINTF("fd %d, dom %u, hvm %u, stream_type %d",
- io_fd, dom, ctx.dominfo.hvm, stream_type);
-
- ctx.domid = dom;
-
- if ( read_headers(&ctx) )
- return -1;
-
- if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
- {
- PERROR("Unable to obtain the guest p2m size");
- return -1;
- }
-
- ctx.restore.p2m_size = nr_pfns;
- ctx.restore.ops = ctx.dominfo.hvm
- ? restore_ops_x86_hvm : restore_ops_x86_pv;
-
- if ( restore(&ctx) )
- return -1;
-
- IPRINTF("XenStore: mfn %#"PRIpfn", dom %d, evt %u",
- ctx.restore.xenstore_gfn,
- ctx.restore.xenstore_domid,
- ctx.restore.xenstore_evtchn);
-
- IPRINTF("Console: mfn %#"PRIpfn", dom %d, evt %u",
- ctx.restore.console_gfn,
- ctx.restore.console_domid,
- ctx.restore.console_evtchn);
-
- *console_gfn = ctx.restore.console_gfn;
- *store_mfn = ctx.restore.xenstore_gfn;
-
- return 0;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-#include <arpa/inet.h>
-
-#include "xg_sr_common_x86.h"
-
-/*
- * Process an HVM_CONTEXT record from the stream.
- */
-static int handle_hvm_context(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- int rc = update_blob(&ctx->x86.hvm.restore.context, rec->data, rec->length);
-
- if ( rc )
- ERROR("Unable to allocate %u bytes for hvm context", rec->length);
-
- return rc;
-}
-
-/*
- * Process an HVM_PARAMS record from the stream.
- */
-static int handle_hvm_params(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_hvm_params *hdr = rec->data;
- struct xc_sr_rec_hvm_params_entry *entry = hdr->param;
- unsigned int i;
- int rc;
-
- if ( rec->length < sizeof(*hdr) )
- {
- ERROR("HVM_PARAMS record truncated: length %u, header size %zu",
- rec->length, sizeof(*hdr));
- return -1;
- }
-
- if ( rec->length != (sizeof(*hdr) + hdr->count * sizeof(*entry)) )
- {
- ERROR("HVM_PARAMS record truncated: header %zu, count %u, "
- "expected len %zu, got %u",
- sizeof(*hdr), hdr->count, hdr->count * sizeof(*entry),
- rec->length);
- return -1;
- }
-
- /*
- * Tolerate empty records. Older sending sides used to accidentally
- * generate them.
- */
- if ( hdr->count == 0 )
- {
- DBGPRINTF("Skipping empty HVM_PARAMS record\n");
- return 0;
- }
-
- for ( i = 0; i < hdr->count; i++, entry++ )
- {
- switch ( entry->index )
- {
- case HVM_PARAM_CONSOLE_PFN:
- ctx->restore.console_gfn = entry->value;
- xc_clear_domain_page(xch, ctx->domid, entry->value);
- break;
- case HVM_PARAM_STORE_PFN:
- ctx->restore.xenstore_gfn = entry->value;
- xc_clear_domain_page(xch, ctx->domid, entry->value);
- break;
- case HVM_PARAM_IOREQ_PFN:
- case HVM_PARAM_BUFIOREQ_PFN:
- xc_clear_domain_page(xch, ctx->domid, entry->value);
- break;
-
- case HVM_PARAM_PAE_ENABLED:
- /*
- * This HVM_PARAM only ever existed to pass data into
- * xc_cpuid_apply_policy(). The function has now been updated to
- * use a normal calling convention, making the param obsolete.
- *
- * Discard if we find it in an old migration stream.
- */
- continue;
- }
-
- rc = xc_hvm_param_set(xch, ctx->domid, entry->index, entry->value);
- if ( rc < 0 )
- {
- PERROR("set HVM param %"PRId64" = 0x%016"PRIx64,
- entry->index, entry->value);
- return rc;
- }
- }
- return 0;
-}
-
-/* restore_ops function. */
-static bool x86_hvm_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- return true;
-}
-
-/* restore_ops function. */
-static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
- xen_pfn_t pfn)
-{
- return pfn;
-}
-
-/* restore_ops function. */
-static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
- xen_pfn_t gfn)
-{
-    /* no-op */
-}
-
-/* restore_ops function. */
-static void x86_hvm_set_page_type(struct xc_sr_context *ctx,
- xen_pfn_t pfn, xen_pfn_t type)
-{
- /* no-op */
-}
-
-/* restore_ops function. */
-static int x86_hvm_localise_page(struct xc_sr_context *ctx,
- uint32_t type, void *page)
-{
- /* no-op */
- return 0;
-}
-
-/*
- * restore_ops function. Confirms the stream matches the domain.
- */
-static int x86_hvm_setup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
-
- if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
- {
- ERROR("Unable to restore %s domain into an x86 HVM domain",
- dhdr_type_to_str(ctx->restore.guest_type));
- return -1;
- }
-
- if ( ctx->restore.guest_page_size != PAGE_SIZE )
- {
- ERROR("Invalid page size %u for x86 HVM domains",
- ctx->restore.guest_page_size);
- return -1;
- }
-
-#ifdef __i386__
- /* Very large domains (> 1TB) will exhaust virtual address space. */
- if ( ctx->restore.p2m_size > 0x0fffffff )
- {
- errno = E2BIG;
- PERROR("Cannot restore this big a guest");
- return -1;
- }
-#endif
-
- return 0;
-}
-
-/*
- * restore_ops function.
- */
-static int x86_hvm_process_record(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- switch ( rec->type )
- {
- case REC_TYPE_X86_TSC_INFO:
- return handle_x86_tsc_info(ctx, rec);
-
- case REC_TYPE_HVM_CONTEXT:
- return handle_hvm_context(ctx, rec);
-
- case REC_TYPE_HVM_PARAMS:
- return handle_hvm_params(ctx, rec);
-
- case REC_TYPE_X86_CPUID_POLICY:
- return handle_x86_cpuid_policy(ctx, rec);
-
- case REC_TYPE_X86_MSR_POLICY:
- return handle_x86_msr_policy(ctx, rec);
-
- default:
- return RECORD_NOT_PROCESSED;
- }
-}
-
-/*
- * restore_ops function. Sets extra hvm parameters and seeds the grant table.
- */
-static int x86_hvm_stream_complete(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_STORE_EVTCHN,
- ctx->restore.xenstore_evtchn);
- if ( rc )
- {
- PERROR("Failed to set HVM_PARAM_STORE_EVTCHN");
- return rc;
- }
-
- rc = xc_hvm_param_set(xch, ctx->domid, HVM_PARAM_CONSOLE_EVTCHN,
- ctx->restore.console_evtchn);
- if ( rc )
- {
- PERROR("Failed to set HVM_PARAM_CONSOLE_EVTCHN");
- return rc;
- }
-
- rc = xc_domain_hvm_setcontext(xch, ctx->domid,
- ctx->x86.hvm.restore.context.ptr,
- ctx->x86.hvm.restore.context.size);
- if ( rc < 0 )
- {
- PERROR("Unable to restore HVM context");
- return rc;
- }
-
- rc = xc_dom_gnttab_seed(xch, ctx->domid, true,
- ctx->restore.console_gfn,
- ctx->restore.xenstore_gfn,
- ctx->restore.console_domid,
- ctx->restore.xenstore_domid);
- if ( rc )
- {
- PERROR("Failed to seed grant table");
- return rc;
- }
-
- return rc;
-}
-
-static int x86_hvm_cleanup(struct xc_sr_context *ctx)
-{
- free(ctx->x86.hvm.restore.context.ptr);
-
- free(ctx->x86.restore.cpuid.ptr);
- free(ctx->x86.restore.msr.ptr);
-
- return 0;
-}
-
-struct xc_sr_restore_ops restore_ops_x86_hvm =
-{
- .pfn_is_valid = x86_hvm_pfn_is_valid,
- .pfn_to_gfn = x86_hvm_pfn_to_gfn,
- .set_gfn = x86_hvm_set_gfn,
- .set_page_type = x86_hvm_set_page_type,
- .localise_page = x86_hvm_localise_page,
- .setup = x86_hvm_setup,
- .process_record = x86_hvm_process_record,
- .static_data_complete = x86_static_data_complete,
- .stream_complete = x86_hvm_stream_complete,
- .cleanup = x86_hvm_cleanup,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-
-#include "xg_sr_common_x86_pv.h"
-
-static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- assert(pfn <= ctx->x86.pv.max_pfn);
-
- return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
-}
-
-/*
- * Expand our local tracking information for the p2m table and the domain's
- * maximum size.  Normally this will be called once to expand from 0 to
- * max_pfn, but it is liable to expand multiple times if the domain grows on
- * the sending side after migration has started.
- */
-static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
-{
- xc_interface *xch = ctx->xch;
- unsigned long old_max = ctx->x86.pv.max_pfn, i;
- unsigned int fpp = PAGE_SIZE / ctx->x86.pv.width;
- unsigned long end_frame = (max_pfn / fpp) + 1;
- unsigned long old_end_frame = (old_max / fpp) + 1;
- xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
- uint32_t *pfn_types = NULL;
- size_t p2msz, p2m_pfnsz, pfn_typesz;
-
- assert(max_pfn > old_max);
-
- p2msz = (max_pfn + 1) * ctx->x86.pv.width;
- p2m = realloc(ctx->x86.pv.p2m, p2msz);
- if ( !p2m )
- {
- ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
- return -1;
- }
- ctx->x86.pv.p2m = p2m;
-
- pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
- pfn_types = realloc(ctx->x86.pv.restore.pfn_types, pfn_typesz);
- if ( !pfn_types )
- {
- ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
- return -1;
- }
- ctx->x86.pv.restore.pfn_types = pfn_types;
-
- p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
- p2m_pfns = realloc(ctx->x86.pv.p2m_pfns, p2m_pfnsz);
- if ( !p2m_pfns )
- {
- ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
- return -1;
- }
- ctx->x86.pv.p2m_frames = end_frame;
- ctx->x86.pv.p2m_pfns = p2m_pfns;
-
- ctx->x86.pv.max_pfn = max_pfn;
- for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
- {
- ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
- ctx->restore.ops.set_page_type(ctx, i, 0);
- }
-
- for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i )
- ctx->x86.pv.p2m_pfns[i] = INVALID_MFN;
-
- DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn);
- return 0;
-}
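
As a standalone check of the frame arithmetic above (the constants are illustrative): with 8-byte p2m entries, fpp is 512, so a max_pfn of 0x80000 (2GiB of 4kB pages) needs 1025 p2m frames.

    #include <assert.h>

    #define SK_PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long width = 8;                    /* 64bit guest entries. */
        unsigned long fpp = SK_PAGE_SIZE / width;   /* Entries per frame. */
        unsigned long max_pfn = 0x80000;            /* 2GiB of 4kB pages. */

        assert(fpp == 512);
        assert(max_pfn / fpp + 1 == 1025);          /* end_frame, as above. */

        return 0;
    }
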
-
-/*
- * Pin all of the pagetables.
- */
-static int pin_pagetables(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- unsigned long i, nr_pins;
- struct mmuext_op pin[MAX_PIN_BATCH];
-
- for ( i = nr_pins = 0; i <= ctx->x86.pv.max_pfn; ++i )
- {
- if ( (ctx->x86.pv.restore.pfn_types[i] &
- XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
- continue;
-
- switch ( (ctx->x86.pv.restore.pfn_types[i] &
- XEN_DOMCTL_PFINFO_LTABTYPE_MASK) )
- {
- case XEN_DOMCTL_PFINFO_L1TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
- break;
- case XEN_DOMCTL_PFINFO_L2TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
- break;
- case XEN_DOMCTL_PFINFO_L3TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
- break;
- case XEN_DOMCTL_PFINFO_L4TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
- break;
- default:
- continue;
- }
-
- pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i);
- nr_pins++;
-
- if ( nr_pins == MAX_PIN_BATCH )
- {
- if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 )
- {
- PERROR("Failed to pin batch of pagetables");
- return -1;
- }
- nr_pins = 0;
- }
- }
-
- if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) )
- {
- PERROR("Failed to pin batch of pagetables");
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Update details in a guest's start_info structure.
- */
-static int process_start_info(struct xc_sr_context *ctx,
- vcpu_guest_context_any_t *vcpu)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t pfn, mfn;
- start_info_any_t *guest_start_info = NULL;
- int rc = -1;
-
- pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86.pv.width);
-
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("Start Info pfn %#lx out of range", pfn);
- goto err;
- }
-
- if ( ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
- {
- ERROR("Start Info pfn %#lx has bad type %u", pfn,
- (ctx->x86.pv.restore.pfn_types[pfn] >>
- XEN_DOMCTL_PFINFO_LTAB_SHIFT));
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Start Info has bad mfn");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86.pv.width);
- guest_start_info = xc_map_foreign_range(
- xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
- if ( !guest_start_info )
- {
- PERROR("Failed to map Start Info at mfn %#lx", mfn);
- goto err;
- }
-
- /* Deal with xenstore stuff */
- pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86.pv.width);
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("XenStore pfn %#lx out of range", pfn);
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("XenStore pfn has bad mfn");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- ctx->restore.xenstore_gfn = mfn;
- SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86.pv.width);
- SET_FIELD(guest_start_info, store_evtchn,
- ctx->restore.xenstore_evtchn, ctx->x86.pv.width);
-
- /* Deal with console stuff */
- pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86.pv.width);
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("Console pfn %#lx out of range", pfn);
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Console pfn has bad mfn");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- ctx->restore.console_gfn = mfn;
- SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86.pv.width);
- SET_FIELD(guest_start_info, console.domU.evtchn,
- ctx->restore.console_evtchn, ctx->x86.pv.width);
-
- /* Set other information */
- SET_FIELD(guest_start_info, nr_pages,
- ctx->x86.pv.max_pfn + 1, ctx->x86.pv.width);
- SET_FIELD(guest_start_info, shared_info,
- ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86.pv.width);
- SET_FIELD(guest_start_info, flags, 0, ctx->x86.pv.width);
-
- rc = 0;
-
- err:
- if ( guest_start_info )
- munmap(guest_start_info, PAGE_SIZE);
-
- return rc;
-}
-
-/*
- * Process one stashed vcpu worth of basic state and send to Xen.
- */
-static int process_vcpu_basic(struct xc_sr_context *ctx,
- unsigned int vcpuid)
-{
- xc_interface *xch = ctx->xch;
- vcpu_guest_context_any_t *vcpu = ctx->x86.pv.restore.vcpus[vcpuid].basic.ptr;
- xen_pfn_t pfn, mfn;
- unsigned int i, gdt_count;
- int rc = -1;
-
- /* Vcpu 0 is special: Convert the suspend record to an mfn. */
- if ( vcpuid == 0 )
- {
- rc = process_start_info(ctx, vcpu);
- if ( rc )
- return rc;
- rc = -1;
- }
-
- SET_FIELD(vcpu, flags,
- GET_FIELD(vcpu, flags, ctx->x86.pv.width) | VGCF_online,
- ctx->x86.pv.width);
-
- gdt_count = GET_FIELD(vcpu, gdt_ents, ctx->x86.pv.width);
- if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
- {
- ERROR("GDT entry count (%u) out of range (max %u)",
- gdt_count, FIRST_RESERVED_GDT_ENTRY);
- errno = ERANGE;
- goto err;
- }
- gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
-
- /* Convert GDT frames to mfns. */
- for ( i = 0; i < gdt_count; ++i )
- {
- pfn = GET_FIELD(vcpu, gdt_frames[i], ctx->x86.pv.width);
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
- goto err;
- }
-
- if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn,
- (ctx->x86.pv.restore.pfn_types[pfn] >>
- XEN_DOMCTL_PFINFO_LTAB_SHIFT));
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("GDT frame %u has bad mfn", i);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- SET_FIELD(vcpu, gdt_frames[i], mfn, ctx->x86.pv.width);
- }
-
- /* Convert CR3 to an mfn. */
- pfn = cr3_to_mfn(ctx, GET_FIELD(vcpu, ctrlreg[3], ctx->x86.pv.width));
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("cr3 (pfn %#lx) out of range", pfn);
- goto err;
- }
-
- if ( (ctx->x86.pv.restore.pfn_types[pfn] &
- XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn,
- (ctx->x86.pv.restore.pfn_types[pfn] >>
- XEN_DOMCTL_PFINFO_LTAB_SHIFT),
- ctx->x86.pv.levels);
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("cr3 has bad mfn");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- SET_FIELD(vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86.pv.width);
-
-    /* 64bit guests: Convert CR1 (the user-mode pagetable base) to an mfn. */
- if ( ctx->x86.pv.levels == 4 && (vcpu->x64.ctrlreg[1] & 1) )
- {
- pfn = vcpu->x64.ctrlreg[1] >> PAGE_SHIFT;
-
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("cr1 (pfn %#lx) out of range", pfn);
- goto err;
- }
-
- if ( (ctx->x86.pv.restore.pfn_types[pfn] &
- XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- (((xen_pfn_t)ctx->x86.pv.levels) << XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn,
- (ctx->x86.pv.restore.pfn_types[pfn] >>
- XEN_DOMCTL_PFINFO_LTAB_SHIFT),
- ctx->x86.pv.levels);
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("cr1 has bad mfn");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- vcpu->x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
- }
-
- if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, vcpu) )
- {
- PERROR("Failed to set vcpu%u's basic info", vcpuid);
- goto err;
- }
-
- rc = 0;
-
- err:
- return rc;
-}
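
The (gdt_count + 511) / 512 conversion above rounds up on the basis that one 4kB frame holds 512 8-byte descriptors. A standalone sanity check of the rounding (sketch_gdt_frames() is illustrative only):

    #include <assert.h>

    /* 8-byte descriptors in 4kB frames => 512 descriptors per frame. */
    static unsigned int sketch_gdt_frames(unsigned int gdt_ents)
    {
        return (gdt_ents + 511) / 512;
    }

    int main(void)
    {
        assert(sketch_gdt_frames(1) == 1);
        assert(sketch_gdt_frames(512) == 1);
        assert(sketch_gdt_frames(513) == 2);

        return 0;
    }
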
-
-/*
- * Process one stashed vcpu worth of extended state and send to Xen.
- */
-static int process_vcpu_extended(struct xc_sr_context *ctx,
- unsigned int vcpuid)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_x86_pv_restore_vcpu *vcpu =
- &ctx->x86.pv.restore.vcpus[vcpuid];
- DECLARE_DOMCTL;
-
- domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
- domctl.domain = ctx->domid;
- memcpy(&domctl.u.ext_vcpucontext, vcpu->extd.ptr, vcpu->extd.size);
-
- if ( xc_domctl(xch, &domctl) != 0 )
- {
- PERROR("Failed to set vcpu%u's extended info", vcpuid);
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Process one stashed vcpu worth of xsave state and send to Xen.
- */
-static int process_vcpu_xsave(struct xc_sr_context *ctx,
- unsigned int vcpuid)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_x86_pv_restore_vcpu *vcpu =
- &ctx->x86.pv.restore.vcpus[vcpuid];
- int rc;
- DECLARE_DOMCTL;
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsave.size);
- if ( !buffer )
- {
- ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
- vcpu->xsave.size);
- return -1;
- }
-
- domctl.cmd = XEN_DOMCTL_setvcpuextstate;
- domctl.domain = ctx->domid;
- domctl.u.vcpuextstate.vcpu = vcpuid;
- domctl.u.vcpuextstate.size = vcpu->xsave.size;
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
-
- memcpy(buffer, vcpu->xsave.ptr, vcpu->xsave.size);
-
- rc = xc_domctl(xch, &domctl);
- if ( rc )
- PERROR("Failed to set vcpu%u's xsave info", vcpuid);
-
- xc_hypercall_buffer_free(xch, buffer);
-
- return rc;
-}
-
-/*
- * Process one stashed vcpu worth of msr state and send to Xen.
- */
-static int process_vcpu_msrs(struct xc_sr_context *ctx,
- unsigned int vcpuid)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_x86_pv_restore_vcpu *vcpu =
- &ctx->x86.pv.restore.vcpus[vcpuid];
- int rc;
- DECLARE_DOMCTL;
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msr.size);
- if ( !buffer )
- {
- ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
- vcpu->msr.size);
- return -1;
- }
-
- domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
- domctl.domain = ctx->domid;
- domctl.u.vcpu_msrs.vcpu = vcpuid;
- domctl.u.vcpu_msrs.msr_count = vcpu->msr.size / sizeof(xen_domctl_vcpu_msr_t);
- set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
-
- memcpy(buffer, vcpu->msr.ptr, vcpu->msr.size);
-
- rc = xc_domctl(xch, &domctl);
- if ( rc )
- PERROR("Failed to set vcpu%u's msrs", vcpuid);
-
- xc_hypercall_buffer_free(xch, buffer);
-
- return rc;
-}
-
-/*
- * Process all stashed vcpu context and send to Xen.
- */
-static int update_vcpu_context(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_x86_pv_restore_vcpu *vcpu;
- unsigned int i;
- int rc = 0;
-
- for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
- {
- vcpu = &ctx->x86.pv.restore.vcpus[i];
-
- if ( vcpu->basic.ptr )
- {
- rc = process_vcpu_basic(ctx, i);
- if ( rc )
- return rc;
- }
- else if ( i == 0 )
- {
- ERROR("Sender didn't send vcpu0's basic state");
- return -1;
- }
-
- if ( vcpu->extd.ptr )
- {
- rc = process_vcpu_extended(ctx, i);
- if ( rc )
- return rc;
- }
-
- if ( vcpu->xsave.ptr )
- {
- rc = process_vcpu_xsave(ctx, i);
- if ( rc )
- return rc;
- }
-
- if ( vcpu->msr.ptr )
- {
- rc = process_vcpu_msrs(ctx, i);
- if ( rc )
- return rc;
- }
- }
-
- return rc;
-}
-
-/*
- * Copy the p2m which has been constructed locally as memory has been
- * allocated, over the p2m in guest, so the guest can find its memory again on
- * resume.
- */
-static int update_guest_p2m(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t mfn, pfn, *guest_p2m = NULL;
- unsigned int i;
- int rc = -1;
-
- for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
- {
- pfn = ctx->x86.pv.p2m_pfns[i];
-
- if ( pfn > ctx->x86.pv.max_pfn )
- {
- ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
- pfn, i);
- goto err;
- }
-
- if ( (ctx->x86.pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
- (ctx->x86.pv.restore.pfn_types[pfn] >>
- XEN_DOMCTL_PFINFO_LTAB_SHIFT));
- goto err;
- }
-
- mfn = pfn_to_mfn(ctx, pfn);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("p2m_frame_list[%u] has bad mfn", i);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- goto err;
- }
-
- ctx->x86.pv.p2m_pfns[i] = mfn;
- }
-
- guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
- ctx->x86.pv.p2m_pfns,
- ctx->x86.pv.p2m_frames);
- if ( !guest_p2m )
- {
- PERROR("Failed to map p2m frames");
- goto err;
- }
-
- memcpy(guest_p2m, ctx->x86.pv.p2m,
- (ctx->x86.pv.max_pfn + 1) * ctx->x86.pv.width);
- rc = 0;
-
- err:
- if ( guest_p2m )
- munmap(guest_p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);
-
- return rc;
-}
-
-/*
- * The valid width/pt_levels values in X86_PV_INFO are inextricably linked.
- * Cross-check the legitimate combinations.
- */
-static bool valid_x86_pv_info_combination(
- const struct xc_sr_rec_x86_pv_info *info)
-{
- switch ( info->guest_width )
- {
- case 4: return info->pt_levels == 3;
- case 8: return info->pt_levels == 4;
- default: return false;
- }
-}
-
-/*
- * Process an X86_PV_INFO record.
- */
-static int handle_x86_pv_info(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_pv_info *info = rec->data;
-
- if ( ctx->x86.pv.restore.seen_pv_info )
- {
- ERROR("Already received X86_PV_INFO record");
- return -1;
- }
-
- if ( rec->length < sizeof(*info) )
- {
- ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
- rec->length, sizeof(*info));
- return -1;
- }
-
- if ( !valid_x86_pv_info_combination(info) )
- {
- ERROR("Invalid X86_PV_INFO combination: width %u, pt_levels %u",
- info->guest_width, info->pt_levels);
- return -1;
- }
-
- /*
-     * PV domains default to native width. For an incoming compat domain, we
- * will typically be the first entity to inform Xen.
- */
- if ( info->guest_width != ctx->x86.pv.width )
- {
- struct xen_domctl domctl = {
- .domain = ctx->domid,
- .cmd = XEN_DOMCTL_set_address_size,
- .u.address_size.size = info->guest_width * 8,
- };
- int rc = do_domctl(xch, &domctl);
-
- if ( rc != 0 )
- {
- ERROR("Failed to update d%d address size to %u",
- ctx->domid, info->guest_width * 8);
- return -1;
- }
-
- /* Domain's information changed, better to refresh. */
- rc = x86_pv_domain_info(ctx);
- if ( rc != 0 )
- {
- ERROR("Unable to refresh guest information");
- return -1;
- }
- }
-
- /* Sanity check (possibly new) domain settings. */
- if ( (info->guest_width != ctx->x86.pv.width) ||
- (info->pt_levels != ctx->x86.pv.levels) )
- {
- ERROR("X86_PV_INFO width/pt_levels settings %u/%u mismatch with d%d %u/%u",
- info->guest_width, info->pt_levels, ctx->domid,
- ctx->x86.pv.width, ctx->x86.pv.levels);
- return -1;
- }
-
- ctx->x86.pv.restore.seen_pv_info = true;
- return 0;
-}
-
-/*
- * Process an X86_PV_P2M_FRAMES record. Takes care of expanding the local p2m
- * state if needed.
- */
-static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
- unsigned int start, end, x, fpp = PAGE_SIZE / ctx->x86.pv.width;
- int rc;
-
- /* v2 compat. Infer the position of STATIC_DATA_END. */
- if ( ctx->restore.format_version < 3 && !ctx->restore.seen_static_data_end )
- {
- rc = handle_static_data_end(ctx);
- if ( rc )
- {
- ERROR("Inferred STATIC_DATA_END record failed");
- return rc;
- }
- }
-
- if ( !ctx->restore.seen_static_data_end )
- {
- ERROR("No STATIC_DATA_END seen");
- return -1;
- }
-
- if ( !ctx->x86.pv.restore.seen_pv_info )
- {
- ERROR("Not yet received X86_PV_INFO record");
- return -1;
- }
-
- if ( rec->length < sizeof(*data) )
- {
- ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
- rec->length, sizeof(*data) + sizeof(uint64_t));
- return -1;
- }
-
- if ( data->start_pfn > data->end_pfn )
- {
- ERROR("End pfn in stream (%#x) exceeds Start (%#x)",
- data->end_pfn, data->start_pfn);
- return -1;
- }
-
- start = data->start_pfn / fpp;
- end = data->end_pfn / fpp + 1;
-
- if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
- {
- ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
- ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
- data->start_pfn, data->end_pfn, rec->length,
- sizeof(*data), end, start, sizeof(uint64_t));
- return -1;
- }
-
- if ( data->end_pfn > ctx->x86.pv.max_pfn )
- {
- rc = expand_p2m(ctx, data->end_pfn);
- if ( rc )
- return rc;
- }
-
- for ( x = 0; x < (end - start); ++x )
- ctx->x86.pv.p2m_pfns[start + x] = data->p2m_pfns[x];
-
- return 0;
-}
-
-/*
- * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
- * The blobs are all stashed to one side as they need to be deferred until the
- * very end of the stream, rather than being sent to Xen at the point they
- * arrive in the stream. It performs all pre-hypercall size validation.
- */
-static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
- struct xc_sr_x86_pv_restore_vcpu *vcpu;
- const char *rec_name;
- size_t blobsz;
- struct xc_sr_blob *blob = NULL;
- int rc = -1;
-
- switch ( rec->type )
- {
- case REC_TYPE_X86_PV_VCPU_BASIC:
- rec_name = "X86_PV_VCPU_BASIC";
- break;
-
- case REC_TYPE_X86_PV_VCPU_EXTENDED:
- rec_name = "X86_PV_VCPU_EXTENDED";
- break;
-
- case REC_TYPE_X86_PV_VCPU_XSAVE:
- rec_name = "X86_PV_VCPU_XSAVE";
- break;
-
- case REC_TYPE_X86_PV_VCPU_MSRS:
- rec_name = "X86_PV_VCPU_MSRS";
- break;
-
- default:
- ERROR("Unrecognised vcpu blob record %s (%u)",
- rec_type_to_str(rec->type), rec->type);
- goto out;
- }
-
- /* Confirm that there is a complete header. */
- if ( rec->length < sizeof(*vhdr) )
- {
- ERROR("%s record truncated: length %u, header size %zu",
- rec_name, rec->length, sizeof(*vhdr));
- goto out;
- }
-
- blobsz = rec->length - sizeof(*vhdr);
-
- /*
- * Tolerate empty records. Older sending sides used to accidentally
- * generate them.
- */
- if ( blobsz == 0 )
- {
- DBGPRINTF("Skipping empty %s record for vcpu %u\n",
- rec_type_to_str(rec->type), vhdr->vcpu_id);
- rc = 0;
- goto out;
- }
-
- /* Check that the vcpu id is within range. */
- if ( vhdr->vcpu_id >= ctx->x86.pv.restore.nr_vcpus )
- {
- ERROR("%s record vcpu_id (%u) exceeds domain max (%u)",
- rec_name, vhdr->vcpu_id, ctx->x86.pv.restore.nr_vcpus - 1);
- goto out;
- }
-
- vcpu = &ctx->x86.pv.restore.vcpus[vhdr->vcpu_id];
-
- /* Further per-record checks, where possible. */
- switch ( rec->type )
- {
- case REC_TYPE_X86_PV_VCPU_BASIC:
- {
- size_t vcpusz = ctx->x86.pv.width == 8 ?
- sizeof(vcpu_guest_context_x86_64_t) :
- sizeof(vcpu_guest_context_x86_32_t);
-
- if ( blobsz != vcpusz )
- {
- ERROR("%s record wrong size: expected %zu, got %u",
- rec_name, sizeof(*vhdr) + vcpusz, rec->length);
- goto out;
- }
- blob = &vcpu->basic;
- break;
- }
-
- case REC_TYPE_X86_PV_VCPU_EXTENDED:
- if ( blobsz > 128 )
- {
- ERROR("%s record too long: max %zu, got %u",
- rec_name, sizeof(*vhdr) + 128, rec->length);
- goto out;
- }
- blob = &vcpu->extd;
- break;
-
- case REC_TYPE_X86_PV_VCPU_XSAVE:
- if ( blobsz < 16 )
- {
- ERROR("%s record too short: min %zu, got %u",
- rec_name, sizeof(*vhdr) + 16, rec->length);
- goto out;
- }
- blob = &vcpu->xsave;
- break;
-
- case REC_TYPE_X86_PV_VCPU_MSRS:
- if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
- {
- ERROR("%s record payload size %zu expected to be a multiple of %zu",
- rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t));
- goto out;
- }
- blob = &vcpu->msr;
- break;
- }
-
- rc = update_blob(blob, vhdr->context, blobsz);
- if ( rc )
- ERROR("Unable to allocate %zu bytes for vcpu%u %s blob",
- blobsz, vhdr->vcpu_id, rec_name);
-
- out:
- return rc;
-}
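
update_blob(), used here and by handle_hvm_context() earlier, is defined in the common save/restore code. A plausible minimal sketch, assuming struct xc_sr_blob is simply a { ptr, size } pair holding a private copy of the payload (the sketch_* names are hypothetical):

    #include <stdlib.h>
    #include <string.h>

    struct sketch_blob { void *ptr; size_t size; };  /* Assumed layout. */

    /* Hypothetical equivalent of update_blob(): stash a private copy. */
    static int sketch_update_blob(struct sketch_blob *blob,
                                  const void *src, size_t size)
    {
        void *p = malloc(size);

        if ( !p )
            return -1;

        memcpy(p, src, size);
        free(blob->ptr);            /* Drop any previously stashed copy. */
        blob->ptr = p;
        blob->size = size;

        return 0;
    }
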
-
-/*
- * Process a SHARED_INFO record from the stream.
- */
-static int handle_shared_info(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- xc_interface *xch = ctx->xch;
- unsigned int i;
- int rc = -1;
- shared_info_any_t *guest_shinfo = NULL;
- const shared_info_any_t *old_shinfo = rec->data;
-
- if ( !ctx->x86.pv.restore.seen_pv_info )
- {
- ERROR("Not yet received X86_PV_INFO record");
- return -1;
- }
-
- if ( rec->length != PAGE_SIZE )
- {
- ERROR("X86_PV_SHARED_INFO record wrong size: length %u"
- ", expected 4096", rec->length);
- goto err;
- }
-
- guest_shinfo = xc_map_foreign_range(
- xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
- ctx->dominfo.shared_info_frame);
- if ( !guest_shinfo )
- {
- PERROR("Failed to map Shared Info at mfn %#lx",
- ctx->dominfo.shared_info_frame);
- goto err;
- }
-
- MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86.pv.width);
- MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86.pv.width);
-
- SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list,
- 0, ctx->x86.pv.width);
-
- MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86.pv.width);
- for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
- SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel,
- 0, ctx->x86.pv.width);
-
- MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86.pv.width);
-
- rc = 0;
-
- err:
- if ( guest_shinfo )
- munmap(guest_shinfo, PAGE_SIZE);
-
- return rc;
-}
-
-/* restore_ops function. */
-static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- return pfn <= ctx->x86.pv.max_pfn;
-}
-
-/* restore_ops function. */
-static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
- unsigned long type)
-{
- assert(pfn <= ctx->x86.pv.max_pfn);
-
- ctx->x86.pv.restore.pfn_types[pfn] = type;
-}
-
-/* restore_ops function. */
-static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
- xen_pfn_t mfn)
-{
- assert(pfn <= ctx->x86.pv.max_pfn);
-
- if ( ctx->x86.pv.width == sizeof(uint64_t) )
- /* 64 bit guest. Need to expand INVALID_MFN for 32 bit toolstacks. */
- ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
- else
- /* 32 bit guest. Can truncate INVALID_MFN for 64 bit toolstacks. */
- ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn;
-}
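
The special-casing above matters because INVALID_MFN is all-ones at whatever width xen_pfn_t has on the toolstack side: a 32 bit toolstack zero-extending its 0xffffffff into a 64 bit guest's p2m would produce a valid-looking frame number. A standalone illustration of the trap:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t inv32 = ~0U;   /* INVALID_MFN as a 32 bit toolstack sees it. */
        uint64_t entry;

        entry = inv32;                           /* Naive widening: wrong. */
        assert(entry == 0xffffffffULL && entry != ~0ULL);

        entry = (inv32 == ~0U) ? ~0ULL : inv32;  /* Explicit expansion. */
        assert(entry == ~0ULL);

        return 0;
    }
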
-
-/*
- * restore_ops function. Convert pfns back to mfns in pagetables. Possibly
- * needs to populate new frames if a PTE is found referring to a frame which
- * hasn't yet been seen from PAGE_DATA records.
- */
-static int x86_pv_localise_page(struct xc_sr_context *ctx,
- uint32_t type, void *page)
-{
- xc_interface *xch = ctx->xch;
- uint64_t *table = page;
- uint64_t pte;
- unsigned int i, to_populate;
- xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))];
-
- type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- /* Only page tables need localisation. */
- if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
- return 0;
-
- /* Check to see whether we need to populate any new frames. */
- for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
- {
- pte = table[i];
-
- if ( pte & _PAGE_PRESENT )
- {
- xen_pfn_t pfn = pte_to_frame(pte);
-
-#ifdef __i386__
- if ( pfn == INVALID_MFN )
- {
- ERROR("PTE truncation detected. L%u[%u] = %016"PRIx64,
- type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
- errno = E2BIG;
- return -1;
- }
-#endif
-
- if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
- pfns[to_populate++] = pfn;
- }
- }
-
- if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
- return -1;
-
- for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
- {
- pte = table[i];
-
- if ( pte & _PAGE_PRESENT )
- {
- xen_pfn_t mfn, pfn;
-
- pfn = pte_to_frame(pte);
- mfn = pfn_to_mfn(ctx, pfn);
-
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Bad mfn for L%u[%u] - pte %"PRIx64,
- type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- return -1;
- }
-
- table[i] = merge_pte(pte, mfn);
- }
- }
-
- return 0;
-}
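
pte_to_frame() and merge_pte() come from the common x86 PV helpers: conceptually they split the frame number out of a 64-bit PTE and splice a new one back in. A rough sketch under the usual x86 layout, with the frame number in PTE bits 12-51 (the real helpers may differ in detail; the sketch_* names are hypothetical):

    #include <stdint.h>

    #define SK_PAGE_SHIFT 12
    #define SK_FRAME_MASK 0x000ffffffffff000ULL     /* PTE bits 12-51. */

    static uint64_t sketch_pte_to_frame(uint64_t pte)
    {
        return (pte & SK_FRAME_MASK) >> SK_PAGE_SHIFT;
    }

    static uint64_t sketch_merge_pte(uint64_t pte, uint64_t mfn)
    {
        return (pte & ~SK_FRAME_MASK) | (mfn << SK_PAGE_SHIFT);
    }
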
-
-/*
- * restore_ops function. Confirm that the incoming stream matches the type of
- * domain we are attempting to restore into.
- */
-static int x86_pv_setup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
- {
- ERROR("Unable to restore %s domain into an x86_pv domain",
- dhdr_type_to_str(ctx->restore.guest_type));
- return -1;
- }
-
- if ( ctx->restore.guest_page_size != PAGE_SIZE )
- {
- ERROR("Invalid page size %d for x86_pv domains",
- ctx->restore.guest_page_size);
- return -1;
- }
-
- rc = x86_pv_domain_info(ctx);
- if ( rc )
- return rc;
-
- ctx->x86.pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
-    ctx->x86.pv.restore.vcpus = calloc(ctx->x86.pv.restore.nr_vcpus,
-                                       sizeof(struct xc_sr_x86_pv_restore_vcpu));
- if ( !ctx->x86.pv.restore.vcpus )
- {
- errno = ENOMEM;
- return -1;
- }
-
-    return x86_pv_map_m2p(ctx);
-}
-
-/*
- * restore_ops function.
- */
-static int x86_pv_process_record(struct xc_sr_context *ctx,
- struct xc_sr_record *rec)
-{
- switch ( rec->type )
- {
- case REC_TYPE_X86_PV_INFO:
- return handle_x86_pv_info(ctx, rec);
-
- case REC_TYPE_X86_PV_P2M_FRAMES:
- return handle_x86_pv_p2m_frames(ctx, rec);
-
- case REC_TYPE_X86_PV_VCPU_BASIC:
- case REC_TYPE_X86_PV_VCPU_EXTENDED:
- case REC_TYPE_X86_PV_VCPU_XSAVE:
- case REC_TYPE_X86_PV_VCPU_MSRS:
- return handle_x86_pv_vcpu_blob(ctx, rec);
-
- case REC_TYPE_SHARED_INFO:
- return handle_shared_info(ctx, rec);
-
- case REC_TYPE_X86_TSC_INFO:
- return handle_x86_tsc_info(ctx, rec);
-
- case REC_TYPE_X86_CPUID_POLICY:
- return handle_x86_cpuid_policy(ctx, rec);
-
- case REC_TYPE_X86_MSR_POLICY:
- return handle_x86_msr_policy(ctx, rec);
-
- default:
- return RECORD_NOT_PROCESSED;
- }
-}
-
-/*
- * restore_ops function. Update the vcpu context in Xen, pin the pagetables,
- * rewrite the p2m and seed the grant table.
- */
-static int x86_pv_stream_complete(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- rc = update_vcpu_context(ctx);
- if ( rc )
- return rc;
-
- rc = pin_pagetables(ctx);
- if ( rc )
- return rc;
-
- rc = update_guest_p2m(ctx);
- if ( rc )
- return rc;
-
- rc = xc_dom_gnttab_seed(xch, ctx->domid, false,
- ctx->restore.console_gfn,
- ctx->restore.xenstore_gfn,
- ctx->restore.console_domid,
- ctx->restore.xenstore_domid);
- if ( rc )
- {
- PERROR("Failed to seed grant table");
- return rc;
- }
-
- return rc;
-}
-
-/*
- * restore_ops function.
- */
-static int x86_pv_cleanup(struct xc_sr_context *ctx)
-{
- free(ctx->x86.pv.p2m);
- free(ctx->x86.pv.p2m_pfns);
-
- if ( ctx->x86.pv.restore.vcpus )
- {
- unsigned int i;
-
- for ( i = 0; i < ctx->x86.pv.restore.nr_vcpus; ++i )
- {
- struct xc_sr_x86_pv_restore_vcpu *vcpu =
- &ctx->x86.pv.restore.vcpus[i];
-
- free(vcpu->basic.ptr);
- free(vcpu->extd.ptr);
- free(vcpu->xsave.ptr);
- free(vcpu->msr.ptr);
- }
-
- free(ctx->x86.pv.restore.vcpus);
- }
-
- free(ctx->x86.pv.restore.pfn_types);
-
- if ( ctx->x86.pv.m2p )
- munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
-
- free(ctx->x86.restore.cpuid.ptr);
- free(ctx->x86.restore.msr.ptr);
-
- return 0;
-}
-
-struct xc_sr_restore_ops restore_ops_x86_pv =
-{
- .pfn_is_valid = x86_pv_pfn_is_valid,
- .pfn_to_gfn = pfn_to_mfn,
- .set_page_type = x86_pv_set_page_type,
- .set_gfn = x86_pv_set_gfn,
- .localise_page = x86_pv_localise_page,
- .setup = x86_pv_setup,
- .process_record = x86_pv_process_record,
- .static_data_complete = x86_static_data_complete,
- .stream_complete = x86_pv_stream_complete,
- .cleanup = x86_pv_cleanup,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-#include <arpa/inet.h>
-
-#include "xg_sr_common.h"
-
-/*
- * Writes an Image header and Domain header into the stream.
- */
-static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type)
-{
- xc_interface *xch = ctx->xch;
- int32_t xen_version = xc_version(xch, XENVER_version, NULL);
- struct xc_sr_ihdr ihdr = {
- .marker = IHDR_MARKER,
- .id = htonl(IHDR_ID),
- .version = htonl(3),
- .options = htons(IHDR_OPT_LITTLE_ENDIAN),
- };
- struct xc_sr_dhdr dhdr = {
- .type = guest_type,
- .page_shift = XC_PAGE_SHIFT,
- .xen_major = (xen_version >> 16) & 0xffff,
- .xen_minor = (xen_version) & 0xffff,
- };
-
- if ( xen_version < 0 )
- {
- PERROR("Unable to obtain Xen Version");
- return -1;
- }
-
- if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
- {
- PERROR("Unable to write Image Header to stream");
- return -1;
- }
-
- if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
- {
- PERROR("Unable to write Domain Header to stream");
- return -1;
- }
-
- return 0;
-}
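
Note the htonl()/htons() calls: the fixed-width image header is written in network (big-endian) byte order regardless of host order, with the endianness of everything after it declared in the options field. A sketch of the matching check a reader might perform, mirroring the writer above (the struct layout and sketch_* names are assumptions, not the real stream-format definitions):

    #include <arpa/inet.h>
    #include <stdint.h>

    /* Assumed mirror of the image header written above. */
    struct sketch_ihdr {
        uint64_t marker;
        uint32_t id;
        uint32_t version;
        uint16_t options;
        uint16_t _res1;
        uint32_t _res2;
    };

    static int sketch_ihdr_ok(const struct sketch_ihdr *ihdr,
                              uint64_t marker, uint32_t id)
    {
        /* The marker is an endian-neutral pattern; id/version are swapped. */
        return ihdr->marker == marker &&
               ntohl(ihdr->id) == id &&
               ntohl(ihdr->version) == 3;
    }
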
-
-/*
- * Writes an END record into the stream.
- */
-static int write_end_record(struct xc_sr_context *ctx)
-{
- struct xc_sr_record end = { .type = REC_TYPE_END };
-
- return write_record(ctx, &end);
-}
-
-/*
- * Writes a STATIC_DATA_END record into the stream.
- */
-static int write_static_data_end_record(struct xc_sr_context *ctx)
-{
- struct xc_sr_record end = { .type = REC_TYPE_STATIC_DATA_END };
-
- return write_record(ctx, &end);
-}
-
-/*
- * Writes a CHECKPOINT record into the stream.
- */
-static int write_checkpoint_record(struct xc_sr_context *ctx)
-{
- struct xc_sr_record checkpoint = { .type = REC_TYPE_CHECKPOINT };
-
- return write_record(ctx, &checkpoint);
-}
-
-/*
- * Writes a batch of memory as a PAGE_DATA record into the stream. The batch
- * is constructed in ctx->save.batch_pfns.
- *
- * This function:
- * - gets the types for each pfn in the batch.
- * - for each pfn with real data:
- * - maps and attempts to localise the pages.
- *   - constructs and writes a PAGE_DATA record into the stream.
- */
-static int write_batch(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t *mfns = NULL, *types = NULL;
- void *guest_mapping = NULL;
- void **guest_data = NULL;
- void **local_pages = NULL;
- int *errors = NULL, rc = -1;
- unsigned int i, p, nr_pages = 0, nr_pages_mapped = 0;
- unsigned int nr_pfns = ctx->save.nr_batch_pfns;
- void *page, *orig_page;
- uint64_t *rec_pfns = NULL;
-    struct iovec *iov = NULL;
-    int iovcnt = 0;
- struct xc_sr_rec_page_data_header hdr = { 0 };
- struct xc_sr_record rec = {
- .type = REC_TYPE_PAGE_DATA,
- };
-
- assert(nr_pfns != 0);
-
- /* Mfns of the batch pfns. */
- mfns = malloc(nr_pfns * sizeof(*mfns));
- /* Types of the batch pfns. */
- types = malloc(nr_pfns * sizeof(*types));
- /* Errors from attempting to map the gfns. */
- errors = malloc(nr_pfns * sizeof(*errors));
- /* Pointers to page data to send. Mapped gfns or local allocations. */
- guest_data = calloc(nr_pfns, sizeof(*guest_data));
- /* Pointers to locally allocated pages. Need freeing. */
- local_pages = calloc(nr_pfns, sizeof(*local_pages));
- /* iovec[] for writev(). */
- iov = malloc((nr_pfns + 4) * sizeof(*iov));
-
- if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
- {
- ERROR("Unable to allocate arrays for a batch of %u pages",
- nr_pfns);
- goto err;
- }
-
- for ( i = 0; i < nr_pfns; ++i )
- {
- types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
- ctx->save.batch_pfns[i]);
-
- /* Likely a ballooned page. */
- if ( mfns[i] == INVALID_MFN )
- {
- set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
- ++ctx->save.nr_deferred_pages;
- }
- }
-
- rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
- if ( rc )
- {
- PERROR("Failed to get types for pfn batch");
- goto err;
- }
- rc = -1;
-
- for ( i = 0; i < nr_pfns; ++i )
- {
- switch ( types[i] )
- {
- case XEN_DOMCTL_PFINFO_BROKEN:
- case XEN_DOMCTL_PFINFO_XALLOC:
- case XEN_DOMCTL_PFINFO_XTAB:
- continue;
- }
-
- mfns[nr_pages++] = mfns[i];
- }
-
- if ( nr_pages > 0 )
- {
- guest_mapping = xenforeignmemory_map(
- xch->fmem, ctx->domid, PROT_READ, nr_pages, mfns, errors);
- if ( !guest_mapping )
- {
- PERROR("Failed to map guest pages");
- goto err;
- }
- nr_pages_mapped = nr_pages;
-
- for ( i = 0, p = 0; i < nr_pfns; ++i )
- {
- switch ( types[i] )
- {
- case XEN_DOMCTL_PFINFO_BROKEN:
- case XEN_DOMCTL_PFINFO_XALLOC:
- case XEN_DOMCTL_PFINFO_XTAB:
- continue;
- }
-
- if ( errors[p] )
- {
- ERROR("Mapping of pfn %#"PRIpfn" (mfn %#"PRIpfn") failed %d",
- ctx->save.batch_pfns[i], mfns[p], errors[p]);
- goto err;
- }
-
- orig_page = page = guest_mapping + (p * PAGE_SIZE);
- rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
-
- if ( orig_page != page )
- local_pages[i] = page;
-
- if ( rc )
- {
- if ( rc == -1 && errno == EAGAIN )
- {
- set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
- ++ctx->save.nr_deferred_pages;
- types[i] = XEN_DOMCTL_PFINFO_XTAB;
- --nr_pages;
- }
- else
- goto err;
- }
- else
- guest_data[i] = page;
-
- rc = -1;
- ++p;
- }
- }
-
- rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
- if ( !rec_pfns )
- {
- ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
- nr_pfns * sizeof(*rec_pfns));
- goto err;
- }
-
- hdr.count = nr_pfns;
-
- rec.length = sizeof(hdr);
- rec.length += nr_pfns * sizeof(*rec_pfns);
- rec.length += nr_pages * PAGE_SIZE;
-
- for ( i = 0; i < nr_pfns; ++i )
- rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
-
- iov[0].iov_base = &rec.type;
- iov[0].iov_len = sizeof(rec.type);
-
- iov[1].iov_base = &rec.length;
- iov[1].iov_len = sizeof(rec.length);
-
- iov[2].iov_base = &hdr;
- iov[2].iov_len = sizeof(hdr);
-
- iov[3].iov_base = rec_pfns;
- iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
-
- iovcnt = 4;
-
- if ( nr_pages )
- {
- for ( i = 0; i < nr_pfns; ++i )
- {
- if ( guest_data[i] )
- {
- iov[iovcnt].iov_base = guest_data[i];
- iov[iovcnt].iov_len = PAGE_SIZE;
- iovcnt++;
- --nr_pages;
- }
- }
- }
-
- if ( writev_exact(ctx->fd, iov, iovcnt) )
- {
- PERROR("Failed to write page data to stream");
- goto err;
- }
-
- /* Sanity check we have sent all the pages we expected to. */
- assert(nr_pages == 0);
- rc = ctx->save.nr_batch_pfns = 0;
-
- err:
- free(rec_pfns);
- if ( guest_mapping )
- xenforeignmemory_unmap(xch->fmem, guest_mapping, nr_pages_mapped);
- for ( i = 0; local_pages && i < nr_pfns; ++i )
- free(local_pages[i]);
- free(iov);
- free(local_pages);
- free(guest_data);
- free(errors);
- free(types);
- free(mfns);
-
- return rc;
-}
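
Each rec_pfns[] entry above packs the 32-bit page type into the top half of a uint64_t and the pfn into the bottom half (for pfns that fit in 32 bits). A tiny round-trip check of that packing, with made-up values:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t pfn = 0x12345, type = 0x80000000ULL;   /* Example values. */
        uint64_t packed = (type << 32) | pfn;

        assert((uint32_t)packed == pfn);    /* Bottom half: the pfn. */
        assert((packed >> 32) == type);     /* Top half: the type. */

        return 0;
    }
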
-
-/*
- * Flush a batch of pfns into the stream.
- */
-static int flush_batch(struct xc_sr_context *ctx)
-{
- int rc = 0;
-
- if ( ctx->save.nr_batch_pfns == 0 )
- return rc;
-
- rc = write_batch(ctx);
-
- if ( !rc )
- {
- VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
- MAX_BATCH_SIZE *
- sizeof(*ctx->save.batch_pfns));
- }
-
- return rc;
-}
-
-/*
- * Add a single pfn to the batch, flushing the batch if full.
- */
-static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
-{
- int rc = 0;
-
- if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
- rc = flush_batch(ctx);
-
- if ( rc == 0 )
- ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
-
- return rc;
-}
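
Together, add_to_batch() and flush_batch() give callers a simple accumulate-then-drain pattern: stream pfns in, then flush whatever remains, exactly as send_dirty_pages() does below. Schematically (sketch_send_pfns() is a hypothetical caller, not part of this file):

    /* Hypothetical caller of the batching helpers above. */
    static int sketch_send_pfns(struct xc_sr_context *ctx,
                                const xen_pfn_t *pfns, size_t n)
    {
        size_t i;
        int rc;

        for ( i = 0; i < n; ++i )
        {
            rc = add_to_batch(ctx, pfns[i]);    /* Flushes when full. */
            if ( rc )
                return rc;
        }

        return flush_batch(ctx);                /* Drain the partial batch. */
    }
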
-
-/*
- * Pause/suspend the domain, and refresh ctx->dominfo if required.
- */
-static int suspend_domain(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
-
- /* TODO: Properly specify the return value from this callback. All
- * implementations currently appear to return 1 for success, whereas
- * the legacy code checks for != 0. */
- int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data);
-
- if ( cb_rc == 0 )
- {
- ERROR("save callback suspend() failed: %d", cb_rc);
- return -1;
- }
-
- /* Refresh domain information. */
- if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
- (ctx->dominfo.domid != ctx->domid) )
- {
- PERROR("Unable to refresh domain information");
- return -1;
- }
-
- /* Confirm the domain has actually been paused. */
- if ( !ctx->dominfo.shutdown ||
- (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
- {
- ERROR("Domain has not been suspended: shutdown %d, reason %d",
- ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason);
- return -1;
- }
-
- xc_report_progress_single(xch, "Domain now suspended");
-
- return 0;
-}
-
-/*
- * Send a subset of pages in the guest's p2m, according to the dirty bitmap.
- * Used for each subsequent iteration of the live migration loop.
- *
- * Bitmap is bounded by p2m_size.
- */
-static int send_dirty_pages(struct xc_sr_context *ctx,
- unsigned long entries)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t p;
- unsigned long written;
- int rc;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
- {
- if ( !test_bit(p, dirty_bitmap) )
- continue;
-
- rc = add_to_batch(ctx, p);
- if ( rc )
- return rc;
-
-        /* Update progress every 4MB (i.e. every 2^10 4kB pages) sent. */
- if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
- xc_report_progress_step(xch, written, entries);
-
- ++written;
- }
-
- rc = flush_batch(ctx);
- if ( rc )
- return rc;
-
- if ( written > entries )
- DPRINTF("Bitmap contained more entries than expected...");
-
- xc_report_progress_step(xch, entries, entries);
-
- return ctx->save.ops.check_vm_state(ctx);
-}
-
-/*
- * Send all pages in the guest's p2m.  Used as the first iteration of the live
- * migration loop, and for a non-live save.
- */
-static int send_all_pages(struct xc_sr_context *ctx)
-{
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- bitmap_set(dirty_bitmap, ctx->save.p2m_size);
-
- return send_dirty_pages(ctx, ctx->save.p2m_size);
-}
-
-static int enable_logdirty(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int on1 = 0, off = 0, on2 = 0;
- int rc;
-
-    /*
-     * This juggling is required if logdirty is already enabled (e.g. by
-     * qemu for VRAM tracking): disable it, then re-enable it for migration.
-     */
- rc = xc_shadow_control(xch, ctx->domid,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL);
- if ( rc < 0 )
- {
- on1 = errno;
- rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL);
- if ( rc < 0 )
- off = errno;
-        else
-        {
- rc = xc_shadow_control(xch, ctx->domid,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL);
- if ( rc < 0 )
- on2 = errno;
- }
- if ( rc < 0 )
- {
- PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2);
- return rc;
- }
- }
-
- return 0;
-}
-
-static int update_progress_string(struct xc_sr_context *ctx, char **str)
-{
- xc_interface *xch = ctx->xch;
- char *new_str = NULL;
- unsigned int iter = ctx->save.stats.iteration;
-
- if ( asprintf(&new_str, "Frames iteration %u", iter) == -1 )
- {
- PERROR("Unable to allocate new progress string");
- return -1;
- }
-
- free(*str);
- *str = new_str;
-
- xc_set_progress_prefix(xch, *str);
- return 0;
-}
-
-/*
- * This is the live migration precopy policy - it's called periodically during
- * the precopy phase of live migrations, and is responsible for deciding when
- * the precopy phase should terminate and what should be done next.
- *
- * The policy implemented here behaves identically to the policy previously
- * hard-coded into xc_domain_save() - it proceeds to the stop-and-copy phase of
- * the live migration when there are either fewer than 50 dirty pages, or more
- * than 5 precopy rounds have completed.
- */
-#define SPP_MAX_ITERATIONS 5
-#define SPP_TARGET_DIRTY_COUNT 50
-
-static int simple_precopy_policy(struct precopy_stats stats, void *user)
-{
- return ((stats.dirty_count >= 0 &&
- stats.dirty_count < SPP_TARGET_DIRTY_COUNT) ||
- stats.iteration >= SPP_MAX_ITERATIONS)
- ? XGS_POLICY_STOP_AND_COPY
- : XGS_POLICY_CONTINUE_PRECOPY;
-}
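
Because the policy is just a callback over struct precopy_stats, callers can substitute their own via save_callbacks->precopy_policy. As an illustration, a hypothetical variant that additionally stops once total traffic exceeds a caller-supplied page budget (budgeted_precopy_policy() and its budget are inventions for this sketch, not anything libxenguest defines; the budget arrives through the callbacks' data pointer, which is what send_memory_live() passes as "user"):

    /* Hypothetical policy: also stop once a total page budget is spent. */
    static int budgeted_precopy_policy(struct precopy_stats stats, void *user)
    {
        const unsigned int *page_budget = user;   /* From callbacks->data. */

        if ( stats.dirty_count >= 0 &&
             stats.dirty_count < SPP_TARGET_DIRTY_COUNT )
            return XGS_POLICY_STOP_AND_COPY;

        if ( stats.total_written > *page_budget ||
             stats.iteration >= SPP_MAX_ITERATIONS )
            return XGS_POLICY_STOP_AND_COPY;

        return XGS_POLICY_CONTINUE_PRECOPY;
    }
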
-
-/*
- * Send memory while guest is running.
- */
-static int send_memory_live(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
- char *progress_str = NULL;
- unsigned int x = 0;
- int rc;
- int policy_decision;
-
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- precopy_policy_t precopy_policy = ctx->save.callbacks->precopy_policy;
- void *data = ctx->save.callbacks->data;
-
- struct precopy_stats *policy_stats;
-
- rc = update_progress_string(ctx, &progress_str);
- if ( rc )
- goto out;
-
- ctx->save.stats = (struct precopy_stats){
- .dirty_count = ctx->save.p2m_size,
- };
- policy_stats = &ctx->save.stats;
-
- if ( precopy_policy == NULL )
- precopy_policy = simple_precopy_policy;
-
- bitmap_set(dirty_bitmap, ctx->save.p2m_size);
-
- for ( ; ; )
- {
- policy_decision = precopy_policy(*policy_stats, data);
- x++;
-
- if ( stats.dirty_count > 0 && policy_decision != XGS_POLICY_ABORT )
- {
- rc = update_progress_string(ctx, &progress_str);
- if ( rc )
- goto out;
-
- rc = send_dirty_pages(ctx, stats.dirty_count);
- if ( rc )
- goto out;
- }
-
- if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
- break;
-
- policy_stats->iteration = x;
- policy_stats->total_written += policy_stats->dirty_count;
- policy_stats->dirty_count = -1;
-
- policy_decision = precopy_policy(*policy_stats, data);
-
- if ( policy_decision != XGS_POLICY_CONTINUE_PRECOPY )
- break;
-
- if ( xc_shadow_control(
- xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
- NULL, 0, &stats) != ctx->save.p2m_size )
- {
- PERROR("Failed to retrieve logdirty bitmap");
- rc = -1;
- goto out;
- }
-
- policy_stats->dirty_count = stats.dirty_count;
-    }
-
- if ( policy_decision == XGS_POLICY_ABORT )
- {
- PERROR("Abort precopy loop");
- rc = -1;
- goto out;
- }
-
- out:
- xc_set_progress_prefix(xch, NULL);
- free(progress_str);
- return rc;
-}
-
-static int colo_merge_secondary_dirty_bitmap(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_record rec;
- uint64_t *pfns = NULL;
- uint64_t pfn;
- unsigned int count, i;
- int rc;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- rc = read_record(ctx, ctx->save.recv_fd, &rec);
- if ( rc )
- goto err;
-
- if ( rec.type != REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST )
- {
- PERROR("Expect dirty bitmap record, but received %u", rec.type);
- rc = -1;
- goto err;
- }
-
- if ( rec.length % sizeof(*pfns) )
- {
- PERROR("Invalid dirty pfn list record length %u", rec.length);
- rc = -1;
- goto err;
- }
-
- count = rec.length / sizeof(*pfns);
- pfns = rec.data;
-
- for ( i = 0; i < count; i++ )
- {
- pfn = pfns[i];
- if ( pfn > ctx->save.p2m_size )
- {
- PERROR("Invalid pfn 0x%" PRIx64, pfn);
- rc = -1;
- goto err;
- }
-
- set_bit(pfn, dirty_bitmap);
- }
-
- rc = 0;
-
- err:
- free(rec.data);
- return rc;
-}
-
-/*
- * Suspend the domain and send dirty memory.
- * This is the last iteration of the live migration and the
- * heart of the checkpointed stream.
- */
-static int suspend_and_send_dirty(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
- char *progress_str = NULL;
- int rc;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- rc = suspend_domain(ctx);
- if ( rc )
- goto out;
-
- if ( xc_shadow_control(
- xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- HYPERCALL_BUFFER(dirty_bitmap), ctx->save.p2m_size,
- NULL, XEN_DOMCTL_SHADOW_LOGDIRTY_FINAL, &stats) !=
- ctx->save.p2m_size )
- {
- PERROR("Failed to retrieve logdirty bitmap");
- rc = -1;
- goto out;
- }
-
- if ( ctx->save.live )
- {
- rc = update_progress_string(ctx, &progress_str);
- if ( rc )
- goto out;
- }
- else
- xc_set_progress_prefix(xch, "Checkpointed save");
-
- bitmap_or(dirty_bitmap, ctx->save.deferred_pages, ctx->save.p2m_size);
-
- if ( !ctx->save.live && ctx->stream_type == XC_STREAM_COLO )
- {
- rc = colo_merge_secondary_dirty_bitmap(ctx);
- if ( rc )
- {
- PERROR("Failed to get secondary vm's dirty pages");
- goto out;
- }
- }
-
- rc = send_dirty_pages(ctx, stats.dirty_count + ctx->save.nr_deferred_pages);
- if ( rc )
- goto out;
-
- bitmap_clear(ctx->save.deferred_pages, ctx->save.p2m_size);
- ctx->save.nr_deferred_pages = 0;
-
- out:
- xc_set_progress_prefix(xch, NULL);
- free(progress_str);
- return rc;
-}
-
-static int verify_frames(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
- int rc;
- struct xc_sr_record rec = { .type = REC_TYPE_VERIFY };
-
- DPRINTF("Enabling verify mode");
-
- rc = write_record(ctx, &rec);
- if ( rc )
- goto out;
-
- xc_set_progress_prefix(xch, "Frames verify");
- rc = send_all_pages(ctx);
- if ( rc )
- goto out;
-
- if ( xc_shadow_control(
- xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- &ctx->save.dirty_bitmap_hbuf, ctx->save.p2m_size,
- NULL, 0, &stats) != ctx->save.p2m_size )
- {
- PERROR("Failed to retrieve logdirty bitmap");
- rc = -1;
- goto out;
- }
-
- DPRINTF(" Further stats: faults %u, dirty %u",
- stats.fault_count, stats.dirty_count);
-
- out:
- return rc;
-}
-
-/*
- * Send all domain memory. This is the heart of the live migration loop.
- */
-static int send_domain_memory_live(struct xc_sr_context *ctx)
-{
- int rc;
-
- rc = enable_logdirty(ctx);
- if ( rc )
- goto out;
-
- rc = send_memory_live(ctx);
- if ( rc )
- goto out;
-
- rc = suspend_and_send_dirty(ctx);
- if ( rc )
- goto out;
-
- if ( ctx->save.debug && ctx->stream_type != XC_STREAM_PLAIN )
- {
- rc = verify_frames(ctx);
- if ( rc )
- goto out;
- }
-
- out:
- return rc;
-}
-
-/*
- * Checkpointed save.
- */
-static int send_domain_memory_checkpointed(struct xc_sr_context *ctx)
-{
- return suspend_and_send_dirty(ctx);
-}
-
-/*
- * Send all domain memory, pausing the domain first. Generally used for
- * suspend-to-file.
- */
-static int send_domain_memory_nonlive(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- rc = suspend_domain(ctx);
- if ( rc )
- goto err;
-
- xc_set_progress_prefix(xch, "Frames");
-
- rc = send_all_pages(ctx);
- if ( rc )
- goto err;
-
- err:
- return rc;
-}
-
-static int setup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- rc = ctx->save.ops.setup(ctx);
- if ( rc )
- goto err;
-
- dirty_bitmap = xc_hypercall_buffer_alloc_pages(
- xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->save.p2m_size)));
- ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
- sizeof(*ctx->save.batch_pfns));
- ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
-
- if ( !ctx->save.batch_pfns || !dirty_bitmap || !ctx->save.deferred_pages )
- {
- ERROR("Unable to allocate memory for dirty bitmaps, batch pfns and"
- " deferred pages");
- rc = -1;
- errno = ENOMEM;
- goto err;
- }
-
- rc = 0;
-
- err:
- return rc;
-}
-
-static void cleanup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
- &ctx->save.dirty_bitmap_hbuf);
-
- xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL);
-
- if ( ctx->save.ops.cleanup(ctx) )
- PERROR("Failed to clean up");
-
- xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
- NRPAGES(bitmap_size(ctx->save.p2m_size)));
- free(ctx->save.deferred_pages);
- free(ctx->save.batch_pfns);
-}
-
-/*
- * Save a domain.
- */
-static int save(struct xc_sr_context *ctx, uint16_t guest_type)
-{
- xc_interface *xch = ctx->xch;
- int rc, saved_rc = 0, saved_errno = 0;
-
- IPRINTF("Saving domain %d, type %s",
- ctx->domid, dhdr_type_to_str(guest_type));
-
- rc = setup(ctx);
- if ( rc )
- goto err;
-
- xc_report_progress_single(xch, "Start of stream");
-
- rc = write_headers(ctx, guest_type);
- if ( rc )
- goto err;
-
- rc = ctx->save.ops.static_data(ctx);
- if ( rc )
- goto err;
-
- rc = write_static_data_end_record(ctx);
- if ( rc )
- goto err;
-
- rc = ctx->save.ops.start_of_stream(ctx);
- if ( rc )
- goto err;
-
- do {
- rc = ctx->save.ops.start_of_checkpoint(ctx);
- if ( rc )
- goto err;
-
- rc = ctx->save.ops.check_vm_state(ctx);
- if ( rc )
- goto err;
-
- if ( ctx->save.live )
- rc = send_domain_memory_live(ctx);
- else if ( ctx->stream_type != XC_STREAM_PLAIN )
- rc = send_domain_memory_checkpointed(ctx);
- else
- rc = send_domain_memory_nonlive(ctx);
-
- if ( rc )
- goto err;
-
- if ( !ctx->dominfo.shutdown ||
- (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
- {
- ERROR("Domain has not been suspended");
- rc = -1;
- goto err;
- }
-
- rc = ctx->save.ops.end_of_checkpoint(ctx);
- if ( rc )
- goto err;
-
- if ( ctx->stream_type != XC_STREAM_PLAIN )
- {
- /*
- * We have now completed the initial live portion of the checkpoint
- * process. Therefore switch into periodically sending synchronous
- * batches of pages.
- */
- ctx->save.live = false;
-
- rc = write_checkpoint_record(ctx);
- if ( rc )
- goto err;
-
- if ( ctx->stream_type == XC_STREAM_COLO )
- {
- rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
- if ( !rc )
- {
- rc = -1;
- goto err;
- }
- }
-
- rc = ctx->save.callbacks->postcopy(ctx->save.callbacks->data);
- if ( rc <= 0 )
- goto err;
-
- if ( ctx->stream_type == XC_STREAM_COLO )
- {
- rc = ctx->save.callbacks->wait_checkpoint(
- ctx->save.callbacks->data);
- if ( rc <= 0 )
- goto err;
- }
- else if ( ctx->stream_type == XC_STREAM_REMUS )
- {
- rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data);
- if ( rc <= 0 )
- goto err;
- }
- else
- {
- ERROR("Unknown checkpointed stream");
- rc = -1;
- goto err;
- }
- }
- } while ( ctx->stream_type != XC_STREAM_PLAIN );
-
- xc_report_progress_single(xch, "End of stream");
-
- rc = write_end_record(ctx);
- if ( rc )
- goto err;
-
- xc_report_progress_single(xch, "Complete");
- goto done;
-
- err:
- saved_errno = errno;
- saved_rc = rc;
- PERROR("Save failed");
-
- done:
- cleanup(ctx);
-
- if ( saved_rc )
- {
- rc = saved_rc;
- errno = saved_errno;
- }
-
- return rc;
-}
-
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom,
- uint32_t flags, struct save_callbacks *callbacks,
- xc_stream_type_t stream_type, int recv_fd)
-{
- struct xc_sr_context ctx = {
- .xch = xch,
- .fd = io_fd,
- .stream_type = stream_type,
- };
-
- /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions. */
- ctx.save.callbacks = callbacks;
- ctx.save.live = !!(flags & XCFLAGS_LIVE);
- ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
- ctx.save.recv_fd = recv_fd;
-
- if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
- {
- PERROR("Failed to get domain info");
- return -1;
- }
-
- if ( ctx.dominfo.domid != dom )
- {
- ERROR("Domain %u does not exist", dom);
- return -1;
- }
-
- /* Sanity check stream_type-related parameters */
- switch ( stream_type )
- {
- case XC_STREAM_COLO:
- assert(callbacks->wait_checkpoint);
- /* Fallthrough */
- case XC_STREAM_REMUS:
- assert(callbacks->checkpoint && callbacks->postcopy);
- /* Fallthrough */
- case XC_STREAM_PLAIN:
- if ( ctx.dominfo.hvm )
- assert(callbacks->switch_qemu_logdirty);
- break;
-
- default:
- assert(!"Bad stream_type");
- break;
- }
-
- DPRINTF("fd %d, dom %u, flags %u, hvm %d",
- io_fd, dom, flags, ctx.dominfo.hvm);
-
- ctx.domid = dom;
-
- if ( ctx.dominfo.hvm )
- {
- ctx.save.ops = save_ops_x86_hvm;
- return save(&ctx, DHDR_TYPE_X86_HVM);
- }
- else
- {
- ctx.save.ops = save_ops_x86_pv;
- return save(&ctx, DHDR_TYPE_X86_PV);
- }
-}
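For reference, a minimal sketch (not part of this patch) of driving the entry point above for a plain, non-live save to a file. The suspend callback is a stand-in and save_to_file is a hypothetical helper; a real toolstack (libxl) coordinates the suspend via xenstore and supplies switch_qemu_logdirty for HVM guests:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <xenctrl.h>
#include <xenguest.h>

/* Stand-in: assume the domain suspends itself promptly; report success. */
static int noop_suspend(void *data)
{
    return 1; /* non-zero indicates success to libxenguest */
}

int save_to_file(uint32_t domid, const char *path)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    struct save_callbacks cbs = { .suspend = noop_suspend };
    int fd, rc = -1;

    if ( !xch )
        return -1;

    fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if ( fd >= 0 )
    {
        /* flags == 0: neither XCFLAGS_LIVE nor XCFLAGS_DEBUG. */
        rc = xc_domain_save(xch, fd, domid, 0, &cbs, XC_STREAM_PLAIN, -1);
        close(fd);
    }

    xc_interface_close(xch);
    return rc;
}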
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-
-#include "xg_sr_common_x86.h"
-
-#include <xen/hvm/params.h>
-
-/*
- * Query for the HVM context and write an HVM_CONTEXT record into the stream.
- */
-static int write_hvm_context(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc, hvm_buf_size;
- struct xc_sr_record hvm_rec = {
- .type = REC_TYPE_HVM_CONTEXT,
- };
-
- hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid, 0, 0);
- if ( hvm_buf_size < 0 )
- {
- PERROR("Couldn't get HVM context size from Xen");
- rc = -1;
- goto out;
- }
-
- hvm_rec.data = malloc(hvm_buf_size);
- if ( !hvm_rec.data )
- {
- PERROR("Couldn't allocate memory");
- rc = -1;
- goto out;
- }
-
- hvm_buf_size = xc_domain_hvm_getcontext(xch, ctx->domid,
- hvm_rec.data, hvm_buf_size);
- if ( hvm_buf_size < 0 )
- {
- PERROR("Couldn't get HVM context from Xen");
- rc = -1;
- goto out;
- }
-
- hvm_rec.length = hvm_buf_size;
- rc = write_record(ctx, &hvm_rec);
- if ( rc < 0 )
- {
- PERROR("error write HVM_CONTEXT record");
- goto out;
- }
-
- out:
- free(hvm_rec.data);
- return rc;
-}
-
-/*
- * Query for a range of HVM parameters and write an HVM_PARAMS record into the
- * stream.
- */
-static int write_hvm_params(struct xc_sr_context *ctx)
-{
- static const unsigned int params[] = {
- HVM_PARAM_STORE_PFN,
- HVM_PARAM_IOREQ_PFN,
- HVM_PARAM_BUFIOREQ_PFN,
- HVM_PARAM_PAGING_RING_PFN,
- HVM_PARAM_MONITOR_RING_PFN,
- HVM_PARAM_SHARING_RING_PFN,
- HVM_PARAM_VM86_TSS_SIZED,
- HVM_PARAM_CONSOLE_PFN,
- HVM_PARAM_ACPI_IOPORTS_LOCATION,
- HVM_PARAM_VIRIDIAN,
- HVM_PARAM_IDENT_PT,
- HVM_PARAM_VM_GENERATION_ID_ADDR,
- HVM_PARAM_IOREQ_SERVER_PFN,
- HVM_PARAM_NR_IOREQ_SERVER_PAGES,
- HVM_PARAM_X87_FIP_WIDTH,
- HVM_PARAM_MCA_CAP,
- };
-
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_hvm_params_entry entries[ARRAY_SIZE(params)];
- struct xc_sr_rec_hvm_params hdr = {
- .count = 0,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_HVM_PARAMS,
- .length = sizeof(hdr),
- .data = &hdr,
- };
- unsigned int i;
- int rc;
-
- for ( i = 0; i < ARRAY_SIZE(params); i++ )
- {
- uint32_t index = params[i];
- uint64_t value;
-
- rc = xc_hvm_param_get(xch, ctx->domid, index, &value);
- if ( rc )
- {
- PERROR("Failed to get HVMPARAM at index %u", index);
- return rc;
- }
-
- if ( value != 0 )
- {
- entries[hdr.count].index = index;
- entries[hdr.count].value = value;
- hdr.count++;
- }
- }
-
- /* No params? Skip this record. */
- if ( hdr.count == 0 )
- return 0;
-
- rc = write_split_record(ctx, &rec, entries, hdr.count * sizeof(*entries));
- if ( rc )
- PERROR("Failed to write HVM_PARAMS record");
-
- return rc;
-}
-
-static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
- xen_pfn_t pfn)
-{
- /* identity map */
- return pfn;
-}
-
-static int x86_hvm_normalise_page(struct xc_sr_context *ctx,
- xen_pfn_t type, void **page)
-{
- return 0;
-}
-
-static int x86_hvm_setup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t nr_pfns;
-
- if ( xc_domain_nr_gpfns(xch, ctx->domid, &nr_pfns) < 0 )
- {
- PERROR("Unable to obtain the guest p2m size");
- return -1;
- }
-#ifdef __i386__
- /* Very large domains (> 1TB) will exhaust virtual address space. */
- if ( nr_pfns > 0x0fffffff )
- {
- errno = E2BIG;
- PERROR("Cannot save this big a guest");
- return -1;
- }
-#endif
-
- ctx->save.p2m_size = nr_pfns;
-
- if ( ctx->save.callbacks->switch_qemu_logdirty(
- ctx->domid, 1, ctx->save.callbacks->data) )
- {
- PERROR("Couldn't enable qemu log-dirty mode");
- return -1;
- }
-
- ctx->x86.hvm.save.qemu_enabled_logdirty = true;
-
- return 0;
-}
-
-static int x86_hvm_static_data(struct xc_sr_context *ctx)
-{
- return write_x86_cpu_policy_records(ctx);
-}
-
-static int x86_hvm_start_of_stream(struct xc_sr_context *ctx)
-{
- return 0;
-}
-
-static int x86_hvm_start_of_checkpoint(struct xc_sr_context *ctx)
-{
- return 0;
-}
-
-static int x86_hvm_check_vm_state(struct xc_sr_context *ctx)
-{
- return 0;
-}
-
-static int x86_hvm_end_of_checkpoint(struct xc_sr_context *ctx)
-{
- int rc;
-
- /* Write the TSC record. */
- rc = write_x86_tsc_info(ctx);
- if ( rc )
- return rc;
-
- /* Write the HVM_CONTEXT record. */
- rc = write_hvm_context(ctx);
- if ( rc )
- return rc;
-
- /* Write the HVM_PARAMS record containing applicable HVM params. */
- rc = write_hvm_params(ctx);
- if ( rc )
- return rc;
-
- return 0;
-}
-
-static int x86_hvm_cleanup(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
-
- /* If qemu successfully enabled logdirty mode, attempt to disable. */
- if ( ctx->x86.hvm.save.qemu_enabled_logdirty &&
- ctx->save.callbacks->switch_qemu_logdirty(
- ctx->domid, 0, ctx->save.callbacks->data) )
- {
- PERROR("Couldn't disable qemu log-dirty mode");
- return -1;
- }
-
- return 0;
-}
-
-struct xc_sr_save_ops save_ops_x86_hvm =
-{
- .pfn_to_gfn = x86_hvm_pfn_to_gfn,
- .normalise_page = x86_hvm_normalise_page,
- .setup = x86_hvm_setup,
- .static_data = x86_hvm_static_data,
- .start_of_stream = x86_hvm_start_of_stream,
- .start_of_checkpoint = x86_hvm_start_of_checkpoint,
- .end_of_checkpoint = x86_hvm_end_of_checkpoint,
- .check_vm_state = x86_hvm_check_vm_state,
- .cleanup = x86_hvm_cleanup,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#include <assert.h>
-#include <limits.h>
-
-#include "xg_sr_common_x86_pv.h"
-
-/* Check a 64 bit virtual address for being canonical. */
-static inline bool is_canonical_address(xen_vaddr_t vaddr)
-{
- return ((int64_t)vaddr >> 47) == ((int64_t)vaddr >> 63);
-}
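The predicate works by comparing two arithmetic shifts: they agree exactly when bits 47-63 are all copies of bit 47, i.e. when the address does not fall in the non-canonical hole. A standalone check of the same expression (hypothetical test values):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool canonical(uint64_t va)
{
    return ((int64_t)va >> 47) == ((int64_t)va >> 63);
}

int main(void)
{
    assert(canonical(0x00007fffffffffffULL));  /* top of lower half */
    assert(canonical(0xffff800000000000ULL));  /* bottom of upper half */
    assert(!canonical(0x0000800000000000ULL)); /* non-canonical hole */
    return 0;
}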
-
-/*
- * Maps the guest's shared info page.
- */
-static int map_shinfo(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
-
- ctx->x86.pv.shinfo = xc_map_foreign_range(
- xch, ctx->domid, PAGE_SIZE, PROT_READ, ctx->dominfo.shared_info_frame);
- if ( !ctx->x86.pv.shinfo )
- {
- PERROR("Failed to map shared info frame at mfn %#lx",
- ctx->dominfo.shared_info_frame);
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Copy a list of mfns from a guest, accounting for differences between guest
- * and toolstack width. Can fail if truncation would occur.
- */
-static int copy_mfns_from_guest(const struct xc_sr_context *ctx,
- xen_pfn_t *dst, const void *src, size_t count)
-{
- size_t x;
-
- if ( ctx->x86.pv.width == sizeof(unsigned long) )
- memcpy(dst, src, count * sizeof(*dst));
- else
- {
- for ( x = 0; x < count; ++x )
- {
-#ifdef __x86_64__
- /* 64bit toolstack, 32bit guest. Expand any INVALID_MFN. */
- uint32_t s = ((uint32_t *)src)[x];
-
- dst[x] = s == ~0U ? INVALID_MFN : s;
-#else
- /*
- * 32bit toolstack, 64bit guest. Truncate INVALID_MFN, but bail
- * if any other truncation would occur.
- *
- * This will only occur on hosts where a PV guest has ram above
- * the 16TB boundary. A 32bit dom0 is unlikely to have
- * successfully booted on a system this large.
- */
- uint64_t s = ((uint64_t *)src)[x];
-
- if ( (s != ~0ULL) && ((s >> 32) != 0) )
- {
- errno = E2BIG;
- return -1;
- }
-
- dst[x] = s;
-#endif
- }
- }
-
- return 0;
-}
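The widening case is the subtle one: a 32-bit guest stores INVALID_MFN as all-ones in 32 bits, which must become the 64-bit all-ones value rather than being zero-extended. A self-contained sketch of that expansion (local stand-ins for xen_pfn_t and INVALID_MFN):

#include <assert.h>
#include <stdint.h>

typedef uint64_t pfn_t;               /* stand-in for 64-bit xen_pfn_t */
#define LOCAL_INVALID_MFN (~(pfn_t)0) /* stand-in for INVALID_MFN */

static pfn_t widen_mfn(uint32_t guest_mfn)
{
    /* Expand the 32-bit all-ones sentinel; pass other values through. */
    return guest_mfn == ~0U ? LOCAL_INVALID_MFN : guest_mfn;
}

int main(void)
{
    assert(widen_mfn(0x12345) == 0x12345);
    assert(widen_mfn(0xffffffffU) == LOCAL_INVALID_MFN);
    return 0;
}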
-
-/*
- * Map the p2m leaf pages and build an array of their pfns.
- */
-static int map_p2m_leaves(struct xc_sr_context *ctx, xen_pfn_t *mfns,
- size_t n_mfns)
-{
- xc_interface *xch = ctx->xch;
- unsigned int x;
-
- ctx->x86.pv.p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
- mfns, n_mfns);
- if ( !ctx->x86.pv.p2m )
- {
- PERROR("Failed to map p2m frames");
- return -1;
- }
-
- ctx->save.p2m_size = ctx->x86.pv.max_pfn + 1;
- ctx->x86.pv.p2m_frames = n_mfns;
- ctx->x86.pv.p2m_pfns = malloc(n_mfns * sizeof(*mfns));
- if ( !ctx->x86.pv.p2m_pfns )
- {
- ERROR("Cannot allocate %zu bytes for p2m pfns list",
- n_mfns * sizeof(*mfns));
- return -1;
- }
-
- /* Convert leaf frames from mfns to pfns. */
- for ( x = 0; x < n_mfns; ++x )
- {
- if ( !mfn_in_pseudophysmap(ctx, mfns[x]) )
- {
- ERROR("Bad mfn in p2m_frame_list[%u]", x);
- dump_bad_pseudophysmap_entry(ctx, mfns[x]);
- errno = ERANGE;
- return -1;
- }
-
- ctx->x86.pv.p2m_pfns[x] = mfn_to_pfn(ctx, mfns[x]);
- }
-
- return 0;
-}
-
-/*
- * Walk the guest's frame list list and frame list to identify and map the
- * frames making up the guest's p2m table. Construct a list of pfns making up
- * the table.
- */
-static int map_p2m_tree(struct xc_sr_context *ctx)
-{
- /* Terminology:
- *
- * fll - frame list list, top level p2m, list of fl mfns
- * fl - frame list, mid level p2m, list of leaf mfns
- * local - own allocated buffers, adjusted for bitness
- * guest - mappings into the domain
- */
- xc_interface *xch = ctx->xch;
- int rc = -1;
- unsigned int x, saved_x, fpp, fll_entries, fl_entries;
- xen_pfn_t fll_mfn, saved_mfn, max_pfn;
-
- xen_pfn_t *local_fll = NULL;
- void *guest_fll = NULL;
- size_t local_fll_size;
-
- xen_pfn_t *local_fl = NULL;
- void *guest_fl = NULL;
- size_t local_fl_size;
-
- fpp = PAGE_SIZE / ctx->x86.pv.width;
- fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
- if ( fll_entries > fpp )
- {
- ERROR("max_pfn %#lx too large for p2m tree", ctx->x86.pv.max_pfn);
- goto err;
- }
-
- fll_mfn = GET_FIELD(ctx->x86.pv.shinfo, arch.pfn_to_mfn_frame_list_list,
- ctx->x86.pv.width);
- if ( fll_mfn == 0 || fll_mfn > ctx->x86.pv.max_mfn )
- {
- ERROR("Bad mfn %#lx for p2m frame list list", fll_mfn);
- goto err;
- }
-
- /* Map the guest top p2m. */
- guest_fll = xc_map_foreign_range(xch, ctx->domid, PAGE_SIZE,
- PROT_READ, fll_mfn);
- if ( !guest_fll )
- {
- PERROR("Failed to map p2m frame list list at %#lx", fll_mfn);
- goto err;
- }
-
- local_fll_size = fll_entries * sizeof(*local_fll);
- local_fll = malloc(local_fll_size);
- if ( !local_fll )
- {
- ERROR("Cannot allocate %zu bytes for local p2m frame list list",
- local_fll_size);
- goto err;
- }
-
- if ( copy_mfns_from_guest(ctx, local_fll, guest_fll, fll_entries) )
- {
- ERROR("Truncation detected copying p2m frame list list");
- goto err;
- }
-
- /* Check for bad mfns in frame list list. */
- saved_mfn = 0;
- saved_x = 0;
- for ( x = 0; x < fll_entries; ++x )
- {
- if ( local_fll[x] == 0 || local_fll[x] > ctx->x86.pv.max_mfn )
- {
- ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list list",
- local_fll[x], x, fll_entries);
- goto err;
- }
- if ( local_fll[x] != saved_mfn )
- {
- saved_mfn = local_fll[x];
- saved_x = x;
- }
- }
-
- /*
- * Check for an effectively lower max_pfn:
- * If the trailing entries of the frame list list are all the same, we can
- * assume they all reference mid pages which in turn reference p2m pages
- * containing only invalid entries. Otherwise multiple pfns would reference
- * the same mfn, which can't work across migration, as such sharing would
- * be broken by the migration process.
- * Adjust max_pfn if possible to avoid allocating larger areas than needed
- * for the p2m and the logdirty map.
- */
- max_pfn = (saved_x + 1) * fpp * fpp - 1;
- if ( max_pfn < ctx->x86.pv.max_pfn )
- {
- ctx->x86.pv.max_pfn = max_pfn;
- fll_entries = (ctx->x86.pv.max_pfn / (fpp * fpp)) + 1;
- }
- ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
- DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
- ctx->x86.pv.p2m_frames);
- fl_entries = (ctx->x86.pv.max_pfn / fpp) + 1;
-
- /* Map the guest mid p2m frames. */
- guest_fl = xc_map_foreign_pages(xch, ctx->domid, PROT_READ,
- local_fll, fll_entries);
- if ( !guest_fl )
- {
- PERROR("Failed to map p2m frame list");
- goto err;
- }
-
- local_fl_size = fl_entries * sizeof(*local_fl);
- local_fl = malloc(local_fl_size);
- if ( !local_fl )
- {
- ERROR("Cannot allocate %zu bytes for local p2m frame list",
- local_fl_size);
- goto err;
- }
-
- if ( copy_mfns_from_guest(ctx, local_fl, guest_fl, fl_entries) )
- {
- ERROR("Truncation detected copying p2m frame list");
- goto err;
- }
-
- for ( x = 0; x < fl_entries; ++x )
- {
- if ( local_fl[x] == 0 || local_fl[x] > ctx->x86.pv.max_mfn )
- {
- ERROR("Bad mfn %#lx at index %u (of %u) in p2m frame list",
- local_fl[x], x, fl_entries);
- goto err;
- }
- }
-
- /* Map the p2m leaves themselves. */
- rc = map_p2m_leaves(ctx, local_fl, fl_entries);
-
- err:
- free(local_fl);
- if ( guest_fl )
- munmap(guest_fl, fll_entries * PAGE_SIZE);
-
- free(local_fll);
- if ( guest_fll )
- munmap(guest_fll, PAGE_SIZE);
-
- return rc;
-}
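To make the sizing concrete: with 4KiB pages a 64-bit guest has fpp = 4096 / 8 = 512 p2m entries per page, so one frame list covers 512 leaves and one frame list list entry covers 512 * 512 = 262144 pfns. A quick calculation under those assumptions:

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096, width = 8;  /* 64-bit guest */
    unsigned long fpp = page_size / width;      /* entries per p2m page */
    unsigned long max_pfn = (1UL << 20) - 1;    /* 4GiB guest */

    printf("fpp %lu, fl entries %lu, fll entries %lu\n",
           fpp, max_pfn / fpp + 1, max_pfn / (fpp * fpp) + 1);
    return 0;
}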
-
-/*
- * Get p2m_generation count.
- * Returns an error if the generation count has changed since the last call.
- */
-static int get_p2m_generation(struct xc_sr_context *ctx)
-{
- uint64_t p2m_generation;
- int rc;
-
- p2m_generation = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_generation,
- ctx->x86.pv.width);
-
- rc = (p2m_generation == ctx->x86.pv.p2m_generation) ? 0 : -1;
- ctx->x86.pv.p2m_generation = p2m_generation;
-
- return rc;
-}
-
-static int x86_pv_check_vm_state_p2m_list(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
-
- if ( !ctx->save.live )
- return 0;
-
- rc = get_p2m_generation(ctx);
- if ( rc )
- ERROR("p2m generation count changed. Migration aborted.");
-
- return rc;
-}
-
-/*
- * Map the guest p2m frames specified via a cr3 value, a virtual address, and
- * the maximum pfn. PTE entries are 64 bits for both 32 and 64 bit guests, as
- * in the 32 bit case only PAE guests are supported.
- */
-static int map_p2m_list(struct xc_sr_context *ctx, uint64_t p2m_cr3)
-{
- xc_interface *xch = ctx->xch;
- xen_vaddr_t p2m_vaddr, p2m_end, mask, off;
- xen_pfn_t p2m_mfn, mfn, saved_mfn, max_pfn;
- uint64_t *ptes = NULL;
- xen_pfn_t *mfns = NULL;
- unsigned int fpp, n_pages, level, shift, idx_start, idx_end, idx, saved_idx;
- int rc = -1;
-
- p2m_mfn = cr3_to_mfn(ctx, p2m_cr3);
- assert(p2m_mfn != 0);
- if ( p2m_mfn > ctx->x86.pv.max_mfn )
- {
- ERROR("Bad p2m_cr3 value %#" PRIx64, p2m_cr3);
- errno = ERANGE;
- goto err;
- }
-
- get_p2m_generation(ctx);
-
- p2m_vaddr = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_vaddr,
- ctx->x86.pv.width);
- fpp = PAGE_SIZE / ctx->x86.pv.width;
- ctx->x86.pv.p2m_frames = ctx->x86.pv.max_pfn / fpp + 1;
- p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
-
- if ( ctx->x86.pv.width == 8 )
- {
- mask = 0x0000ffffffffffffULL;
- if ( !is_canonical_address(p2m_vaddr) ||
- !is_canonical_address(p2m_end) ||
- p2m_end < p2m_vaddr ||
- (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_64 &&
- p2m_end > HYPERVISOR_VIRT_START_X86_64) )
- {
- ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
- p2m_vaddr, p2m_end);
- errno = ERANGE;
- goto err;
- }
- }
- else
- {
- mask = 0x00000000ffffffffULL;
- if ( p2m_vaddr > mask || p2m_end > mask || p2m_end < p2m_vaddr ||
- (p2m_vaddr <= HYPERVISOR_VIRT_END_X86_32 &&
- p2m_end > HYPERVISOR_VIRT_START_X86_32) )
- {
- ERROR("Bad virtual p2m address range %#" PRIx64 "-%#" PRIx64,
- p2m_vaddr, p2m_end);
- errno = ERANGE;
- goto err;
- }
- }
-
- DPRINTF("p2m list from %#" PRIx64 " to %#" PRIx64 ", root at %#lx",
- p2m_vaddr, p2m_end, p2m_mfn);
- DPRINTF("max_pfn %#lx, p2m_frames %d", ctx->x86.pv.max_pfn,
- ctx->x86.pv.p2m_frames);
-
- mfns = malloc(sizeof(*mfns));
- if ( !mfns )
- {
- ERROR("Cannot allocate memory for array of %u mfns", 1);
- goto err;
- }
- mfns[0] = p2m_mfn;
- off = 0;
- saved_mfn = 0;
- idx_start = idx_end = saved_idx = 0;
-
- for ( level = ctx->x86.pv.levels; level > 0; level-- )
- {
- n_pages = idx_end - idx_start + 1;
- ptes = xc_map_foreign_pages(xch, ctx->domid, PROT_READ, mfns, n_pages);
- if ( !ptes )
- {
- PERROR("Failed to map %u page table pages for p2m list", n_pages);
- goto err;
- }
- free(mfns);
-
- shift = level * 9 + 3;
- idx_start = ((p2m_vaddr - off) & mask) >> shift;
- idx_end = ((p2m_end - off) & mask) >> shift;
- idx = idx_end - idx_start + 1;
- mfns = malloc(sizeof(*mfns) * idx);
- if ( !mfns )
- {
- ERROR("Cannot allocate memory for array of %u mfns", idx);
- goto err;
- }
-
- for ( idx = idx_start; idx <= idx_end; idx++ )
- {
- mfn = pte_to_frame(ptes[idx]);
- if ( mfn == 0 || mfn > ctx->x86.pv.max_mfn )
- {
- ERROR("Bad mfn %#lx during page table walk for vaddr %#" PRIx64 " at level %d of p2m list",
- mfn, off + ((xen_vaddr_t)idx << shift), level);
- errno = ERANGE;
- goto err;
- }
- mfns[idx - idx_start] = mfn;
-
- /* Maximum pfn check at level 2. Same reasoning as for p2m tree. */
- if ( level == 2 )
- {
- if ( mfn != saved_mfn )
- {
- saved_mfn = mfn;
- saved_idx = idx - idx_start;
- }
- }
- }
-
- if ( level == 2 )
- {
- if ( saved_idx == idx_end )
- saved_idx++;
- max_pfn = ((xen_pfn_t)saved_idx << 9) * fpp - 1;
- if ( max_pfn < ctx->x86.pv.max_pfn )
- {
- ctx->x86.pv.max_pfn = max_pfn;
- ctx->x86.pv.p2m_frames = (ctx->x86.pv.max_pfn + fpp) / fpp;
- p2m_end = p2m_vaddr + ctx->x86.pv.p2m_frames * PAGE_SIZE - 1;
- idx_end = idx_start + saved_idx;
- }
- }
-
- munmap(ptes, n_pages * PAGE_SIZE);
- ptes = NULL;
- off = p2m_vaddr & ((mask >> shift) << shift);
- }
-
- /* Map the p2m leaves themselves. */
- rc = map_p2m_leaves(ctx, mfns, idx_end - idx_start + 1);
-
- err:
- free(mfns);
- if ( ptes )
- munmap(ptes, n_pages * PAGE_SIZE);
-
- return rc;
-}
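The shift used in the walk above encodes how much virtual address space one entry covers at each level: a level 1 entry maps a 4KiB page (2^12 = 2^(9*1+3) bytes) and every level above multiplies coverage by the 512 entries per page, hence shift = level * 9 + 3. A trivial illustration:

#include <stdio.h>

int main(void)
{
    /* Bytes mapped by a single PTE at each level of a 4-level walk. */
    for ( unsigned int level = 1; level <= 4; level++ )
        printf("level %u: %llu KiB per entry\n",
               level, (1ULL << (level * 9 + 3)) / 1024);
    return 0;
}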
-
-/*
- * Map the guest p2m frames.
- * Depending on guest support this might either be a virtual mapped linear
- * list (preferred format) or a 3 level tree linked via mfns.
- */
-static int map_p2m(struct xc_sr_context *ctx)
-{
- uint64_t p2m_cr3;
-
- ctx->x86.pv.p2m_generation = ~0ULL;
- ctx->x86.pv.max_pfn = GET_FIELD(ctx->x86.pv.shinfo, arch.max_pfn,
- ctx->x86.pv.width) - 1;
- p2m_cr3 = GET_FIELD(ctx->x86.pv.shinfo, arch.p2m_cr3, ctx->x86.pv.width);
-
- return p2m_cr3 ? map_p2m_list(ctx, p2m_cr3) : map_p2m_tree(ctx);
-}
-
-/*
- * Obtain a specific vcpu's basic state and write an X86_PV_VCPU_BASIC record
- * into the stream. Performs mfn->pfn conversion on architectural state.
- */
-static int write_one_vcpu_basic(struct xc_sr_context *ctx, uint32_t id)
-{
- xc_interface *xch = ctx->xch;
- xen_pfn_t mfn, pfn;
- unsigned int i, gdt_count;
- int rc = -1;
- vcpu_guest_context_any_t vcpu;
- struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
- .vcpu_id = id,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_VCPU_BASIC,
- .length = sizeof(vhdr),
- .data = &vhdr,
- };
-
- if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
- {
- PERROR("Failed to get vcpu%u context", id);
- goto err;
- }
-
- /* Vcpu0 is special: Convert the suspend record to a pfn. */
- if ( id == 0 )
- {
- mfn = GET_FIELD(&vcpu, user_regs.edx, ctx->x86.pv.width);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Bad mfn for suspend record");
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- goto err;
- }
- SET_FIELD(&vcpu, user_regs.edx, mfn_to_pfn(ctx, mfn),
- ctx->x86.pv.width);
- }
-
- gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86.pv.width);
- if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
- {
- ERROR("GDT entry count (%u) out of range (max %u)",
- gdt_count, FIRST_RESERVED_GDT_ENTRY);
- errno = ERANGE;
- goto err;
- }
- gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
-
- /* Convert GDT frames to pfns. */
- for ( i = 0; i < gdt_count; ++i )
- {
- mfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86.pv.width);
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Bad mfn for frame %u of vcpu%u's GDT", i, id);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- goto err;
- }
- SET_FIELD(&vcpu, gdt_frames[i], mfn_to_pfn(ctx, mfn),
- ctx->x86.pv.width);
- }
-
- /* Convert CR3 to a pfn. */
- mfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86.pv.width));
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Bad mfn for vcpu%u's cr3", id);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- goto err;
- }
- pfn = mfn_to_pfn(ctx, mfn);
- SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, pfn), ctx->x86.pv.width);
-
- /* 64bit guests: Convert CR1 (guest pagetables) to pfn. */
- if ( ctx->x86.pv.levels == 4 && vcpu.x64.ctrlreg[1] )
- {
- mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- ERROR("Bad mfn for vcpu%u's cr1", id);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- goto err;
- }
- pfn = mfn_to_pfn(ctx, mfn);
- vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
- }
-
- if ( ctx->x86.pv.width == 8 )
- rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x64));
- else
- rc = write_split_record(ctx, &rec, &vcpu, sizeof(vcpu.x32));
-
- err:
- return rc;
-}
-
-/*
- * Obtain a specific vcpu's extended state and write an X86_PV_VCPU_EXTENDED
- * record into the stream.
- */
-static int write_one_vcpu_extended(struct xc_sr_context *ctx, uint32_t id)
-{
- xc_interface *xch = ctx->xch;
- struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
- .vcpu_id = id,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_VCPU_EXTENDED,
- .length = sizeof(vhdr),
- .data = &vhdr,
- };
- struct xen_domctl domctl = {
- .cmd = XEN_DOMCTL_get_ext_vcpucontext,
- .domain = ctx->domid,
- .u.ext_vcpucontext.vcpu = id,
- };
-
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("Unable to get vcpu%u extended context", id);
- return -1;
- }
-
- /* No content? Skip the record. */
- if ( domctl.u.ext_vcpucontext.size == 0 )
- return 0;
-
- return write_split_record(ctx, &rec, &domctl.u.ext_vcpucontext,
- domctl.u.ext_vcpucontext.size);
-}
-
-/*
- * Query to see whether a specific vcpu has xsave state and if so, write an
- * X86_PV_VCPU_XSAVE record into the stream.
- */
-static int write_one_vcpu_xsave(struct xc_sr_context *ctx, uint32_t id)
-{
- xc_interface *xch = ctx->xch;
- int rc = -1;
- DECLARE_HYPERCALL_BUFFER(void, buffer);
- struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
- .vcpu_id = id,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_VCPU_XSAVE,
- .length = sizeof(vhdr),
- .data = &vhdr,
- };
- struct xen_domctl domctl = {
- .cmd = XEN_DOMCTL_getvcpuextstate,
- .domain = ctx->domid,
- .u.vcpuextstate.vcpu = id,
- };
-
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("Unable to get vcpu%u's xsave context", id);
- goto err;
- }
-
- /* No xsave state? Skip this record. */
- if ( !domctl.u.vcpuextstate.xfeature_mask )
- goto out;
-
- buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
- if ( !buffer )
- {
- ERROR("Unable to allocate %"PRIx64" bytes for vcpu%u's xsave context",
- domctl.u.vcpuextstate.size, id);
- goto err;
- }
-
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("Unable to get vcpu%u's xsave context", id);
- goto err;
- }
-
- /* No xsave state? Skip this record. */
- if ( domctl.u.vcpuextstate.size == 0 )
- goto out;
-
- rc = write_split_record(ctx, &rec, buffer, domctl.u.vcpuextstate.size);
- if ( rc )
- goto err;
-
- out:
- rc = 0;
-
- err:
- xc_hypercall_buffer_free(xch, buffer);
-
- return rc;
-}
-
-/*
- * Query to see whether a specific vcpu has msr state and if so, write an
- * X86_PV_VCPU_MSRS record into the stream.
- */
-static int write_one_vcpu_msrs(struct xc_sr_context *ctx, uint32_t id)
-{
- xc_interface *xch = ctx->xch;
- int rc = -1;
- size_t buffersz;
- DECLARE_HYPERCALL_BUFFER(void, buffer);
- struct xc_sr_rec_x86_pv_vcpu_hdr vhdr = {
- .vcpu_id = id,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_VCPU_MSRS,
- .length = sizeof(vhdr),
- .data = &vhdr,
- };
- struct xen_domctl domctl = {
- .cmd = XEN_DOMCTL_get_vcpu_msrs,
- .domain = ctx->domid,
- .u.vcpu_msrs.vcpu = id,
- };
-
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("Unable to get vcpu%u's msrs", id);
- goto err;
- }
-
- /* No MSRs? Skip this record. */
- if ( !domctl.u.vcpu_msrs.msr_count )
- goto out;
-
- buffersz = domctl.u.vcpu_msrs.msr_count * sizeof(xen_domctl_vcpu_msr_t);
- buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
- if ( !buffer )
- {
- ERROR("Unable to allocate %zu bytes for vcpu%u's msrs",
- buffersz, id);
- goto err;
- }
-
- set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("Unable to get vcpu%u's msrs", id);
- goto err;
- }
-
- /* No MSRs? Skip this record. */
- if ( domctl.u.vcpu_msrs.msr_count == 0 )
- goto out;
-
- rc = write_split_record(ctx, &rec, buffer,
- domctl.u.vcpu_msrs.msr_count *
- sizeof(xen_domctl_vcpu_msr_t));
- if ( rc )
- goto err;
-
- out:
- rc = 0;
-
- err:
- xc_hypercall_buffer_free(xch, buffer);
-
- return rc;
-}
-
-/*
- * For each vcpu, if it is online, write its state into the stream.
- */
-static int write_all_vcpu_information(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- xc_vcpuinfo_t vinfo;
- unsigned int i;
- int rc;
-
- for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
- {
- rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
- if ( rc )
- {
- PERROR("Failed to get vcpu%u information", i);
- return rc;
- }
-
- /* Vcpu offline? Skip all these records. */
- if ( !vinfo.online )
- continue;
-
- rc = write_one_vcpu_basic(ctx, i);
- if ( rc )
- return rc;
-
- rc = write_one_vcpu_extended(ctx, i);
- if ( rc )
- return rc;
-
- rc = write_one_vcpu_xsave(ctx, i);
- if ( rc )
- return rc;
-
- rc = write_one_vcpu_msrs(ctx, i);
- if ( rc )
- return rc;
- }
-
- return 0;
-}
-
-/*
- * Writes an X86_PV_INFO record into the stream.
- */
-static int write_x86_pv_info(struct xc_sr_context *ctx)
-{
- struct xc_sr_rec_x86_pv_info info = {
- .guest_width = ctx->x86.pv.width,
- .pt_levels = ctx->x86.pv.levels,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_INFO,
- .length = sizeof(info),
- .data = &info,
- };
-
- return write_record(ctx, &rec);
-}
-
-/*
- * Writes an X86_PV_P2M_FRAMES record into the stream. This contains the list
- * of pfns making up the p2m table.
- */
-static int write_x86_pv_p2m_frames(struct xc_sr_context *ctx)
-{
- xc_interface *xch = ctx->xch;
- int rc;
- unsigned int i;
- size_t datasz = ctx->x86.pv.p2m_frames * sizeof(uint64_t);
- uint64_t *data = NULL;
- struct xc_sr_rec_x86_pv_p2m_frames hdr = {
- .end_pfn = ctx->x86.pv.max_pfn,
- };
- struct xc_sr_record rec = {
- .type = REC_TYPE_X86_PV_P2M_FRAMES,
- .length = sizeof(hdr),
- .data = &hdr,
- };
-
- /* No need to translate if sizeof(uint64_t) == sizeof(xen_pfn_t). */
- if ( sizeof(uint64_t) != sizeof(*ctx->x86.pv.p2m_pfns) )
- {
- if ( !(data = malloc(datasz)) )
- {
- ERROR("Cannot allocate %zu bytes for X86_PV_P2M_FRAMES data",
- datasz);
- return -1;
- }
-
- for ( i = 0; i < ctx->x86.pv.p2m_frames; ++i )
- data[i] = ctx->x86.pv.p2m_pfns[i];
- }
- else
- data = (uint64_t *)ctx->x86.pv.p2m_pfns;
-
- rc = write_split_record(ctx, &rec, data, datasz);
-
- if ( data != (uint64_t *)ctx->x86.pv.p2m_pfns )
- free(data);
-
- return rc;
-}
-
-/*
- * Writes a SHARED_INFO record into the stream.
- */
-static int write_shared_info(struct xc_sr_context *ctx)
-{
- struct xc_sr_record rec = {
- .type = REC_TYPE_SHARED_INFO,
- .length = PAGE_SIZE,
- .data = ctx->x86.pv.shinfo,
- };
-
- return write_record(ctx, &rec);
-}
-
-/*
- * Normalise a pagetable for the migration stream. Performs mfn->pfn
- * conversions on the ptes.
- */
-static int normalise_pagetable(struct xc_sr_context *ctx, const uint64_t *src,
- uint64_t *dst, unsigned long type)
-{
- xc_interface *xch = ctx->xch;
- uint64_t pte;
- unsigned int i, xen_first = -1, xen_last = -1; /* Indices of Xen mappings. */
-
- type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( ctx->x86.pv.levels == 4 )
- {
- /* 64bit guests only have Xen mappings in their L4 tables. */
- if ( type == XEN_DOMCTL_PFINFO_L4TAB )
- {
- xen_first = (HYPERVISOR_VIRT_START_X86_64 >>
- L4_PAGETABLE_SHIFT_X86_64) & 511;
- xen_last = (HYPERVISOR_VIRT_END_X86_64 >>
- L4_PAGETABLE_SHIFT_X86_64) & 511;
- }
- }
- else
- {
- switch ( type )
- {
- case XEN_DOMCTL_PFINFO_L4TAB:
- ERROR("??? Found L4 table for 32bit guest");
- errno = EINVAL;
- return -1;
-
- case XEN_DOMCTL_PFINFO_L3TAB:
- /* 32bit guests can only use the first 4 entries of their L3 tables.
- * All others are potentially used by Xen. */
- xen_first = 4;
- xen_last = 511;
- break;
-
- case XEN_DOMCTL_PFINFO_L2TAB:
- /* It is hard to spot Xen mappings in a 32bit guest's L2. Most
- * are normal and only a few will have Xen mappings.
- */
- i = (HYPERVISOR_VIRT_START_X86_32 >> L2_PAGETABLE_SHIFT_PAE) & 511;
- if ( pte_to_frame(src[i]) == ctx->x86.pv.compat_m2p_mfn0 )
- {
- xen_first = i;
- xen_last = (HYPERVISOR_VIRT_END_X86_32 >>
- L2_PAGETABLE_SHIFT_PAE) & 511;
- }
- break;
- }
- }
-
- for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
- {
- xen_pfn_t mfn;
-
- pte = src[i];
-
- /* Remove Xen mappings: Xen will reconstruct on the other side. */
- if ( i >= xen_first && i <= xen_last )
- pte = 0;
-
- /*
- * Errors during the live part of migration are expected as a result
- * of split pagetable updates, page type changes, active grant
- * mappings etc. The pagetable will need to be resent after pausing.
- * In such cases we fail with EAGAIN.
- *
- * For domains which are already paused, errors are fatal.
- */
- if ( pte & _PAGE_PRESENT )
- {
- mfn = pte_to_frame(pte);
-
-#ifdef __i386__
- if ( mfn == INVALID_MFN )
- {
- if ( !ctx->dominfo.paused )
- errno = EAGAIN;
- else
- {
- ERROR("PTE truncation detected. L%lu[%u] = %016"PRIx64,
- type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
- errno = E2BIG;
- }
- return -1;
- }
-#endif
-
- if ( (type > XEN_DOMCTL_PFINFO_L1TAB) && (pte & _PAGE_PSE) )
- {
- ERROR("Cannot migrate superpage (L%lu[%u]: 0x%016"PRIx64")",
- type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
- errno = E2BIG;
- return -1;
- }
-
- if ( !mfn_in_pseudophysmap(ctx, mfn) )
- {
- if ( !ctx->dominfo.paused )
- errno = EAGAIN;
- else
- {
- ERROR("Bad mfn for L%lu[%u]",
- type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
- dump_bad_pseudophysmap_entry(ctx, mfn);
- errno = ERANGE;
- }
- return -1;
- }
-
- pte = merge_pte(pte, mfn_to_pfn(ctx, mfn));
- }
-
- dst[i] = pte;
- }
-
- return 0;
-}
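Normalisation boils down to swapping the machine frame embedded in each present PTE for its pseudo-physical frame while leaving the flag bits alone; in a 64-bit PTE the frame occupies bits 12-51. A standalone sketch with local helpers (not libxc's pte_to_frame()/merge_pte(), whose exact masking may differ):

#include <assert.h>
#include <stdint.h>

#define FRAME_MASK 0x000ffffffffff000ULL /* PTE bits 12-51 */

static uint64_t local_pte_to_frame(uint64_t pte)
{
    return (pte & FRAME_MASK) >> 12;
}

static uint64_t local_merge_pte(uint64_t pte, uint64_t frame)
{
    /* Keep the low and high flag bits, substitute the frame field. */
    return (pte & ~FRAME_MASK) | ((frame << 12) & FRAME_MASK);
}

int main(void)
{
    uint64_t pte = (0xabcdeULL << 12) | 0x67; /* frame 0xabcde plus flags */

    assert(local_pte_to_frame(pte) == 0xabcde);
    assert(local_merge_pte(pte, 0x1111) == ((0x1111ULL << 12) | 0x67));
    return 0;
}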
-
-static xen_pfn_t x86_pv_pfn_to_gfn(const struct xc_sr_context *ctx,
- xen_pfn_t pfn)
-{
- assert(pfn <= ctx->x86.pv.max_pfn);
-
- return xc_pfn_to_mfn(pfn, ctx->x86.pv.p2m, ctx->x86.pv.width);
-}
-
-/*
- * save_ops function. Performs pagetable normalisation on appropriate pages.
- */
-static int x86_pv_normalise_page(struct xc_sr_context *ctx, xen_pfn_t type,
- void **page)
-{
- xc_interface *xch = ctx->xch;
- void *local_page;
- int rc;
-
- type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
- return 0;
-
- local_page = malloc(PAGE_SIZE);
- if ( !local_page )
- {
- ERROR("Unable to allocate scratch page");
- rc = -1;
- goto out;
- }
-
- rc = normalise_pagetable(ctx, *page, local_page, type);
- *page = local_page;
-
- out:
- return rc;
-}
-
-/*
- * save_ops function. Queries domain information and maps the Xen m2p and the
- * guest's shinfo and p2m table.
- */
-static int x86_pv_setup(struct xc_sr_context *ctx)
-{
- int rc;
-
- rc = x86_pv_domain_info(ctx);
- if ( rc )
- return rc;
-
- rc = x86_pv_map_m2p(ctx);
- if ( rc )
- return rc;
-
- rc = map_shinfo(ctx);
- if ( rc )
- return rc;
-
- rc = map_p2m(ctx);
- if ( rc )
- return rc;
-
- return 0;
-}
-
-static int x86_pv_static_data(struct xc_sr_context *ctx)
-{
- int rc;
-
- rc = write_x86_pv_info(ctx);
- if ( rc )
- return rc;
-
- rc = write_x86_cpu_policy_records(ctx);
- if ( rc )
- return rc;
-
- return 0;
-}
-
-static int x86_pv_start_of_stream(struct xc_sr_context *ctx)
-{
- int rc;
-
- /*
- * Ideally the P2M should be able to change during migration. Currently
- * corruption will occur if the contents or location of the P2M changes
- * during the live migration loop. If one is very lucky, the breakage
- * will not be subtle.
- */
- rc = write_x86_pv_p2m_frames(ctx);
- if ( rc )
- return rc;
-
- return 0;
-}
-
-static int x86_pv_start_of_checkpoint(struct xc_sr_context *ctx)
-{
- return 0;
-}
-
-static int x86_pv_end_of_checkpoint(struct xc_sr_context *ctx)
-{
- int rc;
-
- rc = write_x86_tsc_info(ctx);
- if ( rc )
- return rc;
-
- rc = write_shared_info(ctx);
- if ( rc )
- return rc;
-
- rc = write_all_vcpu_information(ctx);
- if ( rc )
- return rc;
-
- return 0;
-}
-
-static int x86_pv_check_vm_state(struct xc_sr_context *ctx)
-{
- if ( ctx->x86.pv.p2m_generation == ~0ULL )
- return 0;
-
- return x86_pv_check_vm_state_p2m_list(ctx);
-}
-
-static int x86_pv_cleanup(struct xc_sr_context *ctx)
-{
- free(ctx->x86.pv.p2m_pfns);
-
- if ( ctx->x86.pv.p2m )
- munmap(ctx->x86.pv.p2m, ctx->x86.pv.p2m_frames * PAGE_SIZE);
-
- if ( ctx->x86.pv.shinfo )
- munmap(ctx->x86.pv.shinfo, PAGE_SIZE);
-
- if ( ctx->x86.pv.m2p )
- munmap(ctx->x86.pv.m2p, ctx->x86.pv.nr_m2p_frames * PAGE_SIZE);
-
- return 0;
-}
-
-struct xc_sr_save_ops save_ops_x86_pv =
-{
- .pfn_to_gfn = x86_pv_pfn_to_gfn,
- .normalise_page = x86_pv_normalise_page,
- .setup = x86_pv_setup,
- .static_data = x86_pv_static_data,
- .start_of_stream = x86_pv_start_of_stream,
- .start_of_checkpoint = x86_pv_start_of_checkpoint,
- .end_of_checkpoint = x86_pv_end_of_checkpoint,
- .check_vm_state = x86_pv_check_vm_state,
- .cleanup = x86_pv_cleanup,
-};
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-#ifndef __STREAM_FORMAT__H
-#define __STREAM_FORMAT__H
-
-/*
- * C structures for the Migration v2 stream format.
- * See docs/specs/libxc-migration-stream.pandoc
- */
-
-#include <inttypes.h>
-
-/*
- * Image Header
- */
-struct xc_sr_ihdr
-{
- uint64_t marker;
- uint32_t id;
- uint32_t version;
- uint16_t options;
- uint16_t _res1;
- uint32_t _res2;
-};
-
-#define IHDR_MARKER 0xffffffffffffffffULL
-#define IHDR_ID 0x58454E46U
-
-#define _IHDR_OPT_ENDIAN 0
-#define IHDR_OPT_LITTLE_ENDIAN (0 << _IHDR_OPT_ENDIAN)
-#define IHDR_OPT_BIG_ENDIAN (1 << _IHDR_OPT_ENDIAN)
-
-/*
- * Domain Header
- */
-struct xc_sr_dhdr
-{
- uint32_t type;
- uint16_t page_shift;
- uint16_t _res1;
- uint32_t xen_major;
- uint32_t xen_minor;
-};
-
-#define DHDR_TYPE_X86_PV 0x00000001U
-#define DHDR_TYPE_X86_HVM 0x00000002U
-
-/*
- * Record Header
- */
-struct xc_sr_rhdr
-{
- uint32_t type;
- uint32_t length;
-};
-
-/* All records must be aligned up to an 8 octet boundary */
-#define REC_ALIGN_ORDER (3U)
-/* Somewhat arbitrary - 128MB */
-#define REC_LENGTH_MAX (128U << 20)
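A record body of `length` octets is therefore followed by enough zero padding to bring the total up to the next 8-octet boundary; a sketch of the padding computation implied by REC_ALIGN_ORDER:

#include <assert.h>
#include <stdint.h>

#define ALIGN_ORDER 3U                 /* mirrors REC_ALIGN_ORDER */
#define ALIGN (1U << ALIGN_ORDER)

static uint32_t rec_padding(uint32_t length)
{
    /* Bytes of zero padding needed after the record body. */
    return -length & (ALIGN - 1);
}

int main(void)
{
    assert(rec_padding(0) == 0);
    assert(rec_padding(5) == 3);
    assert(rec_padding(8) == 0);
    return 0;
}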
-
-#define REC_TYPE_END 0x00000000U
-#define REC_TYPE_PAGE_DATA 0x00000001U
-#define REC_TYPE_X86_PV_INFO 0x00000002U
-#define REC_TYPE_X86_PV_P2M_FRAMES 0x00000003U
-#define REC_TYPE_X86_PV_VCPU_BASIC 0x00000004U
-#define REC_TYPE_X86_PV_VCPU_EXTENDED 0x00000005U
-#define REC_TYPE_X86_PV_VCPU_XSAVE 0x00000006U
-#define REC_TYPE_SHARED_INFO 0x00000007U
-#define REC_TYPE_X86_TSC_INFO 0x00000008U
-#define REC_TYPE_HVM_CONTEXT 0x00000009U
-#define REC_TYPE_HVM_PARAMS 0x0000000aU
-#define REC_TYPE_TOOLSTACK 0x0000000bU
-#define REC_TYPE_X86_PV_VCPU_MSRS 0x0000000cU
-#define REC_TYPE_VERIFY 0x0000000dU
-#define REC_TYPE_CHECKPOINT 0x0000000eU
-#define REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST 0x0000000fU
-#define REC_TYPE_STATIC_DATA_END 0x00000010U
-#define REC_TYPE_X86_CPUID_POLICY 0x00000011U
-#define REC_TYPE_X86_MSR_POLICY 0x00000012U
-
-#define REC_TYPE_OPTIONAL 0x80000000U
-
-/* PAGE_DATA */
-struct xc_sr_rec_page_data_header
-{
- uint32_t count;
- uint32_t _res1;
- uint64_t pfn[0];
-};
-
-#define PAGE_DATA_PFN_MASK 0x000fffffffffffffULL
-#define PAGE_DATA_TYPE_MASK 0xf000000000000000ULL
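Each entry of `pfn[]` packs the pfn into the low 52 bits and the page type into the top nibble; the save side merges them as (type << 32) | pfn, with the XEN_DOMCTL_PFINFO_* type value already carrying its own 28-bit shift. A hedged decoding sketch using the masks above (the entry value is made up):

#include <stdint.h>
#include <stdio.h>

#define PFN_MASK  0x000fffffffffffffULL /* == PAGE_DATA_PFN_MASK */
#define TYPE_MASK 0xf000000000000000ULL /* == PAGE_DATA_TYPE_MASK */

int main(void)
{
    uint64_t entry = 0x9000000000001abcULL;     /* example stream entry */
    uint64_t pfn   = entry & PFN_MASK;
    uint64_t type  = (entry & TYPE_MASK) >> 32; /* back to PFINFO position */

    printf("pfn %#llx, type bits %#llx\n",
           (unsigned long long)pfn, (unsigned long long)type);
    return 0;
}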
-
-/* X86_PV_INFO */
-struct xc_sr_rec_x86_pv_info
-{
- uint8_t guest_width;
- uint8_t pt_levels;
- uint8_t _res[6];
-};
-
-/* X86_PV_P2M_FRAMES */
-struct xc_sr_rec_x86_pv_p2m_frames
-{
- uint32_t start_pfn;
- uint32_t end_pfn;
- uint64_t p2m_pfns[0];
-};
-
-/* X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} */
-struct xc_sr_rec_x86_pv_vcpu_hdr
-{
- uint32_t vcpu_id;
- uint32_t _res1;
- uint8_t context[0];
-};
-
-/* X86_TSC_INFO */
-struct xc_sr_rec_x86_tsc_info
-{
- uint32_t mode;
- uint32_t khz;
- uint64_t nsec;
- uint32_t incarnation;
- uint32_t _res1;
-};
-
-/* HVM_PARAMS */
-struct xc_sr_rec_hvm_params_entry
-{
- uint64_t index;
- uint64_t value;
-};
-
-struct xc_sr_rec_hvm_params
-{
- uint32_t count;
- uint32_t _res1;
- struct xc_sr_rec_hvm_params_entry param[0];
-};
-
-#endif
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <unistd.h>
-#include <fcntl.h>
-
-#include <xenevtchn.h>
-
-#include "xc_private.h"
-#include "xenguest.h"
-
-#define SUSPEND_LOCK_FILE XEN_RUN_DIR "/suspend-evtchn-%d.lock"
-
-/*
- * locking
- */
-
-#define ERR(x) do{ \
- ERROR("Can't " #x " lock file for suspend event channel %s: %s\n", \
- suspend_file, strerror(errno)); \
- goto err; \
-}while(0)
-
-#define SUSPEND_FILE_BUFLEN (sizeof(SUSPEND_LOCK_FILE) + 10)
-
-static void get_suspend_file(char buf[], uint32_t domid)
-{
- snprintf(buf, SUSPEND_FILE_BUFLEN, SUSPEND_LOCK_FILE, domid);
-}
-
-static int lock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd)
-{
- int fd = -1, r;
- char suspend_file[SUSPEND_FILE_BUFLEN];
- struct stat ours, theirs;
- struct flock fl;
-
- get_suspend_file(suspend_file, domid);
-
- *lockfd = -1;
-
- for (;;) {
- if (fd >= 0)
- close(fd);
-
- fd = open(suspend_file, O_CREAT | O_RDWR, 0600);
- if (fd < 0)
- ERR("create");
-
- r = fcntl(fd, F_SETFD, FD_CLOEXEC);
- if (r)
- ERR("fcntl F_SETFD FD_CLOEXEC");
-
- memset(&fl, 0, sizeof(fl));
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_len = 1;
- r = fcntl(fd, F_SETLK, &fl);
- if (r)
- ERR("fcntl F_SETLK");
-
- r = fstat(fd, &ours);
- if (r)
- ERR("fstat");
-
- r = stat(suspend_file, &theirs);
- if (r) {
- if (errno == ENOENT)
- /* try again */
- continue;
- ERR("stat");
- }
-
- if (ours.st_ino != theirs.st_ino)
- /* someone else must have removed it while we were locking it */
- continue;
-
- break;
- }
-
- *lockfd = fd;
- return 0;
-
- err:
- if (fd >= 0)
- close(fd);
-
- return -1;
-}
-
-static int unlock_suspend_event(xc_interface *xch, uint32_t domid, int *lockfd)
-{
- int r;
- char suspend_file[SUSPEND_FILE_BUFLEN];
-
- if (*lockfd < 0)
- return 0;
-
- get_suspend_file(suspend_file, domid);
-
- r = unlink(suspend_file);
- if (r)
- ERR("unlink");
-
- r = close(*lockfd);
- *lockfd = -1;
- if (r)
- ERR("close");
-
- err:
- if (*lockfd >= 0)
- close(*lockfd);
-
- return -1;
-}
-
-int xc_await_suspend(xc_interface *xch, xenevtchn_handle *xce, int suspend_evtchn)
-{
- int rc;
-
- do {
- rc = xenevtchn_pending(xce);
- if (rc < 0) {
- ERROR("error polling suspend notification channel: %d", rc);
- return -1;
- }
- } while (rc != suspend_evtchn);
-
- /* harmless for one-off suspend */
- if (xenevtchn_unmask(xce, suspend_evtchn) < 0)
- ERROR("failed to unmask suspend notification channel: %d", rc);
-
- return 0;
-}
-
-/* Internal callers are allowed to call this with suspend_evtchn<0
- * but *lockfd>0. */
-int xc_suspend_evtchn_release(xc_interface *xch, xenevtchn_handle *xce,
- uint32_t domid, int suspend_evtchn, int *lockfd)
-{
- if (suspend_evtchn >= 0)
- xenevtchn_unbind(xce, suspend_evtchn);
-
- return unlock_suspend_event(xch, domid, lockfd);
-}
-
-int xc_suspend_evtchn_init_sane(xc_interface *xch, xenevtchn_handle *xce,
- uint32_t domid, int port, int *lockfd)
-{
- int rc, suspend_evtchn = -1;
-
- if (lock_suspend_event(xch, domid, lockfd)) {
- errno = EINVAL;
- goto cleanup;
- }
-
- suspend_evtchn = xenevtchn_bind_interdomain(xce, domid, port);
- if (suspend_evtchn < 0) {
- ERROR("failed to bind suspend event channel: %d", suspend_evtchn);
- goto cleanup;
- }
-
- rc = xc_domain_subscribe_for_suspend(xch, domid, port);
- if (rc < 0) {
- ERROR("failed to subscribe to domain: %d", rc);
- goto cleanup;
- }
-
- return suspend_evtchn;
-
-cleanup:
- xc_suspend_evtchn_release(xch, xce, domid, suspend_evtchn, lockfd);
-
- return -1;
-}
-
-int xc_suspend_evtchn_init_exclusive(xc_interface *xch, xenevtchn_handle *xce,
- uint32_t domid, int port, int *lockfd)
-{
- int suspend_evtchn;
-
- suspend_evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, lockfd);
- if (suspend_evtchn < 0)
- return suspend_evtchn;
-
- /* event channel is pending immediately after binding */
- xc_await_suspend(xch, xce, suspend_evtchn);
-
- return suspend_evtchn;
-}
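A minimal usage sketch for the helpers above: bind the guest's suspend channel, wait for one notification, then release. wait_for_suspend is a hypothetical wrapper; the port would normally be read from the guest's xenstore node (device/suspend/event-channel):

#include <stdint.h>
#include <xenctrl.h>
#include <xenevtchn.h>

static int wait_for_suspend(xc_interface *xch, uint32_t domid, int port)
{
    xenevtchn_handle *xce = xenevtchn_open(NULL, 0);
    int lockfd, evtchn, rc = -1;

    if ( !xce )
        return -1;

    evtchn = xc_suspend_evtchn_init_sane(xch, xce, domid, port, &lockfd);
    if ( evtchn >= 0 )
    {
        /* Block until the guest signals the suspend event channel. */
        rc = xc_await_suspend(xch, xce, evtchn);
        xc_suspend_evtchn_release(xch, xce, domid, evtchn, &lockfd);
    }

    xenevtchn_close(xce);
    return rc;
}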