David Scott [Fri, 18 Jun 2010 12:38:27 +0000 (13:38 +0100)]
CA-41230: In VM.{snapshot,clone}, keep track of whether a new VDI has actually been created and therefore, whether that VDI should be deleted on failure. In particular CDs are shared not duplicated and so the cleanup code shouldn't try to delete them.
Unfortunately the 'writable ISO SR' support changes the default NFS ISO SR mount options to read/write from read/only, exposing this bug.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
diff -r 2a39d3f3842e ocaml/xapi/xapi_vm_clone.ml
--- a/ocaml/xapi/xapi_vm_clone.ml Mon May 10 17:14:10 2010 +0100
+++ b/ocaml/xapi/xapi_vm_clone.ml Mon May 10 17:14:17 2010 +0100
@@ -21,7 +21,11 @@
open D
let delete_disks rpc session_id disks =
- List.iter (fun (vbd,vdi) -> try Client.VDI.destroy rpc session_id vdi with _ -> ()) disks
+ List.iter (fun (vbd,vdi,on_error_delete) ->
+ if on_error_delete
+ then try Client.VDI.destroy rpc session_id vdi with _ -> ()
+ else debug "Not destroying CD VDI: %s" (Ref.string_of vdi)
+ ) disks
let wait_for_clone ?progress_minmax ~__context task =
Helpers.call_api_functions ~__context (fun rpc session ->
@@ -134,14 +138,14 @@
(* If the VBD is empty there is no VDI to copy. *)
(* If the VBD is a CD then eject it (we cannot make copies of ISOs: they're identified *)
(* by their filename unlike other VDIs) *)
- let newvdi =
+ let newvdi, on_error_delete =
if vbd_r.API.vBD_empty
- then Ref.null
+ then Ref.null, false
else if vbd_r.API.vBD_type = `CD
- then vbd_r.API.vBD_VDI
- else clone_single_vdi ~progress:(done_so_far, size, total) rpc session_id disk_op ~__context vbd_r.API.vBD_VDI driver_params
+ then vbd_r.API.vBD_VDI, false (* don't delete the original CD *)
+ else clone_single_vdi ~progress:(done_so_far, size, total) rpc session_id disk_op ~__context vbd_r.API.vBD_VDI driver_params, true (* do delete newly created VDI *)
in
- ((vbd,newvdi)::acc, (Int64.add done_so_far size))
+ ((vbd,newvdi,on_error_delete)::acc, (Int64.add done_so_far size))
with e ->
debug "Error in safe_clone_disks: %s" (Printexc.to_string e);
delete_disks rpc session_id acc; (* Delete those cloned so far *)
@@ -348,7 +352,7 @@
(* copy VBDs *)
let new_vbds : [`VBD] Ref.t list =
- List.map (fun (vbd, newvdi) -> Xapi_vbd_helpers.copy ~__context ~vm:ref ~vdi:newvdi vbd) cloned_disks in
+ List.map (fun (vbd, newvdi, _) -> Xapi_vbd_helpers.copy ~__context ~vm:ref ~vdi:newvdi vbd) cloned_disks in
(* copy VIFs *)
let new_vifs : [`VIF] Ref.t list =
Rob Hoes [Fri, 18 Jun 2010 12:38:23 +0000 (13:38 +0100)]
Remove "dechainify VLAN" code from dbsync_slave
This is no longer needed, as the creation of VLANs-on-VLANs is not possible anymore (for about two years), and this function therefore causes unnecessary overhead in xapi's startup sequence. Removed an unused function from the same module as well.
Signed-off-by: Rob Hoes <rob.hoes@citrix.com>
diff -r 7e09c13b96e7 ocaml/xapi/dbsync_slave.ml
--- a/ocaml/xapi/dbsync_slave.ml
+++ b/ocaml/xapi/dbsync_slave.ml
@@ -31,14 +31,6 @@
let ( ** ) = Int64.mul
let ( // ) = Int64.div
-let trim_end s =
- let i = ref (String.length s - 1) in
- while !i > 0 && (List.mem s.[!i] [ ' '; '\t'; '\n'; '\r' ])
- do
- decr i
- done;
- if !i >= 0 then String.sub s 0 (!i + 1) else ""
-
(* create localhost record *)
let get_my_ip_addr() =
@@ -115,35 +107,6 @@
end else
Db.Host.remove_from_other_config ~__context ~self:host ~key:Xapi_globs.host_no_local_storage
-(* CA-25162: Dechainify VLANs. We're actually doing this for _all_
- * PIFs, not just those relevant to localhost. Mostly this will be
- * a no-op, and it shouldn't matter if we fix problems for other hosts
- * here, and it covers the case where we're a slave and the master has
- * broken vlans which need to be corrected before we try to replicate
- * them *)
-let fix_chained_vlans ~__context =
- let pifs = Db.PIF.get_all_records ~__context in
- let (vlan_pifs,underlying_pifs) = List.partition (fun (_,pifr) -> pifr.API.pIF_VLAN >= 0L) pifs in
- List.iter (fun (vlan_pif_ref,vlan_pif_record) ->
- let pif_underneath_vlan = Helpers.get_pif_underneath_vlan ~__context vlan_pif_ref in
- if not (List.exists (fun (pif_ref,_) -> pif_ref = pif_underneath_vlan) underlying_pifs) then begin
- (* There's a problem - the underlying PIF of the vlan might be a vlan itself (or might not exist)
- Find the real underlying PIF by matching the host and device *)
- try
- let (real_pif_ref,real_pif_rec) = List.find (fun (_,pif_rec) ->
- pif_rec.API.pIF_host = vlan_pif_record.API.pIF_host &&
- pif_rec.API.pIF_device = vlan_pif_record.API.pIF_device) underlying_pifs in
- let vlan = Db.PIF.get_VLAN_master_of ~__context ~self:vlan_pif_ref in
- warn "Resetting tagged PIF of VLAN %s, previously was %s" (Ref.string_of vlan) (Ref.string_of pif_underneath_vlan);
- Db.VLAN.set_tagged_PIF ~__context ~self:vlan ~value:real_pif_ref
- with _ ->
- (* Can't find an underlying PIF - delete the VLAN record. This is pretty unlikely. *)
- error "Destroying dangling VLAN and associated PIF record - the underlying device has disappeared";
- let vlan = Db.PIF.get_VLAN_master_of ~__context ~self:vlan_pif_ref in
- Db.VLAN.destroy ~__context ~self:vlan;
- Db.PIF.destroy ~__context ~self:vlan_pif_ref
- end) vlan_pifs
-
(*************** update database tools ******************)
Rob Hoes [Fri, 18 Jun 2010 12:38:15 +0000 (13:38 +0100)]
Require lifecycle specification for APIs
Another step in the transition to better lifecycle information in the API.
Each API in the datamodel.ml should have an explicit lifecycle description attached to it (which may be [] for prototypes). Specifying 'in_product_since' is still allowed for backwards compatibility, but should no longer be used!
-let call ~name ?(doc="") ?(in_oss_since=Some "3.0.3") ~in_product_since ?internal_deprecated_since
+let get_published lifecycle =
+ try
+ let _, published, _ = List.find (fun (t, _, _) -> t = Published) lifecycle in
+ Some published
+ with Not_found -> None
+
+let get_deprecated lifecycle =
+ try
+ let _, deprecated, _ = List.find (fun (t, _, _) -> t = Deprecated) lifecycle in
+ Some deprecated
+ with Not_found -> None
+
+let call ~name ?(doc="") ?(in_oss_since=Some "3.0.3") ?in_product_since ?internal_deprecated_since
?result ?(flags=[`Session;`Async])
?(effect=true) ?(tag=Custom) ?(errs=[]) ?(custom_marshaller=false) ?(db_only=false)
?(no_current_operations=false) ?(secret=false) ?(hide_from_docs=false)
@@ -196,20 +208,30 @@
(* if you specify versioned_params then these get put in the params field of the message record;
* otherwise params go in with no default values and param_release=call_release...
*)
- let call_release = {internal=get_product_releases in_product_since;
- opensource=get_oss_releases in_oss_since;
- internal_deprecated_since = internal_deprecated_since;
- } in
+ if lifecycle = None && in_product_since = None then
+ failwith ("Lifecycle for message '" ^ name ^ "' not specified");
let lifecycle = match lifecycle with
| None ->
- let publish = [Published, in_product_since, doc] in
+ let published = match in_product_since with
+ | None -> []
+ | Some rel -> [Published, rel, doc]
+ in
let deprecated = match internal_deprecated_since with
| None -> []
| Some rel -> [Deprecated, rel, ""]
in
- publish @ deprecated
+ published @ deprecated
| Some l -> l
in
+ let call_release =
+ {
+ internal = (match get_published lifecycle with
+ | Some published -> get_product_releases published
+ | None -> ["closed"]);
+ opensource = get_oss_releases in_oss_since;
+ internal_deprecated_since = get_deprecated lifecycle;
+ }
+ in
{
msg_name = name;
msg_params =
@@ -2637,28 +2659,40 @@
(** Make an object field record *)
-let field ?(in_oss_since = Some "3.0.3") ?(in_product_since = rel_rio) ?(internal_only = false)
+let field ?(in_oss_since = Some "3.0.3") ?in_product_since ?(internal_only = false)
?internal_deprecated_since ?(ignore_foreign_key = false) ?(writer_roles=None) ?(reader_roles=None)
?(qualifier = RW) ?(ty = String) ?(effect = false) ?(default_value = None) ?(persist = true)
?(map_keys_roles=[]) (* list of (key_name,(writer_roles)) for a map field *)
?lifecycle name desc =
-
+ (* in_product_since currently defaults to 'Some rel_rio', for backwards compatibility.
+ * This should eventually become 'None'. *)
+ let in_product_since = match in_product_since with None -> Some rel_rio | x -> x in
+ if lifecycle = None && in_product_since = None then
+ failwith ("Lifecycle for field '" ^ name ^ "' not specified");
let lifecycle = match lifecycle with
- | None ->
- let publish = [Published, in_product_since, desc] in
- let deprecated = match internal_deprecated_since with
- | None -> []
- | Some rel -> [Deprecated, rel, ""]
- in
- publish @ deprecated
- | Some l -> l
+ | None ->
+ let published = match in_product_since with
+ | None -> []
+ | Some rel -> [Published, rel, desc]
+ in
+ let deprecated = match internal_deprecated_since with
+ | None -> []
+ | Some rel -> [Deprecated, rel, ""]
+ in
+ published @ deprecated
+ | Some l -> l
+ in
+ let release =
+ {
+ internal = (match get_published lifecycle with
+ | Some published -> get_product_releases published
+ | None -> ["closed"]);
+ opensource = get_oss_releases in_oss_since;
+ internal_deprecated_since = get_deprecated lifecycle;
+ }
in
Field {
- release = {
- internal=get_product_releases in_product_since;
- opensource=(get_oss_releases in_oss_since);
- internal_deprecated_since=internal_deprecated_since;
- };
+ release = release;
lifecycle=lifecycle;
qualifier=qualifier; ty=ty; internal_only = internal_only; default_value = default_value;
field_name=name;
@@ -2704,7 +2738,7 @@
let default_field_writer_roles = _R_POOL_ADMIN (* by default, only root can write to them *)
(** Create an object and map the object name into the messages *)
-let create_obj ?lifecycle ~in_oss_since ~in_product_since ~internal_deprecated_since ~gen_constructor_destructor ~gen_events ~persist ~name ~descr ~doccomments ~contents ~messages ~in_db
+let create_obj ?lifecycle ~in_oss_since ?in_product_since ?(internal_deprecated_since=None) ~gen_constructor_destructor ~gen_events ~persist ~name ~descr ~doccomments ~contents ~messages ~in_db
?(contents_default_reader_roles=default_field_reader_roles) ?(contents_default_writer_roles=None)
?(implicit_messages_allowed_roles=_R_ALL) (* used in implicit obj msgs (get_all, etc) *)
?force_custom_actions:(force_custom_actions=None) (* None,Some(RW),Some(StaticRO) *)
@@ -2719,20 +2753,34 @@
| Field f->Field{f with field_setter_roles=get_field_writer_roles f.field_setter_roles;
field_getter_roles=get_field_reader_roles f.field_getter_roles}
) contents in
+ if lifecycle = None && in_product_since = None then
+ failwith ("Lifecycle for class '" ^ name ^ "' not specified");
let lifecycle = match lifecycle with
| None ->
- let publish = [Published, in_product_since, descr] in
+ let published = match in_product_since with
+ | None -> []
+ | Some rel -> [Published, rel, descr]
+ in
let deprecated = match internal_deprecated_since with
| None -> []
| Some rel -> [Deprecated, rel, ""]
in
- publish @ deprecated
+ published @ deprecated
| Some l -> l
+ in
+ let release =
+ {
+ internal = (match get_published lifecycle with
+ | Some published -> get_product_releases published
+ | None -> ["closed"]);
+ opensource = get_oss_releases in_oss_since;
+ internal_deprecated_since = get_deprecated lifecycle;
+ }
in
let msgs = List.map (fun m -> {m with msg_obj_name=name;msg_allowed_roles=get_msg_allowed_roles m.msg_allowed_roles}) messages in
{ name = name; description = descr; obj_lifecycle = lifecycle; messages = msgs; contents = contents;
doccomments = doccomments; gen_constructor_destructor = gen_constructor_destructor; force_custom_actions = force_custom_actions;
- persist = persist; gen_events = gen_events; obj_release = {internal=get_product_releases in_product_since; opensource=get_oss_releases in_oss_since; internal_deprecated_since = internal_deprecated_since};
+ persist = persist; gen_events = gen_events; obj_release = release;
in_database=in_db; obj_allowed_roles = messages_default_allowed_roles; obj_implicit_msg_allowed_roles = implicit_messages_allowed_roles;
}
%% Document authors
\newcommand{\docauthors}{
}
-\newcommand{\legalnotice}{Copyright \copyright{} 2006-2008 Citrix Systems, Inc. All Rights Reserved.}
+\newcommand{\legalnotice}{Copyright \copyright{} 2006-2010 Citrix Systems, Inc. All Rights Reserved.}
Daniel Stodden [Fri, 18 Jun 2010 12:37:42 +0000 (13:37 +0100)]
Update scripts/block for blkback's new pause/resume ops.
The kernel can emit uevents when quiescing a VBD's I/O queue. It won't
write pause-done, but signal state changes through a new key:
queue-state. The basic idea is to let storage-level code hook into
pause/resume transitions where desirable. This script implements a
default handler.
The original device.ml protocol remains as is, but the implementation
differs:
David Scott [Fri, 18 Jun 2010 12:37:34 +0000 (13:37 +0100)]
CA-41839: Moving to RPMs for xapi had the side-effect of renaming the CLI RPM from xe-cli-... to xapi-xe... This patch changes it back for now to minimise unwanted churn.
Rob Hoes [Thu, 17 Jun 2010 16:31:00 +0000 (17:31 +0100)]
Remove bond-slave filter from PIF.scan
This filter uses sysfs to find out whether an interface is a bond slave. This does not work when the openvswitch backend is used. Besides, it does not seem to be necessary (anymore?) anyway.
PIF.scan creates a new PIF for each interface it finds if this interface is physical and there is no PIF with the same MAC. The slave interfaces always have the same MAC as the bond master PIF (bridging backend) or their "real" MACs as stored in their PIFs (openvswitch backend). Therefore, when a bond is in effect, the MACs of all bond slaves have corresponding PIFs in the DB, so PIF.scan won't create new ones.
Rob Hoes [Thu, 17 Jun 2010 16:31:00 +0000 (17:31 +0100)]
CA-40910: Only copy physical PIFs on pool join
On pool join, any bonds or VLANs on the joining host are supposed to be ignored. When the host reboots after the join, it will inherit the bond/VLAN setup from the pool master. Therefore, Bond and VLAN objects are not copied from the joining host to the pool. However, bond and VLAN master PIFs are, which is not good. This patch fixes that.
Rob Hoes [Thu, 17 Jun 2010 16:31:00 +0000 (17:31 +0100)]
Split off bond/VLAN reconstruction code from Dbsync_slave
When a pool slave starts up, it needs to synchronise its bonds and VLANs with the pool master. This code is in the Dbsync_slave module, but does not really belong there (Dbsync_slave is for synchronising DB fields such as PIF.currently_attached with the state of the physical system). This patch gives the bond/VLAN recreation code its own module.
Jonathan Knowles [Wed, 19 May 2010 10:50:11 +0000 (11:50 +0100)]
[CA-41286] Fixes a memory accounting error in the squeezing code.
When calculating an upper bound for the memory usage of an HVM domain with no balloon driver, the squeezing code would double-count (inappropriately) the shadow memory area.
This caused the squeezing algorithm to underestimate the amount of usable memory of any host running one or more HVM domains without balloon drivers.
This change removes the double-counting error, and also fixes a bug that caused the squeezer to add (inappropriately) the shadow memory size to Xen maxmem.
Signed-off-by: Jonathan Knowles <jonathan.knowles@eu.citrix.com> Acked-by: Dave Scott <dave.scott@eu.citrix.com>
David Scott [Thu, 13 May 2010 13:31:01 +0000 (14:31 +0100)]
Revert c/s 848:dddf116cddb5 - VM.other_config:auto_poweron is inexplicable either way around, so revert back to the old behaviour until we come up with a new plan (probably involving merging this with HA VM restart)
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
Rob Hoes [Thu, 17 Jun 2010 16:31:00 +0000 (17:31 +0100)]
Updated API evolution document
This document is meant to describe how the elements of the XenAPI may evolve over time, and provide compatibility guidelines. This is only a proposal and is still incomplete; we need some discussion about it, especially to establish what sort of compatibility guarantees we can give.
Zheng Li [Tue, 4 May 2010 05:22:08 +0000 (06:22 +0100)]
A few bugfixes and a few minor improvements to current xe cmdline tool implementation
Bugfixes:
* xe command line options don't mix well with the XE_EXTRA_ARGS variable. E.g. setting XE_EXTRA_ARGS to "username=xxxx,password=yyyy" (or any non-nil valid configuration) and calling "xe -s <some server> vm-list" will break xe. Note that this is a common use case in a cluster-like environment where all the machines have the same user/passwd config, where one can conveniently set user/passwd in XE_EXTRA_ARGS once and connect to different servers by only specifying different "-s" arguments in the cmdline.
* Setting "compat=true" in xe's RC file won't work. E.g. xe vm-clone vm-name=<vm name> new-name=<new vm name> with "compat=true" in ~/.xe won't work (but with "compat=true" in XE_EXTRA_ARGS or in xe cmdline will work).
* Setting a password with comma via XE_EXTRA_ARGS will break the logic. After the fix, it's possible to specify that by using backslash to escape the comma (e.g. password=pass\,word)
* clean up the options handling logic, so that cmdline options, RC file setting and XE_EXTRA_ARGS variable can mix consistently even in some corner cases and follow the natural priority: cmdline option > XE_EXTRA_ARGS > ~/.xe RC > default settings
Improvements:
* change options "-debug" and "-debug-on-fail" to "--debug" and "--debug-on-fail", so that every command line option now follows the common naming convention of -shortcut vs. --full-name (with the only standard exception of having both "-help" and "--help"). AFAICS, both debug options are (maybe deliberately) not documented in the manual, so changing the names might not be a big issue regarding compatibility.
* complete the pair relation between command line options and RC/environment variables. Some were missing on either side: e.g. "compat=xxxx" had no "--compat" counterpart and "--debug" ("--debug-on-fail") had no "debug=xxxx" counterpart.
Zheng Li [Fri, 23 Apr 2010 18:30:04 +0000 (19:30 +0100)]
Add missing edition "XD"
"XD" edition seems to be missing here. The more interesting thing is that when we fail because of this bug, the log file will say "unknown edition" as expected, but the cli's error message will say "daemon was not found".
Zheng Li [Fri, 23 Apr 2010 17:54:31 +0000 (18:54 +0100)]
Enable native/bytecode compilation choices, and some Makefile simplification.
* Enable the independent bytecode compilation mode.
Previously this was unfortunately blocked by the hard wired xapi-client.cmxa in the Makefile. Now we should be able to load the bytecode version of xapi client lib in the OCaml interpreter for interactive development with instant feedback. Use $(COMPILE_NATIVE) and $(COMPILE_BYTE) to control these options.
* Change a few "XXX_TARGET = $(if $(equal $(XXX_CONF), yes), $(XXX_OBJ), none)" to "XXX_TARGET = $(if $(XXX_CONF), $(XXX_OBJ))"
The two conditions are not strictly equivalent in semantics. The former holds only if XXX_CONF equals "yes"; the latter holds when XXX_CONF is set to any non-negative string (not "no", "false", "0", none etc.). But I think the result should be the same for XAPI's current Makefiles, with more simplicity and tolerance. A lot more simplification and optimization can be done to these Makefiles; it's just a matter of time. Moreover, XAPI should consider updating to a new version of OMake (which should be better, but is unfortunately not fully compatible with the old one).
David Scott [Fri, 9 Apr 2010 18:57:43 +0000 (19:57 +0100)]
CA-39291: Work around firewalls which kill idle TCP connections by inserting a small empty block into an export every 5s or so.
The failure happens whenever a disk has a lot of zeroes in it: the TCP connection goes idle while the server is scanning for the next non-zero block. Even setting SO_KEEPALIVE on the stunnel sockets and reducing the window probe interval down to 30s didn't fix it. We wish to keep the ability to have a basic client do an export via HTTP GET so we can't add application-level keepalives to the protocol... we must add them to the export itself.
Note this change is backwards compatible. The receiver code expects:
* a common prefix
* a monotonically increasing chunk number
* the first and last blocks to be the same size and included verbatim (even if all zeroes)
* blocks of zeroes the same size as the first block represented as gaps in the increasing chunk number sequence
Therefore including extra files of length 0 in the stream will be ignored provided they
* share the common prefix and chunk numbering scheme
* are not the first or last blocks
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Fri, 28 May 2010 14:25:56 +0000 (15:25 +0100)]
Move the generation of ocaml/util/version.ml into the Makefile so that it can be done before the main build. This unbreaks the rpm build since 'hg id' will fail when run in a plain non-repo directory of sources.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
Ewan Mellor [Thu, 29 Apr 2010 09:22:35 +0000 (10:22 +0100)]
CA-37279: Stunnel error from WLB connect "No route to host"
Understand "No route to host" as an expected error message from stunnel, and turn that into API error WLB_UNKNOWN_HOST if we see it when contacting WLB.
This is two patches, one for xen-api-libs.hg, and one for xen-api.hg.
Signed-off-by: Ewan Mellor <ewan.mellor@eu.citrix.com>
Zheng Li [Tue, 20 Apr 2010 18:18:53 +0000 (19:18 +0100)]
Cope with the stunnel zombie process issue.
Some versions of stunnel (old versions, or some new ones built with some particular Linux distribution versions) have the zombie process issue when called from xe. When it happens, the main stunnel process won't exit for a long time after xe closes its communication channel; it seems to be waiting for its child processes, which are however staying in "defunct" status. The issue was also reported on the server side when stunnel is called by the xapi daemon, so it would be useful to set the Stunnel.disconnect arguments properly there as well. However this demands more work to identify which setting is safe for each occurrence, so I'll leave it for the future. Moreover, currently xe doesn't wait for the second stunnel connection process (HTTP Get/Put); this should also be fixed in the future. The question is: does xe really need to care about the status of stunnel in most cases? If not, why not use the double fork trick everywhere; if yes, an SSL library might be more appropriate than 3rd party tools such as stunnel.
Ian Campbell [Tue, 20 Apr 2010 16:51:08 +0000 (17:51 +0100)]
The OpenVswitch project is trying to standardise on using OpenVswitch rather than simply vswitch so accept both in network.conf and report the mode as openvswitch.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Yang Hongyang [Tue, 20 Apr 2010 13:53:51 +0000 (14:53 +0100)]
Add strict check to VCPUs-params param-key, according to Xen Cloud Platform Administrator's Guide, there's only 'weight', 'cap' or 'mask' param-key available.
Zheng Li [Sat, 10 Apr 2010 09:15:21 +0000 (10:15 +0100)]
Fix a bug in the previous -debug-on-fail patch
I didn't realize that xe would initiate an auxiliary stunnel link in certain operations, so that some of the global variables we stored for the main link might be overwritten unexpectedly. Now the protection is added. What a typical example of how global mutable variables are bad, but I don't really regret having chosen such a solution because it demanded the least code modification, and is hence safer for such complex software in that sense.
David Scott [Mon, 12 Apr 2010 13:44:41 +0000 (14:44 +0100)]
CA-40134: when looking up a domid, always look up the domain by uuid rather than relying on the value in the master's database (which could be out of sync).
In particular if the master is suddenly powercycled then recent domid updates might be lost. This results in VMs which are impossible to shutdown.
Looking up the domain by uuid should be strictly better than using the master's version: both are moving targets (eg over migrate) and so we already rely on the per-VM lock to protect (most) accesses.
The only problem is the code is slightly inefficient: it still has to contact the master to look up the VM's uuid and then has to list all domains on the host to build up a table of uuid -> domid. This can be improved later.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
Rob Hoes [Fri, 9 Apr 2010 09:56:14 +0000 (10:56 +0100)]
CA-38844: CPUID maskability
Distinguish between not maskable ("no") (CPU does not have Intel FlexMigration or AMD Extended Migration), only base features are maskable ("base"), and base+extended features are maskable ("full") in Host.cpu_info:maskable.
Note: this patch should go in together with the flexmigration patch in xen-api-libs.hg.
David Scott [Wed, 7 Apr 2010 20:09:16 +0000 (21:09 +0100)]
CA-39952: explicitly blank {allowed,current}_operations fields in VM exports. The values stored here are redundant sources of potential import failures on older s/w versions.
The operation enums are often extended rendering them unparsable by older software versions. Although we don't guarantee that a new export can be imported on an old host it nevertheless should almost always work.. (apart from this)
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 16:43:56 +0000 (17:43 +0100)]
CA-38463: backend vifs now have proper "device" symlinks in /sys so to tell the difference between them and a real "physical" interface, look to see whether they link to devices/xen-backend/...
This prevents PIF.scan from accidentally introducing vifX.Y as PIFs...
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 16:06:31 +0000 (17:06 +0100)]
CA-39972: try extra hard to kill stunnel in 'service xapissl stop'
Previously we sent one SIGTERM and waited for up to 3 minutes. In a quick test, out of 1000 back-to-back 'service xapissl restart' calls, one took the full 3 minutes, as if the signal was ignored.
Now we send additional SIGTERMs as we go around the loop, one per second. In a quick test, 10000 back-to-back 'service xapissl restart' calls completed without any taking more than a few seconds.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 16:06:30 +0000 (17:06 +0100)]
CA-39887: Totally throw away VM RRD updates when domains are paused.
Previously we kept the RRD data but silenced the dirty_memory signal. This would allow the following sequence:
1. domain created
domain = paused; memory = 0
<-- RRD updated with memory = 0
<-- memory not considered 'dirty' because domain is paused
2. domain built
domain = paused; memory = some interesting value
<-- RRD updated with memory = some interesting value
<-- memory not considered 'dirty' because domain is paused
3. domain unpaused
domain = running; memory = some interesting value
<-- RRD updated with memory = some interesting value
<-- memory not considered 'dirty' because memory value has *not* changed in the RRD
Now we ignore the RRD updates when the domain is paused. This means that, when the domain is finally unpaused, the new memory value will always be considered to have changed.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 10:46:46 +0000 (11:46 +0100)]
CA-39401: on server boot only start VMs with auto_poweron=true which have their affinity set to the local host.
The auto_poweron=true mechanism made sense when all pools were of size 1. It's a bit odd with multi-host pools.. surely these days you would HA protect VMs you care about?
This change makes the auto_poweron mechanism a bit less inexplicable. At some point we ought to merge this with HA.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 10:42:09 +0000 (11:42 +0100)]
CA-34888: the Java bindings tests expect there to be enough memory to install from a Win7 (64 bit) template plus a bit. Bump up the default size of the simulated host in xiu.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 10:36:23 +0000 (11:36 +0100)]
CA-39745: When a failure to hotplug a disk is detected, check whether the underlying device was a physical CDROM with an empty drive. In this case throw HOST_CD_DRIVE_EMPTY.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
David Scott [Wed, 7 Apr 2010 10:35:06 +0000 (11:35 +0100)]
CA-39745: watch /local/domain/%d/error/backend/vbd/%d/%d/error for informative errors such as "2 creating vbd structure". This prevents us timing out after 20 minutes if something goes wrong with a blkback device.
A further patch will be needed to provide some decent error diagnosis.
Signed-off-by: David Scott <dave.scott@eu.citrix.com>
Rob Hoes [Wed, 7 Apr 2010 10:34:09 +0000 (11:34 +0100)]
CA-39461: Fix Network.pool_introduce
The bug in Network.pool_introduce led to problems when adding a host to a pool, when this host has a Network with a name different from any Network on the pool. Networks and PIFs were not properly recreated on the pool in this case.
David Scott [Tue, 6 Apr 2010 09:57:56 +0000 (10:57 +0100)]
CA-39889: hotplug PCI devices into guests in the order specified in the (original) other-config key.
There were two problems:
1. xapi was accidentally reversing the list provided in the other_config key (a side-effect of a fold)
2. when listing the PCI devices present in xenstore, the Device.PCI.list function was passing the device order number back in the place reserved for "PCI bus ID". This caused the devices to be hotplugged in reverse over reboot.
Now the behaviour is:
1. devices are hotplugged in the order they are found in the other_config key
2. the hotplug ordering is stable across start/internal reboot/external reboot
Signed-off-by: David Scott <dave.scott@eu.citrix.com>