ia64/xen-unstable

changeset 7067:93e27f7ca8a8

Merge
author djm@kirby.fc.hp.com
date Thu Sep 29 16:22:02 2005 -0600 (2005-09-29)
parents c0ac925e8f1d 4e1031ce3bc2
children 61b3b357d827
files Makefile buildconfigs/Rules.mk docs/src/user/installation.tex linux-2.6-xen-sparse/arch/ia64/Kconfig linux-2.6-xen-sparse/arch/ia64/xen-mkbuildtree-pre linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_ia64 linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/xen/kernel/reboot.c linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypervisor.h linux-2.6-xen-sparse/include/asm-xen/xenbus.h tools/check/check_hotplug tools/examples/Makefile tools/examples/xmexample.vmx tools/firmware/vmxassist/Makefile tools/firmware/vmxassist/vmxloader.c tools/ioemu/hw/cirrus_vga.c tools/ioemu/hw/pc.c tools/ioemu/hw/vga.c tools/ioemu/target-i386-dm/helper2.c tools/ioemu/vl.c tools/libxc/xc_vmx_build.c tools/libxc/xenguest.h tools/libxc/xg_private.h tools/python/xen/lowlevel/xc/xc.c tools/python/xen/lowlevel/xs/xs.c tools/python/xen/xend/PrettyPrint.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/image.py tools/python/xen/xend/server/DevController.py tools/python/xen/xend/xenstore/xsnode.py tools/python/xen/xend/xenstore/xstransact.py tools/python/xen/xm/main.py tools/xenstore/Makefile tools/xenstore/speedtest.c tools/xenstore/tdb.c tools/xenstore/tdb.h tools/xenstore/testsuite/04rm.test tools/xenstore/testsuite/08transaction.slowtest tools/xenstore/testsuite/08transaction.test tools/xenstore/testsuite/12readonly.test tools/xenstore/testsuite/14complexperms.test tools/xenstore/testsuite/16block-watch-crash.test tools/xenstore/xenstore_client.c tools/xenstore/xenstored.h tools/xenstore/xenstored_core.c tools/xenstore/xenstored_core.h tools/xenstore/xenstored_domain.c 
tools/xenstore/xenstored_transaction.c tools/xenstore/xenstored_transaction.h tools/xenstore/xenstored_watch.c tools/xenstore/xenstored_watch.h tools/xenstore/xs.c tools/xenstore/xs.h tools/xenstore/xs_lib.c tools/xenstore/xs_lib.h tools/xenstore/xs_random.c tools/xenstore/xs_stress.c tools/xenstore/xs_tdb_dump.c tools/xenstore/xs_test.c xen/arch/ia64/asm-offsets.c xen/arch/ia64/vmx/vmx_process.c xen/arch/ia64/xen/process.c xen/arch/ia64/xen/vcpu.c xen/arch/x86/mm.c xen/arch/x86/vmx_vmcs.c xen/common/grant_table.c xen/include/asm-ia64/vcpu.h xen/include/asm-x86/e820.h xen/include/asm-x86/mm.h xen/include/asm-x86/vmx_platform.h xen/include/xen/grant_table.h
line diff
     1.1 --- a/Makefile	Thu Sep 29 13:35:13 2005 -0600
     1.2 +++ b/Makefile	Thu Sep 29 16:22:02 2005 -0600
     1.3 @@ -164,7 +164,7 @@ help:
     1.4  uninstall: DESTDIR=
     1.5  uninstall: D=$(DESTDIR)
     1.6  uninstall:
     1.7 -	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s`
     1.8 +	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s` || true
     1.9  	rm -rf $(D)/etc/init.d/xend*
    1.10  	rm -rf $(D)/etc/hotplug/xen-backend.agent
    1.11  	rm -rf $(D)/var/run/xen* $(D)/var/lib/xen*
     2.1 --- a/buildconfigs/Rules.mk	Thu Sep 29 13:35:13 2005 -0600
     2.2 +++ b/buildconfigs/Rules.mk	Thu Sep 29 16:22:02 2005 -0600
     2.3 @@ -16,7 +16,7 @@ PRISTINE_SRC_PATH	?= .:..
     2.4  vpath pristine-% $(PRISTINE_SRC_PATH)
     2.5  
     2.6  # By default, build Linux with ARCH=xen (overridden by some non arch's)
     2.7 -ifneq ($(ARCH),ia64)
     2.8 +ifneq ($(XEN_TARGET_ARCH),ia64)
     2.9  LINUX_ARCH	?= xen
    2.10  else
    2.11  LINUX_ARCH	?= ia64
     3.1 --- a/docs/src/user/installation.tex	Thu Sep 29 13:35:13 2005 -0600
     3.2 +++ b/docs/src/user/installation.tex	Thu Sep 29 16:22:02 2005 -0600
     3.3 @@ -21,6 +21,9 @@ required if you wish to build from sourc
     3.4  \item [$\dag$] The \path{iproute2} package.
     3.5  \item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt
     3.6        http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl})
     3.7 +\item [$\dag$] The Linux hotplug system\footnote{Available from {\tt
     3.8 +      http://linux-hotplug.sourceforge.net/}} (e.g., \path{/sbin/hotplug}
     3.9 +      and related scripts)
    3.10  \item [$\dag$] An installation of Twisted~v1.3 or
    3.11    above\footnote{Available from {\tt http://www.twistedmatrix.com}}.
    3.12    There may be a binary package available for your distribution;
     4.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Thu Sep 29 13:35:13 2005 -0600
     4.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Thu Sep 29 16:22:02 2005 -0600
     4.3 @@ -1394,9 +1394,7 @@ static void handle_vcpu_hotplug_event(st
     4.4  			return;
     4.5  
     4.6  		/* get the state value */
     4.7 -		xenbus_transaction_start("cpu");
     4.8  		err = xenbus_scanf(dir, "availability", "%s", state);
     4.9 -		xenbus_transaction_end(0);
    4.10  
    4.11  		if (err != 1) {
    4.12  			printk(KERN_ERR
     5.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Thu Sep 29 13:35:13 2005 -0600
     5.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Thu Sep 29 16:22:02 2005 -0600
     5.3 @@ -324,7 +324,7 @@ static void shutdown_handler(struct xenb
     5.4      int err;
     5.5  
     5.6   again:
     5.7 -    err = xenbus_transaction_start("control");
     5.8 +    err = xenbus_transaction_start();
     5.9      if (err)
    5.10  	return;
    5.11      str = (char *)xenbus_read("control", "shutdown", NULL);
    5.12 @@ -337,7 +337,7 @@ static void shutdown_handler(struct xenb
    5.13      xenbus_write("control", "shutdown", "");
    5.14  
    5.15      err = xenbus_transaction_end(0);
    5.16 -    if (err == -ETIMEDOUT) {
    5.17 +    if (err == -EAGAIN) {
    5.18  	kfree(str);
    5.19  	goto again;
    5.20      }
    5.21 @@ -366,7 +366,7 @@ static void sysrq_handler(struct xenbus_
    5.22      int err;
    5.23  
    5.24   again:
    5.25 -    err = xenbus_transaction_start("control");
    5.26 +    err = xenbus_transaction_start();
    5.27      if (err)
    5.28  	return;
    5.29      if (!xenbus_scanf("control", "sysrq", "%c", &sysrq_key)) {
    5.30 @@ -379,7 +379,7 @@ static void sysrq_handler(struct xenbus_
    5.31  	xenbus_printf("control", "sysrq", "%c", '\0');
    5.32  
    5.33      err = xenbus_transaction_end(0);
    5.34 -    if (err == -ETIMEDOUT)
    5.35 +    if (err == -EAGAIN)
    5.36  	goto again;
    5.37  
    5.38      if (sysrq_key != '\0') {
     6.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Sep 29 13:35:13 2005 -0600
     6.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Sep 29 16:22:02 2005 -0600
     6.3 @@ -80,8 +80,9 @@ static void frontend_changed(struct xenb
     6.4  		return;
     6.5  	}
     6.6  
     6.7 +again:
     6.8  	/* Supply the information about the device the frontend needs */
     6.9 -	err = xenbus_transaction_start(be->dev->nodename);
    6.10 +	err = xenbus_transaction_start();
    6.11  	if (err) {
    6.12  		xenbus_dev_error(be->dev, err, "starting transaction");
    6.13  		return;
    6.14 @@ -119,7 +120,15 @@ static void frontend_changed(struct xenb
    6.15  		goto abort;
    6.16  	}
    6.17  
    6.18 -	xenbus_transaction_end(0);
    6.19 +	err = xenbus_transaction_end(0);
    6.20 +	if (err == -EAGAIN)
    6.21 +		goto again;
    6.22 +	if (err) {
    6.23 +		xenbus_dev_error(be->dev, err, "ending transaction",
    6.24 +				 ring_ref, evtchn);
    6.25 +		goto abort;
    6.26 +	}
    6.27 +
    6.28  	xenbus_dev_ok(be->dev);
    6.29  
    6.30  	return;
     7.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Sep 29 13:35:13 2005 -0600
     7.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Sep 29 16:22:02 2005 -0600
     7.3 @@ -572,7 +572,8 @@ static int talk_to_backend(struct xenbus
     7.4  		goto out;
     7.5  	}
     7.6  
     7.7 -	err = xenbus_transaction_start(dev->nodename);
     7.8 +again:
     7.9 +	err = xenbus_transaction_start();
    7.10  	if (err) {
    7.11  		xenbus_dev_error(dev, err, "starting transaction");
    7.12  		goto destroy_blkring;
    7.13 @@ -603,6 +604,8 @@ static int talk_to_backend(struct xenbus
    7.14  
    7.15  	err = xenbus_transaction_end(0);
    7.16  	if (err) {
    7.17 +		if (err == -EAGAIN)
    7.18 +			goto again;
    7.19  		xenbus_dev_error(dev, err, "completing transaction");
    7.20  		goto destroy_blkring;
    7.21  	}
     8.1 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Thu Sep 29 13:35:13 2005 -0600
     8.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Thu Sep 29 16:22:02 2005 -0600
     8.3 @@ -1122,7 +1122,8 @@ static int talk_to_backend(struct xenbus
     8.4  		goto out;
     8.5  	}
     8.6  
     8.7 -	err = xenbus_transaction_start(dev->nodename);
     8.8 +again:
     8.9 +	err = xenbus_transaction_start();
    8.10  	if (err) {
    8.11  		xenbus_dev_error(dev, err, "starting transaction");
    8.12  		goto destroy_ring;
    8.13 @@ -1160,6 +1161,8 @@ static int talk_to_backend(struct xenbus
    8.14  
    8.15  	err = xenbus_transaction_end(0);
    8.16  	if (err) {
    8.17 +		if (err == -EAGAIN)
    8.18 +			goto again;
    8.19  		xenbus_dev_error(dev, err, "completing transaction");
    8.20  		goto destroy_ring;
    8.21  	}
     9.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Thu Sep 29 13:35:13 2005 -0600
     9.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Thu Sep 29 16:22:02 2005 -0600
     9.3 @@ -93,7 +93,8 @@ static void frontend_changed(struct xenb
     9.4  	 * Tell the front-end that we are ready to go -
     9.5  	 * unless something bad happens
     9.6  	 */
     9.7 -	err = xenbus_transaction_start(be->dev->nodename);
     9.8 +again:
     9.9 +	err = xenbus_transaction_start();
    9.10  	if (err) {
    9.11  		xenbus_dev_error(be->dev, err, "starting transaction");
    9.12  		return;
    9.13 @@ -127,7 +128,14 @@ static void frontend_changed(struct xenb
    9.14  		goto abort;
    9.15  	}
    9.16  
    9.17 -	xenbus_transaction_end(0);
    9.18 +	err = xenbus_transaction_end(0);
    9.19 +	if (err == -EAGAIN)
    9.20 +		goto again;
    9.21 +	if (err) {
    9.22 +		xenbus_dev_error(be->dev, err, "end of transaction");
    9.23 +		goto abort;
    9.24 +	}
    9.25 +
    9.26  	xenbus_dev_ok(be->dev);
    9.27  	return;
    9.28  abort:
    10.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Thu Sep 29 13:35:13 2005 -0600
    10.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Thu Sep 29 16:22:02 2005 -0600
    10.3 @@ -331,7 +331,8 @@ static int talk_to_backend(struct xenbus
    10.4  		goto out;
    10.5  	}
    10.6  
    10.7 -	err = xenbus_transaction_start(dev->nodename);
    10.8 +again:
    10.9 +	err = xenbus_transaction_start();
   10.10  	if (err) {
   10.11  		xenbus_dev_error(dev, err, "starting transaction");
   10.12  		goto destroy_tpmring;
   10.13 @@ -363,6 +364,8 @@ static int talk_to_backend(struct xenbus
   10.14  	}
   10.15  
   10.16  	err = xenbus_transaction_end(0);
   10.17 +	if (err == -EAGAIN)
   10.18 +		goto again;
   10.19  	if (err) {
   10.20  		xenbus_dev_error(dev, err, "completing transaction");
   10.21  		goto destroy_tpmring;
    11.1 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Thu Sep 29 13:35:13 2005 -0600
    11.2 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Thu Sep 29 16:22:02 2005 -0600
    11.3 @@ -287,12 +287,11 @@ EXPORT_SYMBOL(xenbus_rm);
    11.4  
    11.5  /* Start a transaction: changes by others will not be seen during this
    11.6   * transaction, and changes will not be visible to others until end.
    11.7 - * Transaction only applies to the given subtree.
    11.8   * You can only have one transaction at any time.
    11.9   */
   11.10 -int xenbus_transaction_start(const char *subtree)
   11.11 +int xenbus_transaction_start(void)
   11.12  {
   11.13 -	return xs_error(xs_single(XS_TRANSACTION_START, subtree, NULL));
   11.14 +	return xs_error(xs_single(XS_TRANSACTION_START, "", NULL));
   11.15  }
   11.16  EXPORT_SYMBOL(xenbus_transaction_start);
   11.17  
    12.1 --- a/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Thu Sep 29 13:35:13 2005 -0600
    12.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Thu Sep 29 16:22:02 2005 -0600
    12.3 @@ -87,7 +87,7 @@ int xenbus_write(const char *dir, const 
    12.4  int xenbus_mkdir(const char *dir, const char *node);
    12.5  int xenbus_exists(const char *dir, const char *node);
    12.6  int xenbus_rm(const char *dir, const char *node);
    12.7 -int xenbus_transaction_start(const char *subtree);
    12.8 +int xenbus_transaction_start(void);
    12.9  int xenbus_transaction_end(int abort);
   12.10  
   12.11  /* Single read and scanf: returns -errno or num scanned if > 0. */
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/tools/check/check_hotplug	Thu Sep 29 16:22:02 2005 -0600
    13.3 @@ -0,0 +1,10 @@
    13.4 +#!/bin/bash
    13.5 +# CHECK-INSTALL
    13.6 +
    13.7 +function error {
    13.8 +   echo
    13.9 +   echo '  *** Check for the hotplug scripts (hotplug) FAILED'
   13.10 +   exit 1
   13.11 +}
   13.12 +
   13.13 +which hotplug 1>/dev/null 2>&1 || error
    14.1 --- a/tools/examples/Makefile	Thu Sep 29 13:35:13 2005 -0600
    14.2 +++ b/tools/examples/Makefile	Thu Sep 29 16:22:02 2005 -0600
    14.3 @@ -25,19 +25,13 @@ XEN_SCRIPTS += block-phy
    14.4  XEN_SCRIPTS += block-file
    14.5  XEN_SCRIPTS += block-enbd
    14.6  
    14.7 -# no 64-bit specifics in mem-map.sxp
    14.8 -# so place in /usr/lib, not /usr/lib64
    14.9 -XEN_BOOT_DIR = /usr/lib/xen/boot
   14.10 -XEN_BOOT = mem-map.sxp
   14.11 -
   14.12  XEN_HOTPLUG_DIR = /etc/hotplug
   14.13  XEN_HOTPLUG_SCRIPTS = xen-backend.agent
   14.14  
   14.15  all:
   14.16  build:
   14.17  
   14.18 -install: all install-initd install-configs install-scripts install-boot \
   14.19 -	 install-hotplug
   14.20 +install: all install-initd install-configs install-scripts install-hotplug
   14.21  
   14.22  install-initd:
   14.23  	[ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
   14.24 @@ -62,14 +56,6 @@ install-scripts:
   14.25  	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
   14.26  	done
   14.27  
   14.28 -install-boot:
   14.29 -	[ -d $(DESTDIR)$(XEN_BOOT_DIR) ] || \
   14.30 -		$(INSTALL_DIR) $(DESTDIR)$(XEN_BOOT_DIR)
   14.31 -	for i in $(XEN_BOOT); \
   14.32 -	    do [ -a $(DESTDIR)$(XEN_BOOT_DIR)/$$i ] || \
   14.33 -	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_BOOT_DIR); \
   14.34 -	done
   14.35 -
   14.36  install-hotplug:
   14.37  	[ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
   14.38  		$(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR)
    15.1 --- a/tools/examples/mem-map.sxp	Thu Sep 29 13:35:13 2005 -0600
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,10 +0,0 @@
    15.4 -(memmap
    15.5 - (0000000000000000  000000000009f800 "AddressRangeMemory"   WB)
    15.6 - (000000000009f800  00000000000a0000 "AddressRangeReserved" UC)
    15.7 - (00000000000a0000  00000000000bffff "AddressRangeIO"       UC)
    15.8 - (00000000000f0000  0000000000100000 "AddressRangeReserved" UC)
    15.9 - (0000000000100000  0000000008000000 "AddressRangeMemory"   WB)
   15.10 - (0000000007fff000  0000000008000000 "AddressRangeShared"   WB)
   15.11 - (0000000008000000  0000000008003000 "AddressRangeNVS"      UC)
   15.12 - (0000000008003000  000000000800d000 "AddressRangeACPI"     WB)
   15.13 - (00000000fec00000  0000000100000000 "AddressRangeIO"       UC))
    16.1 --- a/tools/examples/xmexample.vmx	Thu Sep 29 13:35:13 2005 -0600
    16.2 +++ b/tools/examples/xmexample.vmx	Thu Sep 29 16:22:02 2005 -0600
    16.3 @@ -60,9 +60,6 @@ disk = [ 'file:/var/images/min-el3-i386.
    16.4  # New stuff
    16.5  device_model = '/usr/' + arch_libdir + '/xen/bin/qemu-dm'
    16.6  
    16.7 -# Advanced users only. Don't touch if you don't know what you're doing
    16.8 -memmap = '/usr/lib/xen/boot/mem-map.sxp'
    16.9 -
   16.10  #-----------------------------------------------------------------------------
   16.11  # Disk image for 
   16.12  #cdrom=
    17.1 --- a/tools/firmware/vmxassist/Makefile	Thu Sep 29 13:35:13 2005 -0600
    17.2 +++ b/tools/firmware/vmxassist/Makefile	Thu Sep 29 16:22:02 2005 -0600
    17.3 @@ -44,7 +44,7 @@ all: vmxloader
    17.4  vmxloader: roms.h vmxloader.c acpi.h acpi_madt.c
    17.5  	${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c -c acpi_madt.c
    17.6  	$(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o acpi_madt.o
    17.7 -	objcopy --change-addresses=0xC0000000 vmxloader.tmp vmxloader
    17.8 +	objcopy vmxloader.tmp vmxloader
    17.9  	rm -f vmxloader.tmp
   17.10  
   17.11  vmxassist.bin: vmxassist.ld ${OBJECTS}
    18.1 --- a/tools/firmware/vmxassist/vmxloader.c	Thu Sep 29 13:35:13 2005 -0600
    18.2 +++ b/tools/firmware/vmxassist/vmxloader.c	Thu Sep 29 16:22:02 2005 -0600
    18.3 @@ -34,28 +34,39 @@ int acpi_madt_update(unsigned char* acpi
    18.4  /*
    18.5   * C runtime start off
    18.6   */
    18.7 -asm("					\n\
    18.8 -	.text				\n\
    18.9 -	.globl	_start			\n\
   18.10 -_start:					\n\
   18.11 -	cli				\n\
   18.12 -	movl	$stack_top, %esp	\n\
   18.13 -	movl	%esp, %ebp		\n\
   18.14 -	call    main			\n\
   18.15 -	jmp	halt			\n\
   18.16 -					\n\
   18.17 -	.globl	halt			\n\
   18.18 -halt:					\n\
   18.19 -	sti				\n\
   18.20 -	jmp	.			\n\
   18.21 -					\n\
   18.22 -	.bss				\n\
   18.23 -	.align	8			\n\
   18.24 -	.globl	stack, stack_top	\n\
   18.25 -stack:					\n\
   18.26 -	.skip	0x4000			\n\
   18.27 -stack_top:				\n\
   18.28 -");
   18.29 +asm(
   18.30 +"	.text				\n"
   18.31 +"	.globl	_start			\n"
   18.32 +"_start:				\n"
   18.33 +"	cld				\n"
   18.34 +"	cli				\n"
   18.35 +"	lgdt	gdt_desr		\n"
   18.36 +"	movl	$stack_top, %esp	\n"
   18.37 +"	movl	%esp, %ebp		\n"
   18.38 +"	call	main			\n"
   18.39 +"	jmp	halt			\n"
   18.40 +"					\n"
   18.41 +"gdt_desr:				\n"
   18.42 +"	.word	gdt_end - gdt - 1	\n"
   18.43 +"	.long	gdt			\n"
   18.44 +"					\n"
   18.45 +"	.align	8			\n"
   18.46 +"gdt:					\n"
   18.47 +"	.quad	0x0000000000000000	\n"
   18.48 +"	.quad	0x00CF92000000FFFF	\n"
   18.49 +"	.quad	0x00CF9A000000FFFF	\n"
   18.50 +"gdt_end:				\n"
   18.51 +"					\n"
   18.52 +"halt:					\n"
   18.53 +"	sti				\n"
   18.54 +"	jmp	.			\n"
   18.55 +"					\n"
   18.56 +"	.bss				\n"
   18.57 +"	.align	8			\n"
   18.58 +"stack:					\n"
   18.59 +"	.skip	0x4000			\n"
   18.60 +"stack_top:				\n"
   18.61 +);
   18.62  
   18.63  void *
   18.64  memcpy(void *dest, const void *src, unsigned n)
   18.65 @@ -95,7 +106,7 @@ cirrus_check(void)
   18.66  }
   18.67  
   18.68  int
   18.69 -main()
   18.70 +main(void)
   18.71  {
   18.72  	puts("VMXAssist Loader\n");
   18.73  	puts("Loading ROMBIOS ...\n");
    19.1 --- a/tools/ioemu/hw/cirrus_vga.c	Thu Sep 29 13:35:13 2005 -0600
    19.2 +++ b/tools/ioemu/hw/cirrus_vga.c	Thu Sep 29 16:22:02 2005 -0600
    19.3 @@ -231,6 +231,8 @@ typedef struct CirrusVGAState {
    19.4      int cirrus_linear_io_addr;
    19.5      int cirrus_linear_bitblt_io_addr;
    19.6      int cirrus_mmio_io_addr;
    19.7 +    unsigned long cirrus_lfb_addr;
    19.8 +    unsigned long cirrus_lfb_end;
    19.9      uint32_t cirrus_addr_mask;
   19.10      uint32_t linear_mmio_mask;
   19.11      uint8_t cirrus_shadow_gr0;
   19.12 @@ -2447,6 +2449,10 @@ static void cirrus_update_memory_access(
   19.13  {
   19.14      unsigned mode;
   19.15  
   19.16 +    extern void unset_vram_mapping(unsigned long addr, unsigned long end);
   19.17 +    extern void set_vram_mapping(unsigned long addr, unsigned long end);
   19.18 +    extern int vga_accelerate;
   19.19 +
   19.20      if ((s->sr[0x17] & 0x44) == 0x44) {
   19.21          goto generic_io;
   19.22      } else if (s->cirrus_srcptr != s->cirrus_srcptr_end) {
   19.23 @@ -2454,17 +2460,21 @@ static void cirrus_update_memory_access(
   19.24      } else {
   19.25  	if ((s->gr[0x0B] & 0x14) == 0x14) {
   19.26              goto generic_io;
   19.27 -	} else if (s->gr[0x0B] & 0x02) {
   19.28 -            goto generic_io;
   19.29 -        }
   19.30 -        
   19.31 -	mode = s->gr[0x05] & 0x7;
   19.32 -	if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   19.33 +    } else if (s->gr[0x0B] & 0x02) {
   19.34 +        goto generic_io;
   19.35 +    }
   19.36 +
   19.37 +    mode = s->gr[0x05] & 0x7;
   19.38 +    if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   19.39 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   19.40 +                set_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   19.41              s->cirrus_linear_write[0] = cirrus_linear_mem_writeb;
   19.42              s->cirrus_linear_write[1] = cirrus_linear_mem_writew;
   19.43              s->cirrus_linear_write[2] = cirrus_linear_mem_writel;
   19.44          } else {
   19.45          generic_io:
   19.46 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   19.47 +                 unset_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   19.48              s->cirrus_linear_write[0] = cirrus_linear_writeb;
   19.49              s->cirrus_linear_write[1] = cirrus_linear_writew;
   19.50              s->cirrus_linear_write[2] = cirrus_linear_writel;
   19.51 @@ -3058,6 +3068,8 @@ static void cirrus_pci_lfb_map(PCIDevice
   19.52      /* XXX: add byte swapping apertures */
   19.53      cpu_register_physical_memory(addr, s->vram_size,
   19.54  				 s->cirrus_linear_io_addr);
   19.55 +    s->cirrus_lfb_addr = addr;
   19.56 +    s->cirrus_lfb_end = addr + VGA_RAM_SIZE;
   19.57      cpu_register_physical_memory(addr + 0x1000000, 0x400000,
   19.58  				 s->cirrus_linear_bitblt_io_addr);
   19.59  }
    20.1 --- a/tools/ioemu/hw/pc.c	Thu Sep 29 13:35:13 2005 -0600
    20.2 +++ b/tools/ioemu/hw/pc.c	Thu Sep 29 16:22:02 2005 -0600
    20.3 @@ -385,6 +385,7 @@ void pc_init(int ram_size, int vga_ram_s
    20.4      unsigned long bios_offset, vga_bios_offset;
    20.5      int bios_size, isa_bios_size;
    20.6      PCIBus *pci_bus;
    20.7 +    extern void * shared_vram;
    20.8      
    20.9      linux_boot = (kernel_filename != NULL);
   20.10  
   20.11 @@ -511,14 +512,14 @@ void pc_init(int ram_size, int vga_ram_s
   20.12      if (cirrus_vga_enabled) {
   20.13          if (pci_enabled) {
   20.14              pci_cirrus_vga_init(pci_bus, 
   20.15 -                                ds, phys_ram_base + ram_size, ram_size, 
   20.16 +                                ds, shared_vram, ram_size, 
   20.17                                  vga_ram_size);
   20.18          } else {
   20.19 -            isa_cirrus_vga_init(ds, phys_ram_base + ram_size, ram_size, 
   20.20 +            isa_cirrus_vga_init(ds, shared_vram, ram_size, 
   20.21                                  vga_ram_size);
   20.22          }
   20.23      } else {
   20.24 -        vga_initialize(pci_bus, ds, phys_ram_base + ram_size, ram_size, 
   20.25 +        vga_initialize(pci_bus, ds, shared_vram, ram_size, 
   20.26                         vga_ram_size);
   20.27      }
   20.28  
    21.1 --- a/tools/ioemu/hw/vga.c	Thu Sep 29 13:35:13 2005 -0600
    21.2 +++ b/tools/ioemu/hw/vga.c	Thu Sep 29 16:22:02 2005 -0600
    21.3 @@ -1568,6 +1568,8 @@ void vga_update_display(void)
    21.4              s->graphic_mode = graphic_mode;
    21.5              full_update = 1;
    21.6          }
    21.7 +
    21.8 +        full_update = 1;
    21.9          switch(graphic_mode) {
   21.10          case GMODE_TEXT:
   21.11              vga_draw_text(s, full_update);
   21.12 @@ -1848,6 +1850,7 @@ void vga_common_init(VGAState *s, Displa
   21.13                       unsigned long vga_ram_offset, int vga_ram_size)
   21.14  {
   21.15      int i, j, v, b;
   21.16 +    extern void* shared_vram;
   21.17  
   21.18      for(i = 0;i < 256; i++) {
   21.19          v = 0;
   21.20 @@ -1876,7 +1879,7 @@ void vga_common_init(VGAState *s, Displa
   21.21  
   21.22      /* qemu's vga mem is not detached from phys_ram_base and can cause DM abort
   21.23       * when guest write vga mem, so allocate a new one */
   21.24 -    s->vram_ptr = qemu_mallocz(vga_ram_size);
   21.25 +    s->vram_ptr = shared_vram;
   21.26  
   21.27      s->vram_offset = vga_ram_offset;
   21.28      s->vram_size = vga_ram_size;
    22.1 --- a/tools/ioemu/target-i386-dm/helper2.c	Thu Sep 29 13:35:13 2005 -0600
    22.2 +++ b/tools/ioemu/target-i386-dm/helper2.c	Thu Sep 29 16:22:02 2005 -0600
    22.3 @@ -54,6 +54,8 @@
    22.4  #include "exec-all.h"
    22.5  #include "vl.h"
    22.6  
    22.7 +void *shared_vram;
    22.8 +
    22.9  shared_iopage_t *shared_page = NULL;
   22.10  extern int reset_requested;
   22.11  
    23.1 --- a/tools/ioemu/vl.c	Thu Sep 29 13:35:13 2005 -0600
    23.2 +++ b/tools/ioemu/vl.c	Thu Sep 29 16:22:02 2005 -0600
    23.3 @@ -134,6 +134,7 @@ int pci_enabled = 1;
    23.4  int prep_enabled = 0;
    23.5  int rtc_utc = 1;
    23.6  int cirrus_vga_enabled = 1;
    23.7 +int vga_accelerate = 1;
    23.8  int graphic_width = 800;
    23.9  int graphic_height = 600;
   23.10  int graphic_depth = 15;
   23.11 @@ -141,6 +142,12 @@ int full_screen = 0;
   23.12  TextConsole *vga_console;
   23.13  CharDriverState *serial_hds[MAX_SERIAL_PORTS];
   23.14  int xc_handle;
   23.15 +unsigned long *vgapage_array;
   23.16 +unsigned long *freepage_array;
   23.17 +unsigned long free_pages;
   23.18 +void *vtop_table;
   23.19 +unsigned long toptab;
   23.20 +unsigned long vgaram_pages;
   23.21  
   23.22  /***********************************************************/
   23.23  /* x86 ISA bus support */
   23.24 @@ -2162,6 +2169,7 @@ void help(void)
   23.25             "-isa            simulate an ISA-only system (default is PCI system)\n"
   23.26             "-std-vga        simulate a standard VGA card with VESA Bochs Extensions\n"
   23.27             "                (default is CL-GD5446 PCI VGA)\n"
   23.28 +           "-vgaacc [0|1]   1 to accelerate CL-GD5446 speed, default is 1\n"
   23.29  #endif
   23.30             "-loadvm file    start right away with a saved state (loadvm in monitor)\n"
   23.31             "\n"
   23.32 @@ -2251,6 +2259,7 @@ enum {
   23.33      QEMU_OPTION_serial,
   23.34      QEMU_OPTION_loadvm,
   23.35      QEMU_OPTION_full_screen,
   23.36 +    QEMU_OPTION_vgaacc,
   23.37  };
   23.38  
   23.39  typedef struct QEMUOption {
   23.40 @@ -2327,6 +2336,7 @@ const QEMUOption qemu_options[] = {
   23.41      { "pci", 0, QEMU_OPTION_pci },
   23.42      { "nic-pcnet", 0, QEMU_OPTION_nic_pcnet },
   23.43      { "cirrusvga", 0, QEMU_OPTION_cirrusvga },
   23.44 +    { "vgaacc", HAS_ARG, QEMU_OPTION_vgaacc },
   23.45      { NULL },
   23.46  };
   23.47  
   23.48 @@ -2343,6 +2353,177 @@ static uint8_t *signal_stack;
   23.49  #define NET_IF_USER  1
   23.50  #define NET_IF_DUMMY 2
   23.51  
   23.52 +#include <xg_private.h>
   23.53 +
   23.54 +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   23.55 +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   23.56 +
   23.57 +#ifdef __i386__
   23.58 +#define _LEVEL_3_ 0
   23.59 +#else
   23.60 +#define _LEVEL_3_ 1
   23.61 +#endif
   23.62 +
   23.63 +#if _LEVEL_3_
   23.64 +#define L3_PROT (_PAGE_PRESENT)
   23.65 +#define L1_PAGETABLE_ENTRIES    512
   23.66 +#else
   23.67 +#define L1_PAGETABLE_ENTRIES    1024
   23.68 +#endif
   23.69 +
   23.70 +inline int
   23.71 +get_vl2_table(unsigned long count, unsigned long start)
   23.72 +{
   23.73 +#if _LEVEL_3_
   23.74 +    return ((start + (count << PAGE_SHIFT)) >> L3_PAGETABLE_SHIFT) & 0x3;
   23.75 +#else
   23.76 +    return 0;
   23.77 +#endif
   23.78 +}
   23.79 +
   23.80 +int
   23.81 +setup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long  *mem_page_array, unsigned long *page_table_array, unsigned long v_start, unsigned long v_end)
   23.82 +{
   23.83 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
   23.84 +    l2_pgentry_t *vl2tab[4], *vl2e=NULL, *vl2_table = NULL;
   23.85 +    unsigned long l1tab;
   23.86 +    unsigned long ppt_alloc = 0;
   23.87 +    unsigned long count;
   23.88 +    int i = 0;
   23.89 +#if _LEVEL_3_
   23.90 +    l3_pgentry_t *vl3tab = NULL;
   23.91 +    unsigned long l2tab;
   23.92 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
   23.93 +                                        PROT_READ|PROT_WRITE, 
   23.94 +                                        toptab >> PAGE_SHIFT)) == NULL )
   23.95 +        goto error_out;
   23.96 +    for (i = 0; i < 4 ; i++) {
   23.97 +        l2tab = vl3tab[i] & PAGE_MASK;
   23.98 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   23.99 +          PROT_READ|PROT_WRITE,
  23.100 +          l2tab >> PAGE_SHIFT);
  23.101 +        if(vl2tab[i] == NULL)
  23.102 +            goto error_out;
  23.103 +    }
  23.104 +    munmap(vl3tab, PAGE_SIZE);
  23.105 +    vl3tab = NULL;
  23.106 +#else
  23.107 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  23.108 +                                           PROT_READ|PROT_WRITE, 
  23.109 +                                           toptab >> PAGE_SHIFT)) == NULL )
  23.110 +        goto error_out;
  23.111 +#endif
  23.112 +
  23.113 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
  23.114 +    {
  23.115 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  23.116 +        {
  23.117 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  23.118 +            vl2e = &vl2_table[l2_table_offset(
  23.119 +                v_start + (count << PAGE_SHIFT))];
  23.120 +
  23.121 +            l1tab = page_table_array[ppt_alloc++] << PAGE_SHIFT;
  23.122 +            if ( vl1tab != NULL )
  23.123 +                munmap(vl1tab, PAGE_SIZE);
  23.124 +
  23.125 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  23.126 +                                                PROT_READ|PROT_WRITE,
  23.127 +                                                l1tab >> PAGE_SHIFT)) == NULL )
  23.128 +            {
  23.129 +                goto error_out;
  23.130 +            }
  23.131 +            memset(vl1tab, 0, PAGE_SIZE);
  23.132 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  23.133 +            *vl2e = l1tab | L2_PROT;
  23.134 +        }
  23.135 +
  23.136 +        *vl1e = (mem_page_array[count] << PAGE_SHIFT) | L1_PROT;
  23.137 +        vl1e++;
  23.138 +    }
  23.139 +error_out:
  23.140 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
  23.141 +    for(i = 0; i < 4; i++)
  23.142 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  23.143 +    return ppt_alloc;
  23.144 +}
  23.145 +
  23.146 +void /* Clear the L1 entries covering [v_start, v_end) and the L2 entries pointing at the L1 tables visited. */
  23.147 +unsetup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long v_start, unsigned long v_end)
  23.148 +{   /* toptab: address of the top-level table. Returns nothing; a failed foreign map stops the walk early. */
  23.149 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
  23.150 +    l2_pgentry_t *vl2tab[4] = {NULL, NULL, NULL, NULL}, *vl2e=NULL, *vl2_table = NULL; /* NULL-filled: error_out unmaps every non-NULL slot */
  23.151 +    unsigned long l1tab;
  23.152 +    unsigned long count;
  23.153 +    int i = 0;
  23.154 +#if _LEVEL_3_
  23.155 +    l3_pgentry_t *vl3tab = NULL;
  23.156 +    unsigned long l2tab;
  23.157 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  23.158 +                                        PROT_READ|PROT_WRITE,
  23.159 +                                        toptab >> PAGE_SHIFT)) == NULL )
  23.160 +        goto error_out;
  23.161 +    for (i = 0; i < 4 ; i ++){ /* map the L2 table behind each of the four top-level entries */
  23.162 +        l2tab = vl3tab[i] & PAGE_MASK;
  23.163 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  23.164 +          PROT_READ|PROT_WRITE,
  23.165 +          l2tab >> PAGE_SHIFT);
  23.166 +        if(vl2tab[i] == NULL)
  23.167 +            break; /* leave through the munmap below so vl3tab is not leaked */
  23.168 +    }
  23.169 +    munmap(vl3tab, PAGE_SIZE);
  23.170 +    if (i < 4) goto error_out; /* one of the L2 maps failed */
  23.171 +#else
  23.172 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  23.173 +                                        PROT_READ|PROT_WRITE,
  23.174 +                                        toptab >> PAGE_SHIFT)) == NULL )
  23.175 +        goto error_out;
  23.176 +#endif
  23.177 +
  23.178 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ){
  23.179 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) /* true while vl1e is NULL or has advanced to a page boundary */
  23.180 +        {
  23.181 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  23.182 +            vl2e = &vl2_table[l2_table_offset(v_start + (count << PAGE_SHIFT))];
  23.183 +            l1tab = *vl2e & PAGE_MASK;
  23.184 +
  23.185 +            if(l1tab == 0) /* L2 slot holds no L1 table: nothing to clear for this page */
  23.186 +                continue;
  23.187 +            if ( vl1tab != NULL )
  23.188 +                munmap(vl1tab, PAGE_SIZE);
  23.189 +
  23.190 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  23.191 +                      PROT_READ|PROT_WRITE,
  23.192 +                      l1tab >> PAGE_SHIFT)) == NULL )
  23.193 +            {
  23.194 +                goto error_out;
  23.195 +            }
  23.196 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  23.197 +            *vl2e = 0; /* drop the L2 reference to this L1 table */
  23.198 +        }
  23.199 +
  23.200 +        *vl1e = 0;
  23.201 +        vl1e++;
  23.202 +    }
  23.203 +error_out: /* shared exit: also reached on success */
  23.204 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
  23.205 +    for(i = 0; i < 4; i++)
  23.206 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  23.207 +}
  23.208 +
  23.209 +void set_vram_mapping(unsigned long addr, unsigned long end) /* Run setup_mapping() over VGA_RAM_SIZE bytes starting at addr, passing vgapage_array and freepage_array. */
  23.210 +{
  23.211 +    end = addr + VGA_RAM_SIZE; /* the end argument passed in is discarded */
  23.212 +    setup_mapping(xc_handle, domid, toptab,
  23.213 +      vgapage_array, freepage_array, addr, end); /* return value (setup_mapping's ppt_alloc) is ignored */
  23.214 +}
  23.215 +
  23.216 +void unset_vram_mapping(unsigned long addr, unsigned long end) /* Run unsetup_mapping() over VGA_RAM_SIZE bytes starting at addr. */
  23.217 +{
  23.218 +    end = addr + VGA_RAM_SIZE; /* the end argument passed in is discarded */
  23.219 +    /* FIXME Flush the shadow page */
  23.220 +    unsetup_mapping(xc_handle, domid, toptab, addr, end);
  23.221 +}
  23.222 +
  23.223  int main(int argc, char **argv)
  23.224  {
  23.225  #ifdef CONFIG_GDBSTUB
  23.226 @@ -2366,8 +2547,9 @@ int main(int argc, char **argv)
  23.227      char serial_devices[MAX_SERIAL_PORTS][128];
  23.228      int serial_device_index;
  23.229      const char *loadvm = NULL;
  23.230 -    unsigned long nr_pages, *page_array;
  23.231 +    unsigned long nr_pages, extra_pages, ram_pages, *page_array;
  23.232      extern void *shared_page;
  23.233 +    extern void *shared_vram;
  23.234      /* change the qemu-dm to daemon, just like bochs dm */
  23.235  //    daemon(0, 0);
  23.236      
  23.237 @@ -2674,6 +2856,17 @@ int main(int argc, char **argv)
  23.238              case QEMU_OPTION_cirrusvga:
  23.239                  cirrus_vga_enabled = 1;
  23.240                  break;
  23.241 +            case QEMU_OPTION_vgaacc:
  23.242 +                {
  23.243 +                    const char *p;
  23.244 +                    p = optarg;
  23.245 +                    vga_accelerate = strtol(p, (char **)&p, 0);
  23.246 +                    if (*p != '\0') {
  23.247 +                        fprintf(stderr, "qemu: invalid vgaacc option\n");
  23.248 +                        exit(1);
  23.249 +                    }
  23.250 +                    break;
  23.251 +                }
  23.252              case QEMU_OPTION_std_vga:
  23.253                  cirrus_vga_enabled = 0;
  23.254                  break;
  23.255 @@ -2803,12 +2996,25 @@ int main(int argc, char **argv)
  23.256      /* init the memory */
  23.257      phys_ram_size = ram_size + vga_ram_size + bios_size;
  23.258  
  23.259 -    #define PAGE_SHIFT 12
  23.260 -    #define PAGE_SIZE  (1 << PAGE_SHIFT)
  23.261 -
  23.262 -    nr_pages = ram_size/PAGE_SIZE;
  23.263 +    ram_pages = ram_size/PAGE_SIZE;
  23.264 +    vgaram_pages =  (vga_ram_size -1)/PAGE_SIZE + 1;
  23.265 +    free_pages = vgaram_pages / L1_PAGETABLE_ENTRIES;
  23.266 +    extra_pages = vgaram_pages + free_pages;
  23.267 +
  23.268      xc_handle = xc_interface_open();
  23.269 -    
  23.270 +
  23.271 +    xc_dominfo_t info;
  23.272 +    xc_domain_getinfo(xc_handle, domid, 1, &info);
  23.273 +
  23.274 +    nr_pages = info.nr_pages + extra_pages;
  23.275 +
  23.276 +    if ( xc_domain_setmaxmem(xc_handle, domid,
  23.277 +            (nr_pages) * PAGE_SIZE/1024 ) != 0)
  23.278 +    {
  23.279 +        perror("set maxmem");
  23.280 +        exit(-1);
  23.281 +    }
  23.282 +   
  23.283      if ( (page_array = (unsigned long *)
  23.284  	  malloc(nr_pages * sizeof(unsigned long))) == NULL)
  23.285      {
  23.286 @@ -2816,6 +3022,12 @@ int main(int argc, char **argv)
  23.287  	    exit(-1);
  23.288      }
  23.289  
  23.290 +    if (xc_domain_memory_increase_reservation(xc_handle, domid, 
  23.291 +          extra_pages , 0, 0, NULL) != 0) {
  23.292 +        perror("increase reservation");
  23.293 +        exit(-1);
  23.294 +    }
  23.295 +
  23.296      if ( xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages )
  23.297      {
  23.298  	    perror("xc_get_pfn_list");
  23.299 @@ -2825,15 +3037,36 @@ int main(int argc, char **argv)
  23.300      if ((phys_ram_base =  xc_map_foreign_batch(xc_handle, domid,
  23.301  						 PROT_READ|PROT_WRITE,
  23.302  						 page_array,
  23.303 -						 nr_pages - 1)) == 0) {
  23.304 +						 ram_pages - 1)) == 0) {
  23.305  	    perror("xc_map_foreign_batch");
  23.306  	    exit(-1);
  23.307      }
  23.308  
  23.309      shared_page = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  23.310  				       PROT_READ|PROT_WRITE,
  23.311 -				       page_array[nr_pages - 1]);
  23.312 -
  23.313 + 				       page_array[ram_pages - 1]);
  23.314 +
  23.315 +    vgapage_array = &page_array[nr_pages - vgaram_pages];
  23.316 +
  23.317 +    if ((shared_vram =  xc_map_foreign_batch(xc_handle, domid,
  23.318 + 						 PROT_READ|PROT_WRITE,
  23.319 + 						 vgapage_array,
  23.320 + 						 vgaram_pages)) == 0) {
  23.321 + 	    perror("xc_map_foreign_batch vgaram ");
  23.322 + 	    exit(-1);
  23.323 +     }
  23.324 +
  23.325 +
  23.326 +
  23.327 +    memset(shared_vram, 0, vgaram_pages * PAGE_SIZE);
  23.328 +    toptab = page_array[ram_pages] << PAGE_SHIFT;
  23.329 +
  23.330 +    vtop_table = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  23.331 +				       PROT_READ|PROT_WRITE,
  23.332 + 				       page_array[ram_pages]);
  23.333 +
  23.334 +    freepage_array = &page_array[nr_pages - extra_pages];
  23.335 + 
  23.336  
  23.337      fprintf(logfile, "shared page at pfn:%lx, mfn: %lx\n", (nr_pages-1), 
  23.338             (page_array[nr_pages - 1]));
    24.1 --- a/tools/libxc/linux_boot_params.h	Thu Sep 29 13:35:13 2005 -0600
    24.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.3 @@ -1,166 +0,0 @@
    24.4 -#ifndef __LINUX_BOOT_PARAMS_H__
    24.5 -#define __LINUX_BOOT_PARAMS_H__
    24.6 -
    24.7 -#include <asm/types.h>
    24.8 -
    24.9 -#define E820MAX	32
   24.10 -
   24.11 -struct mem_map {
   24.12 -    int nr_map;
   24.13 -    struct entry {
   24.14 -        u64 addr;	/* start of memory segment */
   24.15 -        u64 size;	/* size of memory segment */
   24.16 -        u32 type;		/* type of memory segment */
   24.17 -#define E820_RAM        1
   24.18 -#define E820_RESERVED   2
   24.19 -#define E820_ACPI       3 /* usable as RAM once ACPI tables have been read */
   24.20 -#define E820_NVS        4
   24.21 -#define E820_IO         16
   24.22 -#define E820_SHARED     17
   24.23 -#define E820_XENSTORE   18
   24.24 -
   24.25 -        u32 caching_attr;    /* used by hypervisor */
   24.26 -#define MEMMAP_UC	0
   24.27 -#define MEMMAP_WC	1
   24.28 -#define MEMMAP_WT	4
   24.29 -#define MEMMAP_WP	5
   24.30 -#define MEMMAP_WB	6
   24.31 -
   24.32 -    }map[E820MAX];
   24.33 -};
   24.34 -
   24.35 -struct e820entry {
   24.36 -	u64 addr;	/* start of memory segment */
   24.37 -	u64 size;	/* size of memory segment */
   24.38 -	u32 type;	/* type of memory segment */
   24.39 -}__attribute__((packed));
   24.40 -
   24.41 -struct e820map {
   24.42 -    u32 nr_map;
   24.43 -    struct e820entry map[E820MAX];
   24.44 -}__attribute__((packed));
   24.45 -
   24.46 -struct drive_info_struct { __u8 dummy[32]; }; 
   24.47 -
   24.48 -struct sys_desc_table { 
   24.49 -    __u16 length; 
   24.50 -    __u8 table[318]; 
   24.51 -}; 
   24.52 -
   24.53 -struct screen_info {
   24.54 -    unsigned char  orig_x;		/* 0x00 */
   24.55 -    unsigned char  orig_y;		/* 0x01 */
   24.56 -    unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
   24.57 -    unsigned short orig_video_page;	/* 0x04 */
   24.58 -    unsigned char  orig_video_mode;	/* 0x06 */
   24.59 -    unsigned char  orig_video_cols;	/* 0x07 */
   24.60 -    unsigned short unused2;		/* 0x08 */
   24.61 -    unsigned short orig_video_ega_bx;	/* 0x0a */
   24.62 -    unsigned short unused3;		/* 0x0c */
   24.63 -    unsigned char  orig_video_lines;	/* 0x0e */
   24.64 -    unsigned char  orig_video_isVGA;	/* 0x0f */
   24.65 -    unsigned short orig_video_points;	/* 0x10 */
   24.66 -    
   24.67 -    /* VESA graphic mode -- linear frame buffer */
   24.68 -    unsigned short lfb_width;		/* 0x12 */
   24.69 -    unsigned short lfb_height;		/* 0x14 */
   24.70 -    unsigned short lfb_depth;		/* 0x16 */
   24.71 -    unsigned int   lfb_base;		/* 0x18 */
   24.72 -    unsigned int   lfb_size;		/* 0x1c */
   24.73 -    unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
   24.74 -    unsigned short lfb_linelength;	/* 0x24 */
   24.75 -    unsigned char  red_size;		/* 0x26 */
   24.76 -    unsigned char  red_pos;		/* 0x27 */
   24.77 -    unsigned char  green_size;		/* 0x28 */
   24.78 -    unsigned char  green_pos;		/* 0x29 */
   24.79 -    unsigned char  blue_size;		/* 0x2a */
   24.80 -    unsigned char  blue_pos;		/* 0x2b */
   24.81 -    unsigned char  rsvd_size;		/* 0x2c */
   24.82 -    unsigned char  rsvd_pos;		/* 0x2d */
   24.83 -    unsigned short vesapm_seg;		/* 0x2e */
   24.84 -    unsigned short vesapm_off;		/* 0x30 */
   24.85 -    unsigned short pages;		/* 0x32 */
   24.86 -					/* 0x34 -- 0x3f reserved for future expansion */
   24.87 -};
   24.88 -
   24.89 -struct screen_info_overlap { 
   24.90 -    __u8 reserved1[2]; /* 0x00 */ 
   24.91 -    __u16 ext_mem_k; /* 0x02 */ 
   24.92 -    __u8 reserved2[0x20 - 0x04]; /* 0x04 */ 
   24.93 -    __u16 cl_magic; /* 0x20 */ 
   24.94 -#define CL_MAGIC_VALUE 0xA33F 
   24.95 -    __u16 cl_offset; /* 0x22 */ 
   24.96 -    __u8 reserved3[0x40 - 0x24]; /* 0x24 */ 
   24.97 -}; 
   24.98 -
   24.99 -
  24.100 -struct apm_bios_info {
  24.101 -    __u16 version;
  24.102 -    __u16  cseg;
  24.103 -    __u32   offset;
  24.104 -    __u16  cseg_16;
  24.105 -    __u16  dseg;
  24.106 -    __u16  flags;
  24.107 -    __u16  cseg_len;
  24.108 -    __u16  cseg_16_len;
  24.109 -    __u16  dseg_len;
  24.110 -};
  24.111 - 
  24.112 -struct linux_boot_params { 
  24.113 -    union { /* 0x00 */ 
  24.114 -       struct screen_info info; 
  24.115 -       struct screen_info_overlap overlap; 
  24.116 -    } screen; 
  24.117 - 
  24.118 -    struct apm_bios_info apm_bios_info; /* 0x40 */ 
  24.119 -    __u8 reserved4[0x80 - 0x54]; /* 0x54 */ 
  24.120 -    struct drive_info_struct drive_info; /* 0x80 */ 
  24.121 -    struct sys_desc_table sys_desc_table; /* 0xa0 */ 
  24.122 -    __u32 alt_mem_k; /* 0x1e0 */ 
  24.123 -    __u8 reserved5[4]; /* 0x1e4 */ 
  24.124 -    __u8 e820_map_nr; /* 0x1e8 */ 
  24.125 -    __u8 reserved6[8]; /* 0x1e9 */ 
  24.126 -    __u8 setup_sects; /* 0x1f1 */ 
  24.127 -    __u16 mount_root_rdonly; /* 0x1f2 */ 
  24.128 -    __u16 syssize; /* 0x1f4 */ 
  24.129 -    __u16 swapdev; /* 0x1f6 */ 
  24.130 -    __u16 ramdisk_flags; /* 0x1f8 */ 
  24.131 -#define RAMDISK_IMAGE_START_MASK 0x07FF 
  24.132 -#define RAMDISK_PROMPT_FLAG 0x8000 
  24.133 -#define RAMDISK_LOAD_FLAG 0x4000 
  24.134 -    __u16 vid_mode; /* 0x1fa */ 
  24.135 -    __u16 root_dev; /* 0x1fc */ 
  24.136 -    __u8 reserved9[1]; /* 0x1fe */ 
  24.137 -    __u8 aux_device_info; /* 0x1ff */ 
  24.138 -    /* 2.00+ */ 
  24.139 -    __u8 reserved10[2]; /* 0x200 */ 
  24.140 -    __u8 header_magic[4]; /* 0x202 */ 
  24.141 -    __u16 protocol_version; /* 0x206 */ 
  24.142 -    __u8 reserved11[8]; /* 0x208 */ 
  24.143 -    __u8 loader_type; /* 0x210 */ 
  24.144 -#define LOADER_TYPE_LOADLIN 1 
  24.145 -#define LOADER_TYPE_BOOTSECT_LOADER 2 
  24.146 -#define LOADER_TYPE_SYSLINUX 3 
  24.147 -#define LOADER_TYPE_ETHERBOOT 4 
  24.148 -#define LOADER_TYPE_UNKNOWN 0xFF 
  24.149 -    __u8 loader_flags; /* 0x211 */ 
  24.150 -    __u8 reserved12[2]; /* 0x212 */ 
  24.151 -    __u32 code32_start; /* 0x214 */ 
  24.152 -    __u32 initrd_start; /* 0x218 */ 
  24.153 -    __u32 initrd_size; /* 0x21c */ 
  24.154 -    __u8 reserved13[4]; /* 0x220 */ 
  24.155 -    /* 2.01+ */ 
  24.156 -    __u16 heap_end_ptr; /* 0x224 */ 
  24.157 -    __u8 reserved14[2]; /* 0x226 */ 
  24.158 -    /* 2.02+ */ 
  24.159 -    __u32 cmd_line_ptr; /* 0x228 */ 
  24.160 -    /* 2.03+ */ 
  24.161 -    __u32 ramdisk_max; /* 0x22c */ 
  24.162 -    __u8 reserved15[0x2d0 - 0x230]; /* 0x230 */ 
  24.163 -    struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 
  24.164 -    __u64 shared_info; /* 0x550 */
  24.165 -    __u8 padding[0x800 - 0x558]; /* 0x558 */ 
  24.166 -    __u8 cmd_line[0x800]; /* 0x800 */
  24.167 -} __attribute__((packed)); 
  24.168 -
  24.169 -#endif /* __LINUX_BOOT_PARAMS_H__ */
    25.1 --- a/tools/libxc/xc_vmx_build.c	Thu Sep 29 13:35:13 2005 -0600
    25.2 +++ b/tools/libxc/xc_vmx_build.c	Thu Sep 29 16:22:02 2005 -0600
    25.3 @@ -10,7 +10,8 @@
    25.4  #include <unistd.h>
    25.5  #include <zlib.h>
    25.6  #include <xen/io/ioreq.h>
    25.7 -#include "linux_boot_params.h"
    25.8 +
    25.9 +#define VMX_LOADER_ENTR_ADDR  0x00100000
   25.10  
   25.11  #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   25.12  #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   25.13 @@ -18,13 +19,29 @@
   25.14  #define L3_PROT (_PAGE_PRESENT)
   25.15  #endif
   25.16  
   25.17 +#define E820MAX	128
   25.18 +
   25.19 +#define E820_RAM          1
   25.20 +#define E820_RESERVED     2
   25.21 +#define E820_ACPI         3
   25.22 +#define E820_NVS          4
   25.23 +#define E820_IO          16
   25.24 +#define E820_SHARED_PAGE 17
   25.25 +#define E820_XENSTORE    18
   25.26 +
   25.27 +#define E820_MAP_PAGE        0x00090000
   25.28 +#define E820_MAP_NR_OFFSET   0x000001E8
   25.29 +#define E820_MAP_OFFSET      0x000002D0
   25.30 +
   25.31 +struct e820entry { /* one E820 map entry, as written into the E820 page by build_e820map() */
   25.32 +    u64 addr; /* start of memory segment */
   25.33 +    u64 size; /* size of memory segment */
   25.34 +    u32 type; /* type of memory segment (one of the E820_* values) */
   25.35 +} __attribute__((packed));
   25.36 +
   25.37  #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
   25.38  #define round_pgdown(_p)  ((_p)&PAGE_MASK)
   25.39  
   25.40 -#define LINUX_BOOT_PARAMS_ADDR   0x00090000
   25.41 -#define LINUX_KERNEL_ENTR_ADDR   0x00100000
   25.42 -#define LINUX_PAGE_OFFSET        0xC0000000
   25.43 -
   25.44  static int
   25.45  parseelfimage(
   25.46      char *elfbase, unsigned long elfsize, struct domain_setup_info *dsi);
   25.47 @@ -33,78 +50,70 @@ loadelfimage(
   25.48      char *elfbase, int xch, u32 dom, unsigned long *parray,
   25.49      struct domain_setup_info *dsi);
   25.50  
   25.51 -static void build_e820map(struct mem_map *mem_mapp, unsigned long mem_size)
   25.52 +static unsigned char build_e820map(void *e820_page, unsigned long mem_size) /* Fill the E820 table inside e820_page for a guest of mem_size bytes; returns the number of entries written. */
   25.53  {
   25.54 -    int nr_map = 0;
   25.55 +    struct e820entry *e820entry =
   25.56 +        (struct e820entry *)(((unsigned char *)e820_page) + E820_MAP_OFFSET); /* entries start E820_MAP_OFFSET bytes into the page */
   25.57 +    unsigned char nr_map = 0;
   25.58  
   25.59      /* XXX: Doesn't work for > 4GB yet */
   25.60 -    mem_mapp->map[nr_map].addr = 0x0;
   25.61 -    mem_mapp->map[nr_map].size = 0x9F800;
   25.62 -    mem_mapp->map[nr_map].type = E820_RAM;
   25.63 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   25.64 +    e820entry[nr_map].addr = 0x0;
   25.65 +    e820entry[nr_map].size = 0x9F800;
   25.66 +    e820entry[nr_map].type = E820_RAM;
   25.67      nr_map++;
   25.68  
   25.69 -    mem_mapp->map[nr_map].addr = 0x9F800;
   25.70 -    mem_mapp->map[nr_map].size = 0x800;
   25.71 -    mem_mapp->map[nr_map].type = E820_RESERVED;
   25.72 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   25.73 +    e820entry[nr_map].addr = 0x9F800;
   25.74 +    e820entry[nr_map].size = 0x800;
   25.75 +    e820entry[nr_map].type = E820_RESERVED;
   25.76      nr_map++;
   25.77  
   25.78 -    mem_mapp->map[nr_map].addr = 0xA0000;
   25.79 -    mem_mapp->map[nr_map].size = 0x20000;
   25.80 -    mem_mapp->map[nr_map].type = E820_IO;
   25.81 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   25.82 +    e820entry[nr_map].addr = 0xA0000;
   25.83 +    e820entry[nr_map].size = 0x20000;
   25.84 +    e820entry[nr_map].type = E820_IO;
   25.85      nr_map++;
   25.86  
   25.87 -    mem_mapp->map[nr_map].addr = 0xF0000;
   25.88 -    mem_mapp->map[nr_map].size = 0x10000;
   25.89 -    mem_mapp->map[nr_map].type = E820_RESERVED;
   25.90 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   25.91 +    e820entry[nr_map].addr = 0xF0000;
   25.92 +    e820entry[nr_map].size = 0x10000;
   25.93 +    e820entry[nr_map].type = E820_RESERVED;
   25.94      nr_map++;
   25.95  
   25.96  #define STATIC_PAGES    2       /* for ioreq_t and store_mfn */
   25.97      /* Most of the ram goes here */
   25.98 -    mem_mapp->map[nr_map].addr = 0x100000;
   25.99 -    mem_mapp->map[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE;
  25.100 -    mem_mapp->map[nr_map].type = E820_RAM;
  25.101 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  25.102 +    e820entry[nr_map].addr = 0x100000;
  25.103 +    e820entry[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE; /* everything from 1MB up, minus the top STATIC_PAGES pages */
  25.104 +    e820entry[nr_map].type = E820_RAM;
  25.105      nr_map++;
  25.106  
  25.107      /* Statically allocated special pages */
  25.108  
  25.109      /* Shared ioreq_t page */
  25.110 -    mem_mapp->map[nr_map].addr = mem_size - PAGE_SIZE;
  25.111 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
  25.112 -    mem_mapp->map[nr_map].type = E820_SHARED;
  25.113 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  25.114 +    e820entry[nr_map].addr = mem_size - PAGE_SIZE;
  25.115 +    e820entry[nr_map].size = PAGE_SIZE;
  25.116 +    e820entry[nr_map].type = E820_SHARED_PAGE;
  25.117      nr_map++;
  25.118  
  25.119      /* For xenstore */
  25.120 -    mem_mapp->map[nr_map].addr = mem_size - 2*PAGE_SIZE;
  25.121 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
  25.122 -    mem_mapp->map[nr_map].type = E820_XENSTORE;
  25.123 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  25.124 -    nr_map++;
  25.125 -
  25.126 -    mem_mapp->map[nr_map].addr = mem_size;
  25.127 -    mem_mapp->map[nr_map].size = 0x3 * PAGE_SIZE;
  25.128 -    mem_mapp->map[nr_map].type = E820_NVS;
  25.129 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
  25.130 +    e820entry[nr_map].addr = mem_size - 2*PAGE_SIZE;
  25.131 +    e820entry[nr_map].size = PAGE_SIZE;
  25.132 +    e820entry[nr_map].type = E820_XENSTORE;
  25.133      nr_map++;
  25.134  
  25.135 -    mem_mapp->map[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
  25.136 -    mem_mapp->map[nr_map].size = 0xA * PAGE_SIZE;
  25.137 -    mem_mapp->map[nr_map].type = E820_ACPI;
  25.138 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  25.139 +    e820entry[nr_map].addr = mem_size;
  25.140 +    e820entry[nr_map].size = 0x3 * PAGE_SIZE;
  25.141 +    e820entry[nr_map].type = E820_NVS;
  25.142      nr_map++;
  25.143  
  25.144 -    mem_mapp->map[nr_map].addr = 0xFEC00000;
  25.145 -    mem_mapp->map[nr_map].size = 0x1400000;
  25.146 -    mem_mapp->map[nr_map].type = E820_IO;
  25.147 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
  25.148 +    e820entry[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
  25.149 +    e820entry[nr_map].size = 0xA * PAGE_SIZE;
  25.150 +    e820entry[nr_map].type = E820_ACPI;
  25.151      nr_map++;
  25.152  
  25.153 -    mem_mapp->nr_map = nr_map;
  25.154 +    e820entry[nr_map].addr = 0xFEC00000;
  25.155 +    e820entry[nr_map].size = 0x1400000;
  25.156 +    e820entry[nr_map].type = E820_IO;
  25.157 +    nr_map++;
  25.158 +
  25.159 +    return (*(((unsigned char *)e820_page) + E820_MAP_NR_OFFSET) = nr_map); /* store the count in the page at E820_MAP_NR_OFFSET and return it */
  25.160  }
  25.161  
  25.162  /*
  25.163 @@ -112,19 +121,19 @@ static void build_e820map(struct mem_map
  25.164   * vmxloader will use it to config ACPI MADT table
  25.165   */
  25.166  #define VCPU_MAGIC 0x76637075 /* "vcpu" */
  25.167 -static int 
  25.168 -set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list, 
  25.169 +static int
  25.170 +set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list,
  25.171               struct domain_setup_info *dsi, unsigned long vcpus)
  25.172  {
  25.173      char          *va_map;
  25.174      unsigned long *va_vcpus;
  25.175 -    
  25.176 +
  25.177      va_map = xc_map_foreign_range(
  25.178          xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  25.179 -        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);    
  25.180 +        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);
  25.181      if ( va_map == NULL )
  25.182          return -1;
  25.183 -    
  25.184 +
  25.185      va_vcpus = (unsigned long *)(va_map + 0x800);
  25.186      *va_vcpus++ = VCPU_MAGIC;
  25.187      *va_vcpus++ = vcpus;
  25.188 @@ -164,24 +173,23 @@ static int zap_mmio_range(int xc_handle,
  25.189      return 0;
  25.190  }
  25.191  
  25.192 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  25.193 -                           unsigned long l2tab,
  25.194 -                           struct mem_map *mem_mapp)
  25.195 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l2tab,
  25.196 +                           unsigned char e820_map_nr, unsigned char *e820map)
  25.197  {
  25.198 -    int i;
  25.199 +    unsigned int i;
  25.200 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  25.201 +
  25.202      l2_pgentry_32_t *vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  25.203                                                     PROT_READ|PROT_WRITE,
  25.204                                                     l2tab >> PAGE_SHIFT);
  25.205      if ( vl2tab == 0 )
  25.206          return -1;
  25.207  
  25.208 -    for ( i = 0; i < mem_mapp->nr_map; i++ )
  25.209 +    for ( i = 0; i < e820_map_nr; i++ )
  25.210      {
  25.211 -        if ( (mem_mapp->map[i].type == E820_IO) &&
  25.212 -             (mem_mapp->map[i].caching_attr == MEMMAP_UC) &&
  25.213 +        if ( (e820entry[i].type == E820_IO) &&
  25.214               (zap_mmio_range(xc_handle, dom, vl2tab,
  25.215 -                             mem_mapp->map[i].addr,
  25.216 -                             mem_mapp->map[i].size) == -1) )
  25.217 +                             e820entry[i].addr, e820entry[i].size) == -1))
  25.218              return -1;
  25.219      }
  25.220  
  25.221 @@ -200,7 +208,7 @@ static int zap_mmio_range(int xc_handle,
  25.222      unsigned long vl3e;
  25.223      l1_pgentry_t *vl1tab;
  25.224      l2_pgentry_t *vl2tab;
  25.225 - 
  25.226 +
  25.227      mmio_addr = mmio_range_start & PAGE_MASK;
  25.228      for ( ; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE )
  25.229      {
  25.230 @@ -239,22 +247,22 @@ static int zap_mmio_range(int xc_handle,
  25.231      return 0;
  25.232  }
  25.233  
  25.234 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  25.235 -                           unsigned long l3tab,
  25.236 -                           struct mem_map *mem_mapp)
  25.237 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l3tab,
  25.238 +                           unsigned char e820_map_nr, unsigned char *e820map)
  25.239  {
  25.240 -    int i;
  25.241 +    unsigned int i;
  25.242 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  25.243 +
  25.244      l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  25.245                                                  PROT_READ|PROT_WRITE,
  25.246                                                  l3tab >> PAGE_SHIFT);
  25.247      if (vl3tab == 0)
  25.248          return -1;
  25.249 -    for (i = 0; i < mem_mapp->nr_map; i++) {
  25.250 -        if ((mem_mapp->map[i].type == E820_IO)
  25.251 -            && (mem_mapp->map[i].caching_attr == MEMMAP_UC))
  25.252 -            if (zap_mmio_range(xc_handle, dom, vl3tab,
  25.253 -                               mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1)
  25.254 -                return -1;
  25.255 +    for ( i = 0; i < e820_map_nr; i++ ) {
  25.256 +        if ( (e820entry[i].type == E820_IO) &&
  25.257 +             (zap_mmio_range(xc_handle, dom, vl3tab,
  25.258 +                             e820entry[i].addr, e820entry[i].size) == -1) )
  25.259 +            return -1;
  25.260      }
  25.261      munmap(vl3tab, PAGE_SIZE);
  25.262      return 0;
  25.263 @@ -265,18 +273,14 @@ static int zap_mmio_ranges(int xc_handle
  25.264  static int setup_guest(int xc_handle,
  25.265                         u32 dom, int memsize,
  25.266                         char *image, unsigned long image_size,
  25.267 -                       gzFile initrd_gfd, unsigned long initrd_len,
  25.268                         unsigned long nr_pages,
  25.269                         vcpu_guest_context_t *ctxt,
  25.270 -                       const char *cmdline,
  25.271                         unsigned long shared_info_frame,
  25.272                         unsigned int control_evtchn,
  25.273                         unsigned long flags,
  25.274                         unsigned int vcpus,
  25.275                         unsigned int store_evtchn,
  25.276 -                       unsigned long *store_mfn,
  25.277 -                       struct mem_map *mem_mapp
  25.278 -    )
  25.279 +                       unsigned long *store_mfn)
  25.280  {
  25.281      l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
  25.282      l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
  25.283 @@ -289,8 +293,8 @@ static int setup_guest(int xc_handle,
  25.284      unsigned long l1tab;
  25.285      unsigned long count, i;
  25.286      shared_info_t *shared_info;
  25.287 -    struct linux_boot_params * boot_paramsp;
  25.288 -    __u16 * boot_gdtp;
  25.289 +    void *e820_page;
  25.290 +    unsigned char e820_map_nr;
  25.291      xc_mmu_t *mmu = NULL;
  25.292      int rc;
  25.293  
  25.294 @@ -298,12 +302,6 @@ static int setup_guest(int xc_handle,
  25.295      unsigned long ppt_alloc;
  25.296  
  25.297      struct domain_setup_info dsi;
  25.298 -    unsigned long vinitrd_start;
  25.299 -    unsigned long vinitrd_end;
  25.300 -    unsigned long vboot_params_start;
  25.301 -    unsigned long vboot_params_end;
  25.302 -    unsigned long vboot_gdt_start;
  25.303 -    unsigned long vboot_gdt_end;
  25.304      unsigned long vpt_start;
  25.305      unsigned long vpt_end;
  25.306      unsigned long v_end;
  25.307 @@ -322,27 +320,8 @@ static int setup_guest(int xc_handle,
  25.308          goto error_out;
  25.309      }
  25.310  
  25.311 -    /*
  25.312 -     * Why do we need this? The number of page-table frames depends on the 
  25.313 -     * size of the bootstrap address space. But the size of the address space 
  25.314 -     * depends on the number of page-table frames (since each one is mapped 
  25.315 -     * read-only). We have a pair of simultaneous equations in two unknowns, 
  25.316 -     * which we solve by exhaustive search.
  25.317 -     */
  25.318 -    vboot_params_start = LINUX_BOOT_PARAMS_ADDR;
  25.319 -    vboot_params_end   = vboot_params_start + PAGE_SIZE;
  25.320 -    vboot_gdt_start    = vboot_params_end;
  25.321 -    vboot_gdt_end      = vboot_gdt_start + PAGE_SIZE;
  25.322 -
  25.323      /* memsize is in megabytes */
  25.324      v_end              = memsize << 20;
  25.325 -    /* leaving the top 4k untouched for IO requests page use */
  25.326 -    vinitrd_end        = v_end - PAGE_SIZE;
  25.327 -    vinitrd_start      = vinitrd_end - initrd_len;
  25.328 -    vinitrd_start      = vinitrd_start & (~(PAGE_SIZE - 1));
  25.329 -
  25.330 -    if(initrd_len == 0)
  25.331 -        vinitrd_start = vinitrd_end = 0;
  25.332  
  25.333  #ifdef __i386__
  25.334      nr_pt_pages = 1 + ((memsize + 3) >> 2);
  25.335 @@ -353,24 +332,17 @@ static int setup_guest(int xc_handle,
  25.336      vpt_end     = vpt_start + (nr_pt_pages * PAGE_SIZE);
  25.337  
  25.338      printf("VIRTUAL MEMORY ARRANGEMENT:\n"
  25.339 -           " Boot_params:   %08lx->%08lx\n"
  25.340 -           " boot_gdt:      %08lx->%08lx\n"
  25.341 -           " Loaded kernel: %08lx->%08lx\n"
  25.342 -           " Init. ramdisk: %08lx->%08lx\n"
  25.343 +           " Loaded VMX loader: %08lx->%08lx\n"
  25.344             " Page tables:   %08lx->%08lx\n"
  25.345             " TOTAL:         %08lx->%08lx\n",
  25.346 -           vboot_params_start, vboot_params_end,
  25.347 -           vboot_gdt_start, vboot_gdt_end,
  25.348 -           dsi.v_kernstart, dsi.v_kernend, 
  25.349 -           vinitrd_start, vinitrd_end,
  25.350 +           dsi.v_kernstart, dsi.v_kernend,
  25.351             vpt_start, vpt_end,
  25.352             dsi.v_start, v_end);
  25.353      printf(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
  25.354 -    printf(" INITRD LENGTH: %08lx\n", initrd_len);
  25.355  
  25.356      if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
  25.357      {
  25.358 -        printf("Initial guest OS requires too much space\n"
  25.359 +        ERROR("Initial guest OS requires too much space\n"
  25.360                 "(%luMB is greater than %luMB limit)\n",
  25.361                 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
  25.362          goto error_out;
  25.363 @@ -390,23 +362,6 @@ static int setup_guest(int xc_handle,
  25.364  
  25.365      loadelfimage(image, xc_handle, dom, page_array, &dsi);
  25.366  
  25.367 -    /* Load the initial ramdisk image. */
  25.368 -    if ( initrd_len != 0 )
  25.369 -    {
  25.370 -        for ( i = (vinitrd_start - dsi.v_start); 
  25.371 -              i < (vinitrd_end - dsi.v_start); i += PAGE_SIZE )
  25.372 -        {
  25.373 -            char page[PAGE_SIZE];
  25.374 -            if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
  25.375 -            {
  25.376 -                PERROR("Error reading initrd image, could not");
  25.377 -                goto error_out;
  25.378 -            }
  25.379 -            xc_copy_to_domain_page(xc_handle, dom,
  25.380 -                                   page_array[i>>PAGE_SHIFT], page);
  25.381 -        }
  25.382 -    }
  25.383 -
  25.384      if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL )
  25.385          goto error_out;
  25.386  
  25.387 @@ -428,15 +383,14 @@ static int setup_guest(int xc_handle,
  25.388      l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  25.389      ctxt->ctrlreg[3] = l2tab;
  25.390  
  25.391 -    /* Initialise the page tables. */
  25.392 -    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  25.393 -                                        PROT_READ|PROT_WRITE, 
  25.394 +    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  25.395 +                                        PROT_READ|PROT_WRITE,
  25.396                                          l2tab >> PAGE_SHIFT)) == NULL )
  25.397          goto error_out;
  25.398      memset(vl2tab, 0, PAGE_SIZE);
  25.399      vl2e = &vl2tab[l2_table_offset(dsi.v_start)];
  25.400      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  25.401 -    {    
  25.402 +    {
  25.403          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  25.404          {
  25.405              l1tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  25.406 @@ -460,23 +414,35 @@ static int setup_guest(int xc_handle,
  25.407      munmap(vl1tab, PAGE_SIZE);
  25.408      munmap(vl2tab, PAGE_SIZE);
  25.409  #else
  25.410 -    /* here l3tab means pdpt, only 4 entry is used */
  25.411      l3tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  25.412      ctxt->ctrlreg[3] = l3tab;
  25.413  
  25.414 -    /* Initialise the page tables. */
  25.415 -    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  25.416 -                                        PROT_READ|PROT_WRITE, 
  25.417 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  25.418 +                                        PROT_READ|PROT_WRITE,
  25.419                                          l3tab >> PAGE_SHIFT)) == NULL )
  25.420          goto error_out;
  25.421      memset(vl3tab, 0, PAGE_SIZE);
  25.422  
  25.423 +    /* Fill in every PDPT entry. */
  25.424 +    for ( i = 0; i < L3_PAGETABLE_ENTRIES_PAE; i++ )
  25.425 +    {
  25.426 +        l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  25.427 +        if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  25.428 +                                            PROT_READ|PROT_WRITE,
  25.429 +                                            l2tab >> PAGE_SHIFT)) == NULL )
  25.430 +            goto error_out;
  25.431 +        memset(vl2tab, 0, PAGE_SIZE);
  25.432 +        munmap(vl2tab, PAGE_SIZE);
  25.433 +        vl3tab[i] = l2tab | L3_PROT;
  25.434 +    }
  25.435 +
  25.436      vl3e = &vl3tab[l3_table_offset(dsi.v_start)];
  25.437  
  25.438      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  25.439      {
  25.440 -        if (!(count % (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
  25.441 -            l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  25.442 +        if (!(count & (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
  25.443 +            l2tab = vl3tab[count >> (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]
  25.444 +                & PAGE_MASK;
  25.445  
  25.446              if (vl2tab != NULL)
  25.447                  munmap(vl2tab, PAGE_SIZE);
  25.448 @@ -486,8 +452,6 @@ static int setup_guest(int xc_handle,
  25.449                                                  l2tab >> PAGE_SHIFT)) == NULL )
  25.450                  goto error_out;
  25.451  
  25.452 -            memset(vl2tab, 0, PAGE_SIZE);
  25.453 -            *vl3e++ = l2tab | L3_PROT;
  25.454              vl2e = &vl2tab[l2_table_offset(dsi.v_start + (count << PAGE_SHIFT))];
  25.455          }
  25.456          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  25.457 @@ -519,103 +483,31 @@ static int setup_guest(int xc_handle,
  25.458      for ( count = 0; count < nr_pages; count++ )
  25.459      {
  25.460          if ( xc_add_mmu_update(xc_handle, mmu,
  25.461 -                               (page_array[count] << PAGE_SHIFT) | 
  25.462 +                               (page_array[count] << PAGE_SHIFT) |
  25.463                                 MMU_MACHPHYS_UPDATE, count) )
  25.464              goto error_out;
  25.465      }
  25.466  
  25.467      set_nr_vcpus(xc_handle, dom, page_array, &dsi, vcpus);
  25.468  
  25.469 -    if ((boot_paramsp = xc_map_foreign_range(
  25.470 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  25.471 -        page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  25.472 -        goto error_out;
  25.473 -
  25.474 -    memset(boot_paramsp, 0, sizeof(*boot_paramsp));
  25.475 -
  25.476 -    strncpy((char *)boot_paramsp->cmd_line, cmdline, 0x800);
  25.477 -    boot_paramsp->cmd_line[0x800-1] = '\0';
  25.478 -    boot_paramsp->cmd_line_ptr = ((unsigned long) vboot_params_start) + offsetof(struct linux_boot_params, cmd_line);
  25.479 -
  25.480 -    boot_paramsp->setup_sects = 0;
  25.481 -    boot_paramsp->mount_root_rdonly = 1;
  25.482 -    boot_paramsp->swapdev = 0x0; 
  25.483 -    boot_paramsp->ramdisk_flags = 0x0; 
  25.484 -    boot_paramsp->root_dev = 0x0; /* We must tell kernel root dev by kernel command line. */
  25.485 -
  25.486 -    /* we don't have a ps/2 mouse now.
  25.487 -     * 0xAA means a aux mouse is there.
  25.488 -     * See detect_auxiliary_port() in pc_keyb.c.
  25.489 -     */
  25.490 -    boot_paramsp->aux_device_info = 0x0; 
  25.491 -
  25.492 -    boot_paramsp->header_magic[0] = 0x48; /* "H" */
  25.493 -    boot_paramsp->header_magic[1] = 0x64; /* "d" */
  25.494 -    boot_paramsp->header_magic[2] = 0x72; /* "r" */
  25.495 -    boot_paramsp->header_magic[3] = 0x53; /* "S" */
  25.496 -
  25.497 -    boot_paramsp->protocol_version = 0x0203; /* 2.03 */
  25.498 -    boot_paramsp->loader_type = 0x71; /* GRUB */
  25.499 -    boot_paramsp->loader_flags = 0x1; /* loaded high */
  25.500 -    boot_paramsp->code32_start = LINUX_KERNEL_ENTR_ADDR; /* 1MB */
  25.501 -    boot_paramsp->initrd_start = vinitrd_start;
  25.502 -    boot_paramsp->initrd_size = initrd_len;
  25.503 -
  25.504 -    i = ((memsize - 1) << 10) - 4;
  25.505 -    boot_paramsp->alt_mem_k = i; /* alt_mem_k */
  25.506 -    boot_paramsp->screen.overlap.ext_mem_k = i & 0xFFFF; /* ext_mem_k */
  25.507 +    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  25.508 +    shared_page_frame = (v_end - PAGE_SIZE) >> PAGE_SHIFT;
  25.509  
  25.510 -    /*
  25.511 -     * Stuff SCREAN_INFO
  25.512 -     */
  25.513 -    boot_paramsp->screen.info.orig_x = 0;
  25.514 -    boot_paramsp->screen.info.orig_y = 0;
  25.515 -    boot_paramsp->screen.info.orig_video_page = 8;
  25.516 -    boot_paramsp->screen.info.orig_video_mode = 3;
  25.517 -    boot_paramsp->screen.info.orig_video_cols = 80;
  25.518 -    boot_paramsp->screen.info.orig_video_ega_bx = 0;
  25.519 -    boot_paramsp->screen.info.orig_video_lines = 25;
  25.520 -    boot_paramsp->screen.info.orig_video_isVGA = 1;
  25.521 -    boot_paramsp->screen.info.orig_video_points = 0x0010;
  25.522 -
  25.523 -    /* seems we may NOT stuff boot_paramsp->apm_bios_info */
  25.524 -    /* seems we may NOT stuff boot_paramsp->drive_info */
  25.525 -    /* seems we may NOT stuff boot_paramsp->sys_desc_table */
  25.526 -    *((unsigned short *) &boot_paramsp->drive_info.dummy[0]) = 800;
  25.527 -    boot_paramsp->drive_info.dummy[2] = 4;
  25.528 -    boot_paramsp->drive_info.dummy[14] = 32;
  25.529 -
  25.530 -    /* memsize is in megabytes */
  25.531 -    /* If you need to create a special e820map, comment this line
  25.532 -       and use mem-map.sxp */
  25.533 -    build_e820map(mem_mapp, memsize << 20);
  25.534 -    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  25.535 +    if ((e820_page = xc_map_foreign_range(
  25.536 +        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  25.537 +        page_array[E820_MAP_PAGE >> PAGE_SHIFT])) == 0)
  25.538 +        goto error_out;
  25.539 +    memset(e820_page, 0, PAGE_SIZE);
  25.540 +    e820_map_nr = build_e820map(e820_page, v_end);
  25.541  #if defined (__i386__)
  25.542 -    if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1)
  25.543 +    if (zap_mmio_ranges(xc_handle, dom, l2tab, e820_map_nr,
  25.544 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  25.545  #else
  25.546 -        if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1)
  25.547 +    if (zap_mmio_ranges(xc_handle, dom, l3tab, e820_map_nr,
  25.548 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  25.549  #endif
  25.550 -            goto error_out;
  25.551 -    boot_paramsp->e820_map_nr = mem_mapp->nr_map;
  25.552 -    for (i=0; i<mem_mapp->nr_map; i++) {
  25.553 -        boot_paramsp->e820_map[i].addr = mem_mapp->map[i].addr; 
  25.554 -        boot_paramsp->e820_map[i].size = mem_mapp->map[i].size; 
  25.555 -        boot_paramsp->e820_map[i].type = mem_mapp->map[i].type; 
  25.556 -        if (mem_mapp->map[i].type == E820_SHARED)
  25.557 -            shared_page_frame = (mem_mapp->map[i].addr >> PAGE_SHIFT);
  25.558 -    }
  25.559 -    munmap(boot_paramsp, PAGE_SIZE); 
  25.560 -
  25.561 -    if ((boot_gdtp = xc_map_foreign_range(
  25.562 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  25.563 -        page_array[(vboot_gdt_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  25.564          goto error_out;
  25.565 -    memset(boot_gdtp, 0, PAGE_SIZE);
  25.566 -    boot_gdtp[12*4 + 0] = boot_gdtp[13*4 + 0] = 0xffff; /* limit */
  25.567 -    boot_gdtp[12*4 + 1] = boot_gdtp[13*4 + 1] = 0x0000; /* base */
  25.568 -    boot_gdtp[12*4 + 2] = 0x9a00; boot_gdtp[13*4 + 2] = 0x9200; /* perms */
  25.569 -    boot_gdtp[12*4 + 3] = boot_gdtp[13*4 + 3] = 0x00cf; /* granu + top of limit */
  25.570 -    munmap(boot_gdtp, PAGE_SIZE);
  25.571 +    munmap(e820_page, PAGE_SIZE);
  25.572  
  25.573      /* shared_info page starts its life empty. */
  25.574      if ((shared_info = xc_map_foreign_range(
  25.575 @@ -651,20 +543,21 @@ static int setup_guest(int xc_handle,
  25.576      /*
  25.577       * Initial register values:
  25.578       */
  25.579 -    ctxt->user_regs.ds = 0x68;
  25.580 -    ctxt->user_regs.es = 0x0;
  25.581 -    ctxt->user_regs.fs = 0x0;
  25.582 -    ctxt->user_regs.gs = 0x0;
  25.583 -    ctxt->user_regs.ss = 0x68;
  25.584 -    ctxt->user_regs.cs = 0x60;
  25.585 +    ctxt->user_regs.ds = 0;
  25.586 +    ctxt->user_regs.es = 0;
  25.587 +    ctxt->user_regs.fs = 0;
  25.588 +    ctxt->user_regs.gs = 0;
  25.589 +    ctxt->user_regs.ss = 0;
  25.590 +    ctxt->user_regs.cs = 0;
  25.591      ctxt->user_regs.eip = dsi.v_kernentry;
  25.592 -    ctxt->user_regs.edx = vboot_gdt_start;
  25.593 -    ctxt->user_regs.eax = 0x800;
  25.594 -    ctxt->user_regs.esp = vboot_gdt_end;
  25.595 +    ctxt->user_regs.edx = 0;
  25.596 +    ctxt->user_regs.eax = 0;
  25.597 +    ctxt->user_regs.esp = 0;
  25.598      ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */
  25.599 -    ctxt->user_regs.ecx = mem_mapp->nr_map;
  25.600 -    ctxt->user_regs.esi = vboot_params_start;
  25.601 -    ctxt->user_regs.edi = vboot_params_start + 0x2d0;
  25.602 +    ctxt->user_regs.ecx = 0;
  25.603 +    ctxt->user_regs.esi = 0;
  25.604 +    ctxt->user_regs.edi = 0;
  25.605 +    ctxt->user_regs.ebp = 0;
  25.606  
  25.607      ctxt->user_regs.eflags = 0;
  25.608  
  25.609 @@ -684,9 +577,9 @@ static int vmx_identify(void)
  25.610      int eax, ecx;
  25.611  
  25.612  #ifdef __i386__
  25.613 -    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx" 
  25.614 -                          : "=a" (eax), "=c" (ecx) 
  25.615 -                          : "0" (1) 
  25.616 +    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx"
  25.617 +                          : "=a" (eax), "=c" (ecx)
  25.618 +                          : "0" (1)
  25.619                            : "dx");
  25.620  #elif defined __x86_64__
  25.621      __asm__ __volatile__ ("pushq %%rbx; cpuid; popq %%rbx"
  25.622 @@ -705,9 +598,6 @@ int xc_vmx_build(int xc_handle,
  25.623                   u32 domid,
  25.624                   int memsize,
  25.625                   const char *image_name,
  25.626 -                 struct mem_map *mem_mapp,
  25.627 -                 const char *ramdisk_name,
  25.628 -                 const char *cmdline,
  25.629                   unsigned int control_evtchn,
  25.630                   unsigned long flags,
  25.631                   unsigned int vcpus,
  25.632 @@ -715,20 +605,18 @@ int xc_vmx_build(int xc_handle,
  25.633                   unsigned long *store_mfn)
  25.634  {
  25.635      dom0_op_t launch_op, op;
  25.636 -    int initrd_fd = -1;
  25.637 -    gzFile initrd_gfd = NULL;
  25.638      int rc, i;
  25.639      vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
  25.640      unsigned long nr_pages;
  25.641      char         *image = NULL;
  25.642 -    unsigned long image_size, initrd_size=0;
  25.643 +    unsigned long image_size;
  25.644  
  25.645      if ( vmx_identify() < 0 )
  25.646      {
  25.647          PERROR("CPU doesn't support VMX Extensions");
  25.648          goto error_out;
  25.649      }
  25.650 -    
  25.651 +
  25.652      if ( (nr_pages = xc_get_tot_pages(xc_handle, domid)) < 0 )
  25.653      {
  25.654          PERROR("Could not find total pages for domain");
  25.655 @@ -738,32 +626,15 @@ int xc_vmx_build(int xc_handle,
  25.656      if ( (image = xc_read_kernel_image(image_name, &image_size)) == NULL )
  25.657          goto error_out;
  25.658  
  25.659 -    if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) )
  25.660 +    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  25.661      {
  25.662 -        if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 )
  25.663 -        {
  25.664 -            PERROR("Could not open the initial ramdisk image");
  25.665 -            goto error_out;
  25.666 -        }
  25.667 -
  25.668 -        initrd_size = xc_get_filesz(initrd_fd);
  25.669 -
  25.670 -        if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL )
  25.671 -        {
  25.672 -            PERROR("Could not allocate decompression state for initrd");
  25.673 -            goto error_out;
  25.674 -        }
  25.675 -    }
  25.676 -
  25.677 -    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  25.678 -    {   
  25.679          PERROR("xc_vmx_build: ctxt mlock failed");
  25.680          return 1;
  25.681      }
  25.682  
  25.683      op.cmd = DOM0_GETDOMAININFO;
  25.684      op.u.getdomaininfo.domain = (domid_t)domid;
  25.685 -    if ( (xc_dom0_op(xc_handle, &op) < 0) || 
  25.686 +    if ( (xc_dom0_op(xc_handle, &op) < 0) ||
  25.687           ((u16)op.u.getdomaininfo.domain != domid) )
  25.688      {
  25.689          PERROR("Could not get info on domain");
  25.690 @@ -783,21 +654,14 @@ int xc_vmx_build(int xc_handle,
  25.691          goto error_out;
  25.692      }
  25.693  
  25.694 -    if ( setup_guest(xc_handle, domid, memsize, image, image_size, 
  25.695 -                     initrd_gfd, initrd_size, nr_pages, 
  25.696 -                     ctxt, cmdline,
  25.697 -                     op.u.getdomaininfo.shared_info_frame,
  25.698 -                     control_evtchn, flags, vcpus, store_evtchn, store_mfn,
  25.699 -                     mem_mapp) < 0 )
  25.700 +    if ( setup_guest(xc_handle, domid, memsize, image, image_size, nr_pages,
  25.701 +                     ctxt, op.u.getdomaininfo.shared_info_frame, control_evtchn,
  25.702 +                     flags, vcpus, store_evtchn, store_mfn) < 0)
  25.703      {
  25.704          ERROR("Error constructing guest OS");
  25.705          goto error_out;
  25.706      }
  25.707  
  25.708 -    if ( initrd_fd >= 0 )
  25.709 -        close(initrd_fd);
  25.710 -    if ( initrd_gfd )
  25.711 -        gzclose(initrd_gfd);
  25.712      free(image);
  25.713  
  25.714      ctxt->flags = VGCF_VMX_GUEST;
  25.715 @@ -813,15 +677,10 @@ int xc_vmx_build(int xc_handle,
  25.716  
  25.717      /* No LDT. */
  25.718      ctxt->ldt_ents = 0;
  25.719 -    
  25.720 +
  25.721      /* Use the default Xen-provided GDT. */
  25.722      ctxt->gdt_ents = 0;
  25.723  
  25.724 -    /* Ring 1 stack is the initial stack. */
  25.725 -/*
  25.726 -  ctxt->kernel_ss = FLAT_KERNEL_DS;
  25.727 -  ctxt->kernel_sp = vstartinfo_start;
  25.728 -*/
  25.729      /* No debugging. */
  25.730      memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg));
  25.731  
  25.732 @@ -845,14 +704,10 @@ int xc_vmx_build(int xc_handle,
  25.733  
  25.734      launch_op.cmd = DOM0_SETDOMAININFO;
  25.735      rc = xc_dom0_op(xc_handle, &launch_op);
  25.736 -    
  25.737 +
  25.738      return rc;
  25.739  
  25.740   error_out:
  25.741 -    if ( initrd_gfd != NULL )
  25.742 -        gzclose(initrd_gfd);
  25.743 -    else if ( initrd_fd >= 0 )
  25.744 -        close(initrd_fd);
  25.745      free(image);
  25.746  
  25.747      return -1;
  25.748 @@ -864,7 +719,7 @@ static inline int is_loadable_phdr(Elf32
  25.749              ((phdr->p_flags & (PF_W|PF_X)) != 0));
  25.750  }
  25.751  
  25.752 -static int parseelfimage(char *elfbase, 
  25.753 +static int parseelfimage(char *elfbase,
  25.754                           unsigned long elfsize,
  25.755                           struct domain_setup_info *dsi)
  25.756  {
  25.757 @@ -899,11 +754,11 @@ static int parseelfimage(char *elfbase,
  25.758          ERROR("ELF image has no section-header strings table (shstrtab).");
  25.759          return -EINVAL;
  25.760      }
  25.761 -    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff + 
  25.762 +    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff +
  25.763                            (ehdr->e_shstrndx*ehdr->e_shentsize));
  25.764      shstrtab = elfbase + shdr->sh_offset;
  25.765 -    
  25.766 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  25.767 +
  25.768 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  25.769      {
  25.770          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  25.771          if ( !is_loadable_phdr(phdr) )
  25.772 @@ -914,8 +769,8 @@ static int parseelfimage(char *elfbase,
  25.773              kernend = phdr->p_paddr + phdr->p_memsz;
  25.774      }
  25.775  
  25.776 -    if ( (kernstart > kernend) || 
  25.777 -         (ehdr->e_entry < kernstart) || 
  25.778 +    if ( (kernstart > kernend) ||
  25.779 +         (ehdr->e_entry < kernstart) ||
  25.780           (ehdr->e_entry > kernend) )
  25.781      {
  25.782          ERROR("Malformed ELF image.");
  25.783 @@ -924,9 +779,9 @@ static int parseelfimage(char *elfbase,
  25.784  
  25.785      dsi->v_start = 0x00000000;
  25.786  
  25.787 -    dsi->v_kernstart = kernstart - LINUX_PAGE_OFFSET;
  25.788 -    dsi->v_kernend   = kernend - LINUX_PAGE_OFFSET;
  25.789 -    dsi->v_kernentry = LINUX_KERNEL_ENTR_ADDR;
  25.790 +    dsi->v_kernstart = kernstart;
  25.791 +    dsi->v_kernend   = kernend;
  25.792 +    dsi->v_kernentry = VMX_LOADER_ENTR_ADDR;
  25.793  
  25.794      dsi->v_end       = dsi->v_kernend;
  25.795  
  25.796 @@ -945,18 +800,18 @@ loadelfimage(
  25.797      char         *va;
  25.798      unsigned long pa, done, chunksz;
  25.799  
  25.800 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  25.801 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  25.802      {
  25.803          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  25.804          if ( !is_loadable_phdr(phdr) )
  25.805              continue;
  25.806 -        
  25.807 +
  25.808          for ( done = 0; done < phdr->p_filesz; done += chunksz )
  25.809          {
  25.810 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  25.811 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  25.812              if ((va = xc_map_foreign_range(
  25.813                  xch, dom, PAGE_SIZE, PROT_WRITE,
  25.814 -                parray[pa>>PAGE_SHIFT])) == 0)
  25.815 +                parray[pa >> PAGE_SHIFT])) == 0)
  25.816                  return -1;
  25.817              chunksz = phdr->p_filesz - done;
  25.818              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
  25.819 @@ -968,10 +823,10 @@ loadelfimage(
  25.820  
  25.821          for ( ; done < phdr->p_memsz; done += chunksz )
  25.822          {
  25.823 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  25.824 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  25.825              if ((va = xc_map_foreign_range(
  25.826                  xch, dom, PAGE_SIZE, PROT_WRITE,
  25.827 -                parray[pa>>PAGE_SHIFT])) == 0)
  25.828 +                parray[pa >> PAGE_SHIFT])) == 0)
  25.829                  return -1;
  25.830              chunksz = phdr->p_memsz - done;
  25.831              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
    26.1 --- a/tools/libxc/xenguest.h	Thu Sep 29 13:35:13 2005 -0600
    26.2 +++ b/tools/libxc/xenguest.h	Thu Sep 29 16:22:02 2005 -0600
    26.3 @@ -57,9 +57,6 @@ int xc_vmx_build(int xc_handle,
    26.4                   uint32_t domid,
    26.5                   int memsize,
    26.6                   const char *image_name,
    26.7 -                 struct mem_map *memmap,
    26.8 -                 const char *ramdisk_name,
    26.9 -                 const char *cmdline,
   26.10                   unsigned int control_evtchn,
   26.11                   unsigned long flags,
   26.12                   unsigned int vcpus,
    27.1 --- a/tools/libxc/xg_private.h	Thu Sep 29 13:35:13 2005 -0600
    27.2 +++ b/tools/libxc/xg_private.h	Thu Sep 29 16:22:02 2005 -0600
    27.3 @@ -28,25 +28,27 @@ unsigned long csum_page (void * page);
    27.4  #define _PAGE_PSE       0x080
    27.5  #define _PAGE_GLOBAL    0x100
    27.6  
    27.7 -#if defined(__i386__)
    27.8 -#define L1_PAGETABLE_SHIFT       12
    27.9 -#define L2_PAGETABLE_SHIFT       22
   27.10  #define L1_PAGETABLE_SHIFT_PAE   12
   27.11  #define L2_PAGETABLE_SHIFT_PAE   21
   27.12  #define L3_PAGETABLE_SHIFT_PAE   30
   27.13 +
   27.14 +#if defined(__i386__)
   27.15 +#define L1_PAGETABLE_SHIFT       12
   27.16 +#define L2_PAGETABLE_SHIFT       22
   27.17  #elif defined(__x86_64__)
   27.18 -#define L1_PAGETABLE_SHIFT      12
   27.19 -#define L2_PAGETABLE_SHIFT      21
   27.20 -#define L3_PAGETABLE_SHIFT      30
   27.21 -#define L4_PAGETABLE_SHIFT      39
   27.22 +#define L1_PAGETABLE_SHIFT       12
   27.23 +#define L2_PAGETABLE_SHIFT       21
   27.24 +#define L3_PAGETABLE_SHIFT       30
   27.25 +#define L4_PAGETABLE_SHIFT       39
   27.26  #endif
   27.27  
   27.28 -#if defined(__i386__) 
   27.29 -#define ENTRIES_PER_L1_PAGETABLE 1024
   27.30 -#define ENTRIES_PER_L2_PAGETABLE 1024
   27.31  #define L1_PAGETABLE_ENTRIES_PAE  512
   27.32  #define L2_PAGETABLE_ENTRIES_PAE  512
   27.33  #define L3_PAGETABLE_ENTRIES_PAE    4
   27.34 +
   27.35 +#if defined(__i386__) 
   27.36 +#define L1_PAGETABLE_ENTRIES   1024
   27.37 +#define L2_PAGETABLE_ENTRIES   1024
   27.38  #elif defined(__x86_64__)
   27.39  #define L1_PAGETABLE_ENTRIES    512
   27.40  #define L2_PAGETABLE_ENTRIES    512
   27.41 @@ -70,17 +72,18 @@ typedef unsigned long l3_pgentry_t;
   27.42  typedef unsigned long l4_pgentry_t;
   27.43  #endif
   27.44  
   27.45 -#if defined(__i386__)
   27.46 -#define l1_table_offset(_a) \
   27.47 -          (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   27.48 -#define l2_table_offset(_a) \
   27.49 -          ((_a) >> L2_PAGETABLE_SHIFT)
   27.50  #define l1_table_offset_pae(_a) \
   27.51    (((_a) >> L1_PAGETABLE_SHIFT_PAE) & (L1_PAGETABLE_ENTRIES_PAE - 1))
   27.52  #define l2_table_offset_pae(_a) \
   27.53    (((_a) >> L2_PAGETABLE_SHIFT_PAE) & (L2_PAGETABLE_ENTRIES_PAE - 1))
   27.54  #define l3_table_offset_pae(_a) \
   27.55  	(((_a) >> L3_PAGETABLE_SHIFT_PAE) & (L3_PAGETABLE_ENTRIES_PAE - 1))
   27.56 +
   27.57 +#if defined(__i386__)
   27.58 +#define l1_table_offset(_a) \
   27.59 +          (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
   27.60 +#define l2_table_offset(_a) \
   27.61 +          ((_a) >> L2_PAGETABLE_SHIFT)
   27.62  #elif defined(__x86_64__)
   27.63  #define l1_table_offset(_a) \
   27.64    (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
    28.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Thu Sep 29 13:35:13 2005 -0600
    28.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Thu Sep 29 16:22:02 2005 -0600
    28.3 @@ -17,7 +17,6 @@
    28.4  #include <arpa/inet.h>
    28.5  
    28.6  #include "xc_private.h"
    28.7 -#include "linux_boot_params.h"
    28.8  
    28.9  /* Needed for Python versions earlier than 2.3. */
   28.10  #ifndef PyMODINIT_FUNC
   28.11 @@ -310,80 +309,24 @@ static PyObject *pyxc_vmx_build(PyObject
   28.12      XcObject *xc = (XcObject *)self;
   28.13  
   28.14      u32   dom;
   28.15 -    char *image, *ramdisk = NULL, *cmdline = "";
   28.16 -    PyObject *memmap;
   28.17 +    char *image;
   28.18      int   control_evtchn, store_evtchn;
   28.19      int flags = 0, vcpus = 1;
   28.20 -    int numItems, i;
   28.21      int memsize;
   28.22 -    struct mem_map mem_map;
   28.23      unsigned long store_mfn = 0;
   28.24  
   28.25      static char *kwd_list[] = { "dom", "control_evtchn", "store_evtchn",
   28.26 -                                "memsize", "image", "memmap",
   28.27 -				"ramdisk", "cmdline", "flags",
   28.28 -				"vcpus", NULL };
   28.29 +                                "memsize", "image", "flags", "vcpus", NULL };
   28.30  
   28.31 -    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisO!|ssii", kwd_list, 
   28.32 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisii", kwd_list,
   28.33                                        &dom, &control_evtchn, &store_evtchn,
   28.34 -                                      &memsize,
   28.35 -                                      &image, &PyList_Type, &memmap,
   28.36 -				      &ramdisk, &cmdline, &flags, &vcpus) )
   28.37 +                                      &memsize, &image, &flags, &vcpus) )
   28.38          return NULL;
   28.39  
   28.40 -    memset(&mem_map, 0, sizeof(mem_map));
   28.41 -    /* Parse memmap */
   28.42 -
   28.43 -    /* get the number of lines passed to us */
   28.44 -    numItems = PyList_Size(memmap) - 1;	/* removing the line 
   28.45 -					   containing "memmap" */
   28.46 -    mem_map.nr_map = numItems;
   28.47 -   
   28.48 -    /* should raise an error here. */
   28.49 -    if (numItems < 0) return NULL; /* Not a list */
   28.50 -
   28.51 -    /* iterate over items of the list, grabbing ranges and parsing them */
   28.52 -    for (i = 1; i <= numItems; i++) {	// skip over "memmap"
   28.53 -	    PyObject *item, *f1, *f2, *f3, *f4;
   28.54 -	    int numFields;
   28.55 -	    unsigned long lf1, lf2, lf3, lf4;
   28.56 -	    char *sf1, *sf2;
   28.57 -	    
   28.58 -	    /* grab the string object from the next element of the list */
   28.59 -	    item = PyList_GetItem(memmap, i); /* Can't fail */
   28.60 -
   28.61 -	    /* get the number of lines passed to us */
   28.62 -	    numFields = PyList_Size(item);
   28.63 +    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, control_evtchn,
   28.64 +                      flags, vcpus, store_evtchn, &store_mfn) != 0 )
   28.65 +        return PyErr_SetFromErrno(xc_error);
   28.66  
   28.67 -	    if (numFields != 4)
   28.68 -		    return NULL;
   28.69 -
   28.70 -	    f1 = PyList_GetItem(item, 0);
   28.71 -	    f2 = PyList_GetItem(item, 1);
   28.72 -	    f3 = PyList_GetItem(item, 2);
   28.73 -	    f4 = PyList_GetItem(item, 3);
   28.74 -
   28.75 -	    /* Convert objects to strings/longs */
   28.76 -	    sf1 = PyString_AsString(f1);
   28.77 -	    sf2 = PyString_AsString(f2);
   28.78 -	    lf3 = PyLong_AsLong(f3);
   28.79 -	    lf4 = PyLong_AsLong(f4);
   28.80 -	    if ( sscanf(sf1, "%lx", &lf1) != 1 )
   28.81 -                return NULL;
   28.82 -	    if ( sscanf(sf2, "%lx", &lf2) != 1 )
   28.83 -                return NULL;
   28.84 -
   28.85 -            mem_map.map[i-1].addr = lf1;
   28.86 -            mem_map.map[i-1].size = lf2 - lf1;
   28.87 -            mem_map.map[i-1].type = lf3;
   28.88 -            mem_map.map[i-1].caching_attr = lf4;
   28.89 -    }
   28.90 -
   28.91 -    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, &mem_map,
   28.92 -                        ramdisk, cmdline, control_evtchn, flags,
   28.93 -                        vcpus, store_evtchn, &store_mfn) != 0 )
   28.94 -        return PyErr_SetFromErrno(xc_error);
   28.95 -    
   28.96      return Py_BuildValue("{s:i}", "store_mfn", store_mfn);
   28.97  }
   28.98  
    29.1 --- a/tools/python/xen/lowlevel/xs/xs.c	Thu Sep 29 13:35:13 2005 -0600
    29.2 +++ b/tools/python/xen/lowlevel/xs/xs.c	Thu Sep 29 16:22:02 2005 -0600
    29.3 @@ -582,9 +582,8 @@ static PyObject *xspy_unwatch(PyObject *
    29.4  }
    29.5  
    29.6  #define xspy_transaction_start_doc "\n"				\
    29.7 -	"Start a transaction on a path.\n"			\
    29.8 +	"Start a transaction.\n"				\
    29.9  	"Only one transaction can be active at a time.\n"	\
   29.10 -	" path [string]: xenstore path.\n"			\
   29.11  	"\n"							\
   29.12  	"Returns None on success.\n"				\
   29.13  	"Raises RuntimeError on error.\n"			\
   29.14 @@ -593,8 +592,8 @@ static PyObject *xspy_unwatch(PyObject *
   29.15  static PyObject *xspy_transaction_start(PyObject *self, PyObject *args,
   29.16                                          PyObject *kwds)
   29.17  {
   29.18 -    static char *kwd_spec[] = { "path", NULL };
   29.19 -    static char *arg_spec = "s|";
   29.20 +    static char *kwd_spec[] = { NULL };
   29.21 +    static char *arg_spec = "";
   29.22      char *path = NULL;
   29.23  
   29.24      struct xs_handle *xh = xshandle(self);
   29.25 @@ -606,7 +605,7 @@ static PyObject *xspy_transaction_start(
   29.26      if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, &path))
   29.27          goto exit;
   29.28      Py_BEGIN_ALLOW_THREADS
   29.29 -    xsval = xs_transaction_start(xh, path);
   29.30 +    xsval = xs_transaction_start(xh);
   29.31      Py_END_ALLOW_THREADS
   29.32      if (!xsval) {
   29.33          PyErr_SetFromErrno(PyExc_RuntimeError);
   29.34 @@ -623,7 +622,7 @@ static PyObject *xspy_transaction_start(
   29.35  	"Attempts to commit the transaction unless abort is true.\n"	\
   29.36  	" abort [int]: abort flag (default 0).\n"			\
   29.37  	"\n"								\
   29.38 -	"Returns None on success.\n"					\
   29.39 +	"Returns True on success, False if you need to try again.\n"	\
   29.40  	"Raises RuntimeError on error.\n"				\
   29.41  	"\n"
   29.42  
   29.43 @@ -646,11 +645,16 @@ static PyObject *xspy_transaction_end(Py
   29.44      xsval = xs_transaction_end(xh, abort);
   29.45      Py_END_ALLOW_THREADS
   29.46      if (!xsval) {
   29.47 +	if (errno == EAGAIN) {
   29.48 +	    Py_INCREF(Py_False);
   29.49 +	    val = Py_False;
   29.50 +	    goto exit;
   29.51 +	}
   29.52          PyErr_SetFromErrno(PyExc_RuntimeError);
   29.53          goto exit;
   29.54      }
   29.55 -    Py_INCREF(Py_None);
   29.56 -    val = Py_None;
   29.57 +    Py_INCREF(Py_True);
   29.58 +    val = Py_True;
   29.59   exit:
   29.60      return val;
   29.61  }
    30.1 --- a/tools/python/xen/util/memmap.py	Thu Sep 29 13:35:13 2005 -0600
    30.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.3 @@ -1,41 +0,0 @@
    30.4 -mem_caching_attr = {
    30.5 -    'UC' : 0,
    30.6 -    'WC' : 1,
    30.7 -    'WT' : 4,
    30.8 -    'WP' : 5,
    30.9 -    'WB' : 6,
   30.10 -    };
   30.11 -
   30.12 -e820_mem_type = {
   30.13 -    'AddressRangeMemory'    : 1,
   30.14 -    'AddressRangeReserved'  : 2,
   30.15 -    'AddressRangeACPI'      : 3,
   30.16 -    'AddressRangeNVS'       : 4,
   30.17 -    'AddressRangeIO'        : 16,
   30.18 -    'AddressRangeShared'    : 17,
   30.19 -};
   30.20 -
   30.21 -MT_COL = 2
   30.22 -MA_COL = 3
   30.23 -
   30.24 -def strmap(row):
   30.25 -   if (type(row) != type([])):
   30.26 -       return row
   30.27 -   row[MT_COL] = e820_mem_type[row[MT_COL]]
   30.28 -   row[MA_COL] = mem_caching_attr[row[MA_COL]]
   30.29 -   return row
   30.30 -
   30.31 -def memmap_parse(memmap):
   30.32 -    return map(strmap, memmap)
   30.33 -
   30.34 -if __name__ == '__main__':
   30.35 -   memmap = [ 'memmap',
   30.36 -              [ '1', '2', 'AddressRangeMemory', 'UC'],
   30.37 -              [ '1', '2', 'AddressRangeReserved', 'UC'],
   30.38 -              [ '1', '2', 'AddressRangeACPI', 'WB'],
   30.39 -              [ '1', '2', 'AddressRangeNVS', 'WB'],
   30.40 -              [ '1', '2', 'AddressRangeIO', 'WB'],
   30.41 -              [ '1', '2', 'AddressRangeShared', 'WB']]
   30.42 -   print memmap_parse(memmap);
   30.43 -
   30.44 -
    31.1 --- a/tools/python/xen/util/tempfile.py	Thu Sep 29 13:35:13 2005 -0600
    31.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.3 @@ -1,451 +0,0 @@
    31.4 -"""Temporary files.
    31.5 -
    31.6 -This module provides generic, low- and high-level interfaces for
    31.7 -creating temporary files and directories.  The interfaces listed
    31.8 -as "safe" just below can be used without fear of race conditions.
    31.9 -Those listed as "unsafe" cannot, and are provided for backward
   31.10 -compatibility only.
   31.11 -
   31.12 -This module also provides some data items to the user:
   31.13 -
   31.14 -  TMP_MAX  - maximum number of names that will be tried before
   31.15 -             giving up.
   31.16 -  template - the default prefix for all temporary names.
   31.17 -             You may change this to control the default prefix.
   31.18 -  tempdir  - If this is set to a string before the first use of
   31.19 -             any routine from this module, it will be considered as
   31.20 -             another candidate location to store temporary files.
   31.21 -"""
   31.22 -
   31.23 -__all__ = [
   31.24 -    "NamedTemporaryFile", "TemporaryFile", # high level safe interfaces
   31.25 -    "mkstemp", "mkdtemp",                  # low level safe interfaces
   31.26 -    "mktemp",                              # deprecated unsafe interface
   31.27 -    "TMP_MAX", "gettempprefix",            # constants
   31.28 -    "tempdir", "gettempdir"
   31.29 -   ]
   31.30 -
   31.31 -
   31.32 -# Imports.
   31.33 -
   31.34 -import os as _os
   31.35 -import errno as _errno
   31.36 -from random import Random as _Random
   31.37 -
   31.38 -if _os.name == 'mac':
   31.39 -    import Carbon.Folder as _Folder
   31.40 -    import Carbon.Folders as _Folders
   31.41 -
   31.42 -try:
   31.43 -    import fcntl as _fcntl
   31.44 -    # If PYTHONCASEOK is set on Windows, stinking FCNTL.py gets
   31.45 -    # imported, and we don't get an ImportError then.  Provoke
   31.46 -    # an AttributeError instead in that case.
   31.47 -    _fcntl.fcntl
   31.48 -except (ImportError, AttributeError):
   31.49 -    def _set_cloexec(fd):
   31.50 -        pass
   31.51 -else:
   31.52 -    def _set_cloexec(fd):
   31.53 -        flags = _fcntl.fcntl(fd, _fcntl.F_GETFD, 0)
   31.54 -        if flags >= 0:
   31.55 -            # flags read successfully, modify
   31.56 -            flags |= _fcntl.FD_CLOEXEC
   31.57 -            _fcntl.fcntl(fd, _fcntl.F_SETFD, flags)
   31.58 -
   31.59 -
   31.60 -try:
   31.61 -    import thread as _thread
   31.62 -except ImportError:
   31.63 -    import dummy_thread as _thread
   31.64 -_allocate_lock = _thread.allocate_lock
   31.65 -
   31.66 -_text_openflags = _os.O_RDWR | _os.O_CREAT | _os.O_EXCL
   31.67 -if hasattr(_os, 'O_NOINHERIT'):
   31.68 -    _text_openflags |= _os.O_NOINHERIT
   31.69 -if hasattr(_os, 'O_NOFOLLOW'):
   31.70 -    _text_openflags |= _os.O_NOFOLLOW
   31.71 -
   31.72 -_bin_openflags = _text_openflags
   31.73 -if hasattr(_os, 'O_BINARY'):
   31.74 -    _bin_openflags |= _os.O_BINARY
   31.75 -
   31.76 -if hasattr(_os, 'TMP_MAX'):
   31.77 -    TMP_MAX = _os.TMP_MAX
   31.78 -else:
   31.79 -    TMP_MAX = 10000
   31.80 -
   31.81 -template = "tmp"
   31.82 -
   31.83 -tempdir = None
   31.84 -
   31.85 -# Internal routines.
   31.86 -
   31.87 -_once_lock = _allocate_lock()
   31.88 -
   31.89 -class _RandomNameSequence:
   31.90 -    """An instance of _RandomNameSequence generates an endless
   31.91 -    sequence of unpredictable strings which can safely be incorporated
   31.92 -    into file names.  Each string is six characters long.  Multiple
   31.93 -    threads can safely use the same instance at the same time.
   31.94 -
   31.95 -    _RandomNameSequence is an iterator."""
   31.96 -
   31.97 -    characters = ("abcdefghijklmnopqrstuvwxyz" +
   31.98 -                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
   31.99 -                  "0123456789-_")
  31.100 -
  31.101 -    def __init__(self):
  31.102 -        self.mutex = _allocate_lock()
  31.103 -        self.rng = _Random()
  31.104 -        self.normcase = _os.path.normcase
  31.105 -
  31.106 -    def __iter__(self):
  31.107 -        return self
  31.108 -
  31.109 -    def next(self):
  31.110 -        m = self.mutex
  31.111 -        c = self.characters
  31.112 -        choose = self.rng.choice
  31.113 -
  31.114 -        m.acquire()
  31.115 -        try:
  31.116 -            letters = [choose(c) for dummy in "123456"]
  31.117 -        finally:
  31.118 -            m.release()
  31.119 -
  31.120 -        return self.normcase(''.join(letters))
  31.121 -
  31.122 -def _candidate_tempdir_list():
  31.123 -    """Generate a list of candidate temporary directories which
  31.124 -    _get_default_tempdir will try."""
  31.125 -
  31.126 -    dirlist = []
  31.127 -
  31.128 -    # First, try the environment.
  31.129 -    for envname in 'TMPDIR', 'TEMP', 'TMP':
  31.130 -        dirname = _os.getenv(envname)
  31.131 -        if dirname: dirlist.append(dirname)
  31.132 -
  31.133 -    # Failing that, try OS-specific locations.
  31.134 -    if _os.name == 'mac':
  31.135 -        try:
  31.136 -            fsr = _Folder.FSFindFolder(_Folders.kOnSystemDisk,
  31.137 -                                              _Folders.kTemporaryFolderType, 1)
  31.138 -            dirname = fsr.as_pathname()
  31.139 -            dirlist.append(dirname)
  31.140 -        except _Folder.error:
  31.141 -            pass
  31.142 -    elif _os.name == 'riscos':
  31.143 -        dirname = _os.getenv('Wimp$ScrapDir')
  31.144 -        if dirname: dirlist.append(dirname)
  31.145 -    elif _os.name == 'nt':
  31.146 -        dirlist.extend([ r'c:\temp', r'c:\tmp', r'\temp', r'\tmp' ])
  31.147 -    else:
  31.148 -        dirlist.extend([ '/tmp', '/var/tmp', '/usr/tmp' ])
  31.149 -
  31.150 -    # As a last resort, the current directory.
  31.151 -    try:
  31.152 -        dirlist.append(_os.getcwd())
  31.153 -    except (AttributeError, _os.error):
  31.154 -        dirlist.append(_os.curdir)
  31.155 -
  31.156 -    return dirlist
  31.157 -
  31.158 -def _get_default_tempdir():
  31.159 -    """Calculate the default directory to use for temporary files.
  31.160 -    This routine should be called exactly once.
  31.161 -
  31.162 -    We determine whether or not a candidate temp dir is usable by
  31.163 -    trying to create and write to a file in that directory.  If this
  31.164 -    is successful, the test file is deleted.  To prevent denial of
  31.165 -    service, the name of the test file must be randomized."""
  31.166 -
  31.167 -    namer = _RandomNameSequence()
  31.168 -    dirlist = _candidate_tempdir_list()
  31.169 -    flags = _text_openflags
  31.170 -
  31.171 -    for dir in dirlist:
  31.172 -        if dir != _os.curdir:
  31.173 -            dir = _os.path.normcase(_os.path.abspath(dir))
  31.174 -        # Try only a few names per directory.
  31.175 -        for seq in xrange(100):
  31.176 -            name = namer.next()
  31.177 -            filename = _os.path.join(dir, name)
  31.178 -            try:
  31.179 -                fd = _os.open(filename, flags, 0600)
  31.180 -                fp = _os.fdopen(fd, 'w')
  31.181 -                fp.write('blat')
  31.182 -                fp.close()
  31.183 -                _os.unlink(filename)
  31.184 -                del fp, fd
  31.185 -                return dir
  31.186 -            except (OSError, IOError), e:
  31.187 -                if e[0] != _errno.EEXIST:
  31.188 -                    break # no point trying more names in this directory
  31.189 -                pass
  31.190 -    raise IOError, (_errno.ENOENT,
  31.191 -                    ("No usable temporary directory found in %s" % dirlist))
  31.192 -
  31.193 -_name_sequence = None
  31.194 -
  31.195 -def _get_candidate_names():
  31.196 -    """Common setup sequence for all user-callable interfaces."""
  31.197 -
  31.198 -    global _name_sequence
  31.199 -    if _name_sequence is None:
  31.200 -        _once_lock.acquire()
  31.201 -        try:
  31.202 -            if _name_sequence is None:
  31.203 -                _name_sequence = _RandomNameSequence()
  31.204 -        finally:
  31.205 -            _once_lock.release()
  31.206 -    return _name_sequence
  31.207 -
  31.208 -
  31.209 -def _mkstemp_inner(dir, pre, suf, flags):
  31.210 -    """Code common to mkstemp, TemporaryFile, and NamedTemporaryFile."""
  31.211 -
  31.212 -    names = _get_candidate_names()
  31.213 -
  31.214 -    for seq in xrange(TMP_MAX):
  31.215 -        name = names.next()
  31.216 -        file = _os.path.join(dir, pre + name + suf)
  31.217 -        try:
  31.218 -            fd = _os.open(file, flags, 0600)
  31.219 -            _set_cloexec(fd)
  31.220 -            return (fd, file)
  31.221 -        except OSError, e:
  31.222 -            if e.errno == _errno.EEXIST:
  31.223 -                continue # try again
  31.224 -            raise
  31.225 -
  31.226 -    raise IOError, (_errno.EEXIST, "No usable temporary file name found")
  31.227 -
  31.228 -
  31.229 -# User visible interfaces.
  31.230 -
  31.231 -def gettempprefix():
  31.232 -    """Accessor for tempdir.template."""
  31.233 -    return template
  31.234 -
  31.235 -tempdir = None
  31.236 -
  31.237 -def gettempdir():
  31.238 -    """Accessor for tempdir.tempdir."""
  31.239 -    global tempdir
  31.240 -    if tempdir is None:
  31.241 -        _once_lock.acquire()
  31.242 -        try:
  31.243 -            if tempdir is None:
  31.244 -                tempdir = _get_default_tempdir()
  31.245 -        finally:
  31.246 -            _once_lock.release()
  31.247 -    return tempdir
  31.248 -
  31.249 -def mkstemp(suffix="", prefix=template, dir=None, text=False):
  31.250 -    """mkstemp([suffix, [prefix, [dir, [text]]]])
  31.251 -    User-callable function to create and return a unique temporary
  31.252 -    file.  The return value is a pair (fd, name) where fd is the
  31.253 -    file descriptor returned by os.open, and name is the filename.
  31.254 -
  31.255 -    If 'suffix' is specified, the file name will end with that suffix,
  31.256 -    otherwise there will be no suffix.
  31.257 -
  31.258 -    If 'prefix' is specified, the file name will begin with that prefix,
  31.259 -    otherwise a default prefix is used.
  31.260 -
  31.261 -    If 'dir' is specified, the file will be created in that directory,
  31.262 -    otherwise a default directory is used.
  31.263 -
  31.264 -    If 'text' is specified and true, the file is opened in text
  31.265 -    mode.  Else (the default) the file is opened in binary mode.  On
  31.266 -    some operating systems, this makes no difference.
  31.267 -
  31.268 -    The file is readable and writable only by the creating user ID.
  31.269 -    If the operating system uses permission bits to indicate whether a
  31.270 -    file is executable, the file is executable by no one. The file
  31.271 -    descriptor is not inherited by children of this process.
  31.272 -
  31.273 -    Caller is responsible for deleting the file when done with it.
  31.274 -    """
  31.275 -
  31.276 -    if dir is None:
  31.277 -        dir = gettempdir()
  31.278 -
  31.279 -    if text:
  31.280 -        flags = _text_openflags
  31.281 -    else:
  31.282 -        flags = _bin_openflags
  31.283 -
  31.284 -    return _mkstemp_inner(dir, prefix, suffix, flags)
  31.285 -
  31.286 -
  31.287 -def mkdtemp(suffix="", prefix=template, dir=None):
  31.288 -    """mkdtemp([suffix, [prefix, [dir]]])
  31.289 -    User-callable function to create and return a unique temporary
  31.290 -    directory.  The return value is the pathname of the directory.
  31.291 -
  31.292 -    Arguments are as for mkstemp, except that the 'text' argument is
  31.293 -    not accepted.
  31.294 -
  31.295 -    The directory is readable, writable, and searchable only by the
  31.296 -    creating user.
  31.297 -
  31.298 -    Caller is responsible for deleting the directory when done with it.
  31.299 -    """
  31.300 -
  31.301 -    if dir is None:
  31.302 -        dir = gettempdir()
  31.303 -
  31.304 -    names = _get_candidate_names()
  31.305 -
  31.306 -    for seq in xrange(TMP_MAX):
  31.307 -        name = names.next()
  31.308 -        file = _os.path.join(dir, prefix + name + suffix)
  31.309 -        try:
  31.310 -            _os.mkdir(file, 0700)
  31.311 -            return file
  31.312 -        except OSError, e:
  31.313 -            if e.errno == _errno.EEXIST:
  31.314 -                continue # try again
  31.315 -            raise
  31.316 -
  31.317 -    raise IOError, (_errno.EEXIST, "No usable temporary directory name found")
  31.318 -
  31.319 -def mktemp(suffix="", prefix=template, dir=None):
  31.320 -    """mktemp([suffix, [prefix, [dir]]])
  31.321 -    User-callable function to return a unique temporary file name.  The
  31.322 -    file is not created.
  31.323 -
  31.324 -    Arguments are as for mkstemp, except that the 'text' argument is
  31.325 -    not accepted.
  31.326 -
  31.327 -    This function is unsafe and should not be used.  The file name
  31.328 -    refers to a file that did not exist at some point, but by the time
  31.329 -    you get around to creating it, someone else may have beaten you to
  31.330 -    the punch.
  31.331 -    """
  31.332 -
  31.333 -##    from warnings import warn as _warn
  31.334 -##    _warn("mktemp is a potential security risk to your program",
  31.335 -##          RuntimeWarning, stacklevel=2)
  31.336 -
  31.337 -    if dir is None:
  31.338 -        dir = gettempdir()
  31.339 -
  31.340 -    names = _get_candidate_names()
  31.341 -    for seq in xrange(TMP_MAX):
  31.342 -        name = names.next()
  31.343 -        file = _os.path.join(dir, prefix + name + suffix)
  31.344 -        if not _os.path.exists(file):
  31.345 -            return file
  31.346 -
  31.347 -    raise IOError, (_errno.EEXIST, "No usable temporary filename found")
  31.348 -
  31.349 -class _TemporaryFileWrapper:
  31.350 -    """Temporary file wrapper
  31.351 -
  31.352 -    This class provides a wrapper around files opened for
  31.353 -    temporary use.  In particular, it seeks to automatically
  31.354 -    remove the file when it is no longer needed.
  31.355 -    """
  31.356 -
  31.357 -    def __init__(self, file, name):
  31.358 -        self.file = file
  31.359 -        self.name = name
  31.360 -        self.close_called = False
  31.361 -
  31.362 -    def __getattr__(self, name):
  31.363 -        file = self.__dict__['file']
  31.364 -        a = getattr(file, name)
  31.365 -        if type(a) != type(0):
  31.366 -            setattr(self, name, a)
  31.367 -        return a
  31.368 -
  31.369 -    # NT provides delete-on-close as a primitive, so we don't need
  31.370 -    # the wrapper to do anything special.  We still use it so that
  31.371 -    # file.name is useful (i.e. not "(fdopen)") with NamedTemporaryFile.
  31.372 -    if _os.name != 'nt':
  31.373 -
  31.374 -        # Cache the unlinker so we don't get spurious errors at
  31.375 -        # shutdown when the module-level "os" is None'd out.  Note
  31.376 -        # that this must be referenced as self.unlink, because the
  31.377 -        # name TemporaryFileWrapper may also get None'd out before
  31.378 -        # __del__ is called.
  31.379 -        unlink = _os.unlink
  31.380 -
  31.381 -        def close(self):
  31.382 -            if not self.close_called:
  31.383 -                self.close_called = True
  31.384 -                self.file.close()
  31.385 -                self.unlink(self.name)
  31.386 -
  31.387 -        def __del__(self):
  31.388 -            self.close()
  31.389 -
  31.390 -def NamedTemporaryFile(mode='w+b', bufsize=-1, suffix="",
  31.391 -                       prefix=template, dir=None):
  31.392 -    """Create and return a temporary file.
  31.393 -    Arguments:
  31.394 -    'prefix', 'suffix', 'dir' -- as for mkstemp.
  31.395 -    'mode' -- the mode argument to os.fdopen (default "w+b").
  31.396 -    'bufsize' -- the buffer size argument to os.fdopen (default -1).
  31.397 -    The file is created as mkstemp() would do it.
  31.398 -
  31.399 -    Returns a file object; the name of the file is accessible as
  31.400 -    file.name.  The file will be automatically deleted when it is
  31.401 -    closed.
  31.402 -    """
  31.403 -
  31.404 -    if dir is None:
  31.405 -        dir = gettempdir()
  31.406 -
  31.407 -    if 'b' in mode:
  31.408 -        flags = _bin_openflags
  31.409 -    else:
  31.410 -        flags = _text_openflags
  31.411 -
  31.412 -    # Setting O_TEMPORARY in the flags causes the OS to delete
  31.413 -    # the file when it is closed.  This is only supported by Windows.
  31.414 -    if _os.name == 'nt':
  31.415 -        flags |= _os.O_TEMPORARY
  31.416 -
  31.417 -    (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  31.418 -    file = _os.fdopen(fd, mode, bufsize)
  31.419 -    return _TemporaryFileWrapper(file, name)
  31.420 -
  31.421 -if _os.name != 'posix' or _os.sys.platform == 'cygwin':
  31.422 -    # On non-POSIX and Cygwin systems, assume that we cannot unlink a file
  31.423 -    # while it is open.
  31.424 -    TemporaryFile = NamedTemporaryFile
  31.425 -
  31.426 -else:
  31.427 -    def TemporaryFile(mode='w+b', bufsize=-1, suffix="",
  31.428 -                      prefix=template, dir=None):
  31.429 -        """Create and return a temporary file.
  31.430 -        Arguments:
  31.431 -        'prefix', 'suffix', 'directory' -- as for mkstemp.
  31.432 -        'mode' -- the mode argument to os.fdopen (default "w+b").
  31.433 -        'bufsize' -- the buffer size argument to os.fdopen (default -1).
  31.434 -        The file is created as mkstemp() would do it.
  31.435 -
  31.436 -        Returns a file object.  The file has no name, and will cease to
  31.437 -        exist when it is closed.
  31.438 -        """
  31.439 -
  31.440 -        if dir is None:
  31.441 -            dir = gettempdir()
  31.442 -
  31.443 -        if 'b' in mode:
  31.444 -            flags = _bin_openflags
  31.445 -        else:
  31.446 -            flags = _text_openflags
  31.447 -
  31.448 -        (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  31.449 -        try:
  31.450 -            _os.unlink(name)
  31.451 -            return _os.fdopen(fd, mode, bufsize)
  31.452 -        except:
  31.453 -            _os.close(fd)
  31.454 -            raise
    32.1 --- a/tools/python/xen/xend/Blkctl.py	Thu Sep 29 13:35:13 2005 -0600
    32.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.3 @@ -1,43 +0,0 @@
    32.4 -"""Xend interface to block control scripts.
    32.5 -"""
    32.6 -import os
    32.7 -import os.path
    32.8 -import sys
    32.9 -import string
   32.10 -import xen.util.process
   32.11 -
   32.12 -from xen.xend import XendRoot
   32.13 -
   32.14 -xroot = XendRoot.instance()
   32.15 -
   32.16 -"""Where network control scripts live."""
   32.17 -SCRIPT_DIR = xroot.block_script_dir
   32.18 -
   32.19 -def block(op, type, dets, script=None):
   32.20 -    """Call a block control script.
   32.21 -    Xend calls this with op 'bind' when it is about to export a block device
   32.22 -    (other than a raw partition).  The script is called with unbind when a
   32.23 -    device is no longer in use and should be removed.
   32.24 -
   32.25 -    @param op:        operation (start, stop, status)
   32.26 -    @param type:      type of block device (determines the script used)
   32.27 -    @param dets:      arguments to the control script
   32.28 -    @param script:    block script name
   32.29 -    """
   32.30 -    
   32.31 -    if op not in ['bind', 'unbind']:
   32.32 -        raise ValueError('Invalid operation:' + op)
   32.33 -
   32.34 -    # Special case phy devices - they don't require any (un)binding
   32.35 -    # Parallax also doesn't need script-based binding.
   32.36 -    if (type == 'phy') or (type == 'parallax'):
   32.37 -        return dets
   32.38 -    
   32.39 -    if script is None:
   32.40 -        script = xroot.get_block_script(type)
   32.41 -    script = os.path.join(SCRIPT_DIR, script)
   32.42 -    args = [op] + string.split(dets, ':')
   32.43 -    args = ' '.join(args)
   32.44 -    ret = xen.util.process.runscript(script + ' ' + args)
   32.45 -    if len(ret):
   32.46 -        return ret.splitlines()[0]
    33.1 --- a/tools/python/xen/xend/PrettyPrint.py	Thu Sep 29 13:35:13 2005 -0600
    33.2 +++ b/tools/python/xen/xend/PrettyPrint.py	Thu Sep 29 16:22:02 2005 -0600
    33.3 @@ -252,7 +252,7 @@ class PrettyPrinter:
    33.4          self.block = self.block.parent
    33.5  
    33.6      def prettyprint(self, out=sys.stdout):
    33.7 -        self.top.prettyprint(Line(out, self.width))
    33.8 +        self.top.prettyprint(Line(out, self.width), self.width)
    33.9  
   33.10  class SXPPrettyPrinter(PrettyPrinter):
   33.11      """An SXP prettyprinter.
    34.1 --- a/tools/python/xen/xend/XendDB.py	Thu Sep 29 13:35:13 2005 -0600
    34.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.3 @@ -1,127 +0,0 @@
    34.4 -#============================================================================
    34.5 -# This library is free software; you can redistribute it and/or
    34.6 -# modify it under the terms of version 2.1 of the GNU Lesser General Public
    34.7 -# License as published by the Free Software Foundation.
    34.8 -#
    34.9 -# This library is distributed in the hope that it will be useful,
   34.10 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
   34.11 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   34.12 -# Lesser General Public License for more details.
   34.13 -#
   34.14 -# You should have received a copy of the GNU Lesser General Public
   34.15 -# License along with this library; if not, write to the Free Software
   34.16 -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   34.17 -#============================================================================
   34.18 -# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
   34.19 -#============================================================================
   34.20 -
   34.21 -import os
   34.22 -import os.path
   34.23 -import errno
   34.24 -import dircache
   34.25 -import time
   34.26 -
   34.27 -import sxp
   34.28 -import XendRoot
   34.29 -xroot = XendRoot.instance()
   34.30 -
   34.31 -class XendDB:
   34.32 -    """Persistence for Xend. Stores data in files and directories.
   34.33 -    """
   34.34 -
   34.35 -    def __init__(self, path=None):
   34.36 -        self.dbpath = xroot.get_dbroot()
   34.37 -        if path:
   34.38 -            self.dbpath = os.path.join(self.dbpath, path)
   34.39 -        pass
   34.40 -
   34.41 -    def listdir(self, dpath):
   34.42 -        try:
   34.43 -            return dircache.listdir(dpath)
   34.44 -        except:
   34.45 -            return []
   34.46 -
   34.47 -    def filepath(self, path):
   34.48 -        return os.path.join(self.dbpath, path)
   34.49 -        
   34.50 -    def fetch(self, path):
   34.51 -        fpath = self.filepath(path)
   34.52 -        return self.fetchfile(fpath)
   34.53 -
   34.54 -    def fetchfile(self, fpath):
   34.55 -        pin = sxp.Parser()
   34.56 -        fin = file(fpath, "rb")
   34.57 -        try:
   34.58 -            while 1:
   34.59 -                try:
   34.60 -                    buf = fin.read(1024)
   34.61 -                except IOError, ex:
   34.62 -                    if ex.errno == errno.EINTR:
   34.63 -                        continue
   34.64 -                    else:
   34.65 -                        raise
   34.66 -                pin.input(buf)
   34.67 -                if buf == '':
   34.68 -                    pin.input_eof()
   34.69 -                    break
   34.70 -        finally:
   34.71 -            fin.close()
   34.72 -        return pin.get_val()
   34.73 -
   34.74 -    def save(self, path, sxpr):
   34.75 -        fpath = self.filepath(path)
   34.76 -        return self.savefile(fpath, sxpr)
   34.77 -    
   34.78 -    def savefile(self, fpath, sxpr):
   34.79 -        backup = False
   34.80 -        fdir = os.path.dirname(fpath)
   34.81 -        if not os.path.isdir(fdir):
   34.82 -            os.makedirs(fdir)
   34.83 -        if os.path.exists(fpath):
   34.84 -            backup = True
   34.85 -            real_fpath = fpath
   34.86 -            fpath += ".new."
   34.87 -            
   34.88 -        fout = file(fpath, "wb+")
   34.89 -        try:
   34.90 -            try:
   34.91 -                t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
   34.92 -                fout.write("# %s %s\n" % (fpath, t))
   34.93 -                sxp.show(sxpr, out=fout)
   34.94 -            finally:
   34.95 -                fout.close()
   34.96 -        except:
   34.97 -            if backup:
   34.98 -                try:
   34.99 -                    os.unlink(fpath)
  34.100 -                except:
  34.101 -                    pass
  34.102 -                raise
  34.103 -        if backup:
  34.104 -            os.rename(fpath, real_fpath)
  34.105 -
  34.106 -    def fetchall(self, path):
  34.107 -        dpath = self.filepath(path)
  34.108 -        d = {}
  34.109 -        for k in self.listdir(dpath):
  34.110 -            try:
  34.111 -                v = self.fetchfile(os.path.join(dpath, k))
  34.112 -                d[k] = v
  34.113 -            except:
  34.114 -                pass
  34.115 -        return d
  34.116 -
  34.117 -    def saveall(self, path, d):
  34.118 -        for (k, v) in d.items():
  34.119 -            self.save(os.path.join(path, k), v)
  34.120 -
  34.121 -    def delete(self, path):
  34.122 -        dpath = self.filepath(path)
  34.123 -        os.unlink(dpath)
  34.124 -
  34.125 -    def ls(self, path):
  34.126 -        dpath = self.filepath(path)
  34.127 -        return self.listdir(dpath)
  34.128 -        
  34.129 -
  34.130 -        
    35.1 --- a/tools/python/xen/xend/XendDomain.py	Thu Sep 29 13:35:13 2005 -0600
    35.2 +++ b/tools/python/xen/xend/XendDomain.py	Thu Sep 29 16:22:02 2005 -0600
    35.3 @@ -433,12 +433,11 @@ class XendDomain:
    35.4              self.domain_shutdowns()
    35.5          return val
    35.6  
    35.7 +
    35.8      def domain_sysrq(self, id, key):
    35.9 -        """Send a SysRq to a domain
   35.10 -        """
   35.11 -        dominfo = self.domain_lookup(id)
   35.12 -        val = dominfo.send_sysrq(key)
   35.13 -        return val
   35.14 +        """Send a SysRq to the specified domain."""
   35.15 +        return self.callInfo(id, XendDomainInfo.send_sysrq, key)
   35.16 +
   35.17  
   35.18      def domain_shutdowns(self):
   35.19          """Process pending domain shutdowns.
   35.20 @@ -630,73 +629,45 @@ class XendDomain:
   35.21          except Exception, ex:
   35.22              raise XendError(str(ex))
   35.23  
   35.24 -    def domain_device_create(self, id, devconfig):
   35.25 -        """Create a new device for a domain.
   35.26  
   35.27 -        @param id:       domain id
   35.28 -        @param devconfig: device configuration
   35.29 +    def domain_device_create(self, domid, devconfig):
   35.30 +        """Create a new device for the specified domain.
   35.31          """
   35.32 -        dominfo = self.domain_lookup(id)
   35.33 -        val = dominfo.device_create(devconfig)
   35.34 -        dominfo.exportToDB()
   35.35 -        return val
   35.36 +        return self.callInfo(domid, XendDomainInfo.device_create, devconfig)
   35.37  
   35.38 -    def domain_device_configure(self, id, devconfig, devid):
   35.39 -        """Configure an existing device for a domain.
   35.40  
   35.41 -        @param id:   domain id
   35.42 -        @param devconfig: device configuration
   35.43 -        @param devid:  device id
   35.44 +    def domain_device_configure(self, domid, devconfig, devid):
   35.45 +        """Configure an existing device in the specified domain.
   35.46          @return: updated device configuration
   35.47          """
   35.48 -        dominfo = self.domain_lookup(id)
   35.49 -        val = dominfo.device_configure(devconfig, devid)
   35.50 -        dominfo.exportToDB()
   35.51 -        return val
   35.52 -    
   35.53 -    def domain_device_refresh(self, id, type, devid):
   35.54 -        """Refresh a device.
   35.55 +        return self.callInfo(domid, XendDomainInfo.device_configure,
   35.56 +                             devconfig, devid)
   35.57  
   35.58 -        @param id:  domain id
   35.59 -        @param devid:  device id
   35.60 -        @param type: device type
   35.61 -        """
   35.62 -        dominfo = self.domain_lookup(id)
   35.63 -        val = dominfo.device_refresh(type, devid)
   35.64 -        dominfo.exportToDB()
   35.65 -        return val
   35.66 -
   35.67 -    def domain_device_destroy(self, id, type, devid):
   35.68 -        """Destroy a device.
   35.69 -
   35.70 -        @param id:  domain id
   35.71 -        @param devid:  device id
   35.72 -        @param type: device type
   35.73 -        """
   35.74 -        dominfo = self.domain_lookup(id)
   35.75 -        return dominfo.destroyDevice(type, devid)
   35.76 +    
   35.77 +    def domain_device_refresh(self, domid, devtype, devid):
   35.78 +        """Refresh a device."""
   35.79 +        return self.callInfo(domid, XendDomainInfo.device_refresh, devtype,
   35.80 +                             devid)
   35.81  
   35.82  
   35.83 -    def domain_devtype_ls(self, id, type):
   35.84 -        """Get list of device sxprs for a domain.
   35.85 +    def domain_device_destroy(self, domid, devtype, devid):
   35.86 +        """Destroy a device."""
   35.87 +        return self.callInfo(domid, XendDomainInfo.destroyDevice, devtype,
   35.88 +                             devid)
   35.89  
   35.90 -        @param id:  domain
   35.91 -        @param type: device type
   35.92 -        @return: device sxprs
   35.93 -        """
   35.94 -        dominfo = self.domain_lookup(id)
   35.95 -        return dominfo.getDeviceSxprs(type)
   35.96  
   35.97 -    def domain_devtype_get(self, id, type, devid):
   35.98 +    def domain_devtype_ls(self, domid, devtype):
   35.99 +        """Get list of device sxprs for the specified domain."""
  35.100 +        return self.callInfo(domid, XendDomainInfo.getDeviceSxprs, devtype)
  35.101 +
  35.102 +
  35.103 +    def domain_devtype_get(self, domid, devtype, devid):
  35.104          """Get a device from a domain.
  35.105          
  35.106 -        @param id:  domain
  35.107 -        @param type: device type
  35.108 -        @param devid:  device id
  35.109          @return: device object (or None)
  35.110          """
  35.111 -        dominfo = self.domain_lookup(id)
  35.112 -        return dominfo.getDevice(type, devid)
  35.113 +        return self.callInfo(domid, XendDomainInfo.getDevice, devtype, devid)
  35.114 +
  35.115  
  35.116      def domain_vif_limit_set(self, id, vif, credit, period):
  35.117          """Limit the vif's transmission rate
  35.118 @@ -723,7 +694,7 @@ class XendDomain:
  35.119          """Set the memory limit for a domain.
  35.120  
  35.121          @param id: domain
  35.122 -        @param mem: memory limit (in MB)
  35.123 +        @param mem: memory limit (in MiB)
  35.124          @return: 0 on success, -1 on error
  35.125          """
  35.126          dominfo = self.domain_lookup(id)
  35.127 @@ -734,42 +705,37 @@ class XendDomain:
  35.128          except Exception, ex:
  35.129              raise XendError(str(ex))
  35.130  
  35.131 -    def domain_mem_target_set(self, id, mem):
  35.132 +    def domain_mem_target_set(self, domid, mem):
  35.133          """Set the memory target for a domain.
  35.134  
  35.135 -        @param id: domain
  35.136 -        @param mem: memory target (in MB)
  35.137 -        @return: 0 on success, -1 on error
  35.138 +        @param mem: memory target (in MiB)
  35.139          """
  35.140 -        dominfo = self.domain_lookup(id)
  35.141 -        return dominfo.setMemoryTarget(mem << 10)
  35.142 +        self.callInfo(domid, XendDomainInfo.setMemoryTarget, mem << 10)
  35.143  
  35.144 -    def domain_vcpu_hotplug(self, id, vcpu, state):
  35.145 -        """Enable or disable VCPU vcpu in DOM id
  35.146  
  35.147 -        @param id: domain
  35.148 +    def domain_vcpu_hotplug(self, domid, vcpu, state):
  35.149 +        """Enable or disable specified VCPU in specified domain
  35.150 +
  35.151          @param vcpu: target VCPU in domain
  35.152          @param state: which state VCPU will become
  35.153 -        @return: 0 on success, -1 on error
  35.154          """
  35.155 -
  35.156 -        dominfo = self.domain_lookup(id)
  35.157 -        return dominfo.vcpu_hotplug(vcpu, state)
  35.158 -
  35.159 -    def domain_dumpcore(self, id):
  35.160 -        """Save a core dump for a crashed domain.
  35.161 +        self.callInfo(domid, XendDomainInfo.vcpu_hotplug, vcpu, state)
  35.162  
  35.163 -        @param id: domain
  35.164 -        """
  35.165 -        dominfo = self.domain_lookup(id)
  35.166 -        corefile = "/var/xen/dump/%s.%s.core" % (dominfo.getName(),
  35.167 -                                                 dominfo.getDomid())
  35.168 -        try:
  35.169 -            xc.domain_dumpcore(dom=dominfo.getDomid(), corefile=corefile)
  35.170 -        except Exception, ex:
  35.171 -            log.warning("Dumpcore failed, id=%s name=%s: %s",
  35.172 -                        dominfo.getDomid(), dominfo.getName(), ex)
  35.173 -        
  35.174 +
  35.175 +    def domain_dumpcore(self, domid):
  35.176 +        """Save a core dump for a crashed domain."""
  35.177 +        self.callInfo(domid, XendDomainInfo.dumpCore)
  35.178 +
  35.179 +
  35.180 +    ## private:
  35.181 +
  35.182 +    def callInfo(self, domid, fn, *args, **kwargs):
  35.183 +        self.refresh()
  35.184 +        dominfo = self.domains.get(domid)
  35.185 +        if dominfo:
  35.186 +            return fn(dominfo, *args, **kwargs)
  35.187 +
  35.188 +
  35.189  def instance():
  35.190      """Singleton constructor. Use this instead of the class constructor.
  35.191      """
    36.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Thu Sep 29 13:35:13 2005 -0600
    36.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Thu Sep 29 16:22:02 2005 -0600
    36.3 @@ -34,6 +34,7 @@ from xen.util.blkif import blkdev_uname_
    36.4  
    36.5  from xen.xend.server.channel import EventChannel
    36.6  
    36.7 +from xen.xend import image
    36.8  from xen.xend import sxp
    36.9  from xen.xend.XendBootloader import bootloader
   36.10  from xen.xend.XendLogging import log
   36.11 @@ -319,6 +320,7 @@ class XendDomainInfo:
   36.12  
   36.13          try:
   36.14              defaultInfo('name',         lambda: "Domain-%d" % self.domid)
   36.15 +            defaultInfo('ssidref',      lambda: 0)
   36.16              defaultInfo('restart_mode', lambda: RESTART_ONREBOOT)
   36.17              defaultInfo('cpu_weight',   lambda: 1.0)
   36.18              defaultInfo('bootloader',   lambda: None)
   36.19 @@ -511,6 +513,19 @@ class XendDomainInfo:
   36.20                        self.info['backend'], 0)
   36.21  
   36.22  
   36.23 +    def dumpCore(self):
   36.24 +        """Create a core dump for this domain.  Nothrow guarantee."""
   36.25 +        
   36.26 +        try:
   36.27 +            corefile = "/var/xen/dump/%s.%s.core" % (self.info['name'],
   36.28 +                                                     self.domid)
   36.29 +            xc.domain_dumpcore(dom = self.domid, corefile = corefile)
   36.30 +
   36.31 +        except Exception, exn:
   36.32 +            log.error("XendDomainInfo.dumpCore failed: id = %s name = %s: %s",
   36.33 +                      self.domid, self.info['name'], str(exn))
   36.34 +
   36.35 +
   36.36      def closeStoreChannel(self):
   36.37          """Close the store channel, if any.  Nothrow guarantee."""
   36.38          
   36.39 @@ -614,7 +629,7 @@ class XendDomainInfo:
   36.40              sxpr.append(['maxmem', self.info['maxmem_KiB'] / 1024])
   36.41  
   36.42              if self.infoIsSet('device'):
   36.43 -                for (n, c) in self.info['device']:
   36.44 +                for (_, c) in self.info['device']:
   36.45                      sxpr.append(['device', c])
   36.46  
   36.47              def stateChar(name):
   36.48 @@ -706,13 +721,6 @@ class XendDomainInfo:
   36.49          """
   36.50          # todo - add support for scheduling params?
   36.51          try:
   36.52 -            if 'image' not in self.info:
   36.53 -                raise VmError('Missing image in configuration')
   36.54 -
   36.55 -            self.image = ImageHandler.create(self,
   36.56 -                                             self.info['image'],
   36.57 -                                             self.info['device'])
   36.58 -
   36.59              self.initDomain()
   36.60  
   36.61              # Create domain devices.
   36.62 @@ -737,6 +745,14 @@ class XendDomainInfo:
   36.63  
   36.64          self.domid = xc.domain_create(dom = self.domid or 0,
   36.65                                        ssidref = self.info['ssidref'])
   36.66 +
   36.67 +        if 'image' not in self.info:
   36.68 +            raise VmError('Missing image in configuration')
   36.69 +
   36.70 +        self.image = image.create(self,
   36.71 +                                  self.info['image'],
   36.72 +                                  self.info['device'])
   36.73 +
   36.74          if self.domid <= 0:
   36.75              raise VmError('Creating domain failed: name=%s' %
   36.76                            self.info['name'])
   36.77 @@ -839,20 +855,20 @@ class XendDomainInfo:
   36.78          """Release all vm devices.
   36.79          """
   36.80  
   36.81 -        t = xstransact("%s/device" % self.path)
   36.82 -
   36.83 -        for n in controllerClasses.keys():
   36.84 -            for d in t.list(n):
   36.85 -                try:
   36.86 -                    t.remove(d)
   36.87 -                except ex:
   36.88 -                    # Log and swallow any exceptions in removal -- there's
   36.89 -                    # nothing more we can do.
   36.90 -                    log.exception(
   36.91 -                        "Device release failed: %s; %s; %s; %s" %
   36.92 -                        (self.info['name'], n, d, str(ex)))
   36.93 -        t.commit()
   36.94 -
   36.95 +        while True:
   36.96 +            t = xstransact("%s/device" % self.path)
   36.97 +            for n in controllerClasses.keys():
   36.98 +                for d in t.list(n):
   36.99 +                    try:
  36.100 +                        t.remove(d)
  36.101 +                    except ex:
  36.102 +                        # Log and swallow any exceptions in removal --
  36.103 +                        # there's nothing more we can do.
  36.104 +                        log.exception(
  36.105 +                           "Device release failed: %s; %s; %s; %s" %
  36.106 +                            (self.info['name'], n, d, str(ex)))
  36.107 +            if t.commit():
  36.108 +                break
  36.109  
  36.110      def eventChannel(self, path=None):
  36.111          """Create an event channel to the domain.
  36.112 @@ -1085,19 +1101,6 @@ class XendDomainInfo:
  36.113  
  36.114  
  36.115  #============================================================================
  36.116 -# Register image handlers.
  36.117 -
  36.118 -from image import          \
  36.119 -     addImageHandlerClass, \
  36.120 -     ImageHandler,         \
  36.121 -     LinuxImageHandler,    \
  36.122 -     VmxImageHandler
  36.123 -
  36.124 -addImageHandlerClass(LinuxImageHandler)
  36.125 -addImageHandlerClass(VmxImageHandler)
  36.126 -
  36.127 -
  36.128 -#============================================================================
  36.129  # Register device controllers and their device config types.
  36.130  
  36.131  """A map from device-class names to the subclass of DevController that
    37.1 --- a/tools/python/xen/xend/image.py	Thu Sep 29 13:35:13 2005 -0600
    37.2 +++ b/tools/python/xen/xend/image.py	Thu Sep 29 16:22:02 2005 -0600
    37.3 @@ -33,6 +33,15 @@ xc = xen.lowlevel.xc.new()
    37.4  
    37.5  MAX_GUEST_CMDLINE = 1024
    37.6  
    37.7 +
    37.8 +def create(vm, imageConfig, deviceConfig):
    37.9 +    """Create an image handler for a vm.
   37.10 +
   37.11 +    @return ImageHandler instance
   37.12 +    """
   37.13 +    return findImageHandlerClass(imageConfig)(vm, imageConfig, deviceConfig)
   37.14 +
   37.15 +
   37.16  class ImageHandler:
   37.17      """Abstract base class for image handlers.
   37.18  
   37.19 @@ -48,81 +57,39 @@ class ImageHandler:
   37.20  
   37.21      The method destroy() is called when the domain is destroyed.
   37.22      The default is to do nothing.
   37.23 -    
   37.24 -    """
   37.25 -
   37.26 -    #======================================================================
   37.27 -    # Class vars and methods.
   37.28 -
   37.29 -    """Table of image handler classes for virtual machine images.
   37.30 -    Indexed by image type.
   37.31      """
   37.32 -    imageHandlerClasses = {}
   37.33 -
   37.34 -    def addImageHandlerClass(cls, h):
   37.35 -        """Add a handler class for an image type
   37.36 -        @param h:        handler: ImageHandler subclass
   37.37 -        """
   37.38 -        cls.imageHandlerClasses[h.ostype] = h
   37.39 -
   37.40 -    addImageHandlerClass = classmethod(addImageHandlerClass)
   37.41 -
   37.42 -    def findImageHandlerClass(cls, image):
   37.43 -        """Find the image handler class for an image config.
   37.44 -
   37.45 -        @param image config
   37.46 -        @return ImageHandler subclass or None
   37.47 -        """
   37.48 -        ty = sxp.name(image)
   37.49 -        if ty is None:
   37.50 -            raise VmError('missing image type')
   37.51 -        imageClass = cls.imageHandlerClasses.get(ty)
   37.52 -        if imageClass is None:
   37.53 -            raise VmError('unknown image type: ' + ty)
   37.54 -        return imageClass
   37.55 -
   37.56 -    findImageHandlerClass = classmethod(findImageHandlerClass)
   37.57 -
   37.58 -    def create(cls, vm, imageConfig, deviceConfig):
   37.59 -        """Create an image handler for a vm.
   37.60 -
   37.61 -        @return ImageHandler instance
   37.62 -        """
   37.63 -        imageClass = cls.findImageHandlerClass(imageConfig)
   37.64 -        return imageClass(vm, imageConfig, deviceConfig)
   37.65 -
   37.66 -    create = classmethod(create)
   37.67 -
   37.68 -    #======================================================================
   37.69 -    # Instance vars and methods.
   37.70  
   37.71      ostype = None
   37.72  
   37.73 -    kernel = None
   37.74 -    ramdisk = None
   37.75 -    cmdline = None
   37.76 -
   37.77 -    flags = 0
   37.78  
   37.79      def __init__(self, vm, imageConfig, deviceConfig):
   37.80          self.vm = vm
   37.81 +
   37.82 +        self.kernel = None
   37.83 +        self.ramdisk = None
   37.84 +        self.cmdline = None
   37.85 +        self.flags = 0
   37.86 +
   37.87          self.configure(imageConfig, deviceConfig)
   37.88  
   37.89      def configure(self, imageConfig, _):
   37.90          """Config actions common to all unix-like domains."""
   37.91  
   37.92 -        self.kernel = sxp.child_value(imageConfig, "kernel")
   37.93 +        def get_cfg(name, default = None):
   37.94 +            return sxp.child_value(imageConfig, name, default)
   37.95 +
   37.96 +        self.kernel = get_cfg("kernel")
   37.97          self.cmdline = ""
   37.98 -        ip = sxp.child_value(imageConfig, "ip", None)
   37.99 +        ip = get_cfg("ip")
  37.100          if ip:
  37.101              self.cmdline += " ip=" + ip
  37.102 -        root = sxp.child_value(imageConfig, "root")
  37.103 +        root = get_cfg("root")
  37.104          if root:
  37.105              self.cmdline += " root=" + root
  37.106 -        args = sxp.child_value(imageConfig, "args")
  37.107 +        args = get_cfg("args")
  37.108          if args:
  37.109              self.cmdline += " " + args
  37.110 -        self.ramdisk = sxp.child_value(imageConfig, "ramdisk", '')
  37.111 +        self.ramdisk = get_cfg("ramdisk", '')
  37.112          
  37.113          self.vm.storeVm(("image/ostype", self.ostype),
  37.114                          ("image/kernel", self.kernel),
  37.115 @@ -130,7 +97,7 @@ class ImageHandler:
  37.116                          ("image/ramdisk", self.ramdisk))
  37.117  
  37.118  
  37.119 -    def handleBootloading():
  37.120 +    def handleBootloading(self):
  37.121          self.unlink(self.kernel)
  37.122          self.unlink(self.ramdisk)
  37.123  
  37.124 @@ -194,7 +161,6 @@ class ImageHandler:
  37.125          if d.has_key('console_mfn'):
  37.126              self.vm.setConsoleRef(d.get('console_mfn'))
  37.127  
  37.128 -addImageHandlerClass = ImageHandler.addImageHandlerClass
  37.129  
  37.130  class LinuxImageHandler(ImageHandler):
  37.131  
  37.132 @@ -238,22 +204,19 @@ class VmxImageHandler(ImageHandler):
  37.133  
  37.134      def configure(self, imageConfig, deviceConfig):
  37.135          ImageHandler.configure(self, imageConfig, deviceConfig)
  37.136 -        
  37.137 -        self.memmap = sxp.child_value(imageConfig, 'memmap')
  37.138 +
  37.139          self.dmargs = self.parseDeviceModelArgs(imageConfig, deviceConfig)
  37.140          self.device_model = sxp.child_value(imageConfig, 'device_model')
  37.141          if not self.device_model:
  37.142              raise VmError("vmx: missing device model")
  37.143          self.display = sxp.child_value(imageConfig, 'display')
  37.144  
  37.145 -        self.vm.storeVm(("image/memmap", self.memmap),
  37.146 -                        ("image/dmargs", " ".join(self.dmargs)),
  37.147 +        self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)),
  37.148                          ("image/device-model", self.device_model),
  37.149                          ("image/display", self.display))
  37.150  
  37.151          self.device_channel = None
  37.152          self.pid = 0
  37.153 -        self.memmap_value = []
  37.154  
  37.155          self.dmargs += self.configVNC(imageConfig)
  37.156  
  37.157 @@ -261,7 +224,6 @@ class VmxImageHandler(ImageHandler):
  37.158      def createImage(self):
  37.159          """Create a VM for the VMX environment.
  37.160          """
  37.161 -        self.parseMemmap()
  37.162          self.createDomain()
  37.163  
  37.164      def buildDomain(self):
  37.165 @@ -278,9 +240,6 @@ class VmxImageHandler(ImageHandler):
  37.166          log.debug("control_evtchn = %d", self.device_channel.port2)
  37.167          log.debug("store_evtchn   = %d", store_evtchn)
  37.168          log.debug("memsize        = %d", self.vm.getMemoryTarget() / 1024)
  37.169 -        log.debug("memmap         = %s", self.memmap_value)
  37.170 -        log.debug("cmdline        = %s", self.cmdline)
  37.171 -        log.debug("ramdisk        = %s", self.ramdisk)
  37.172          log.debug("flags          = %d", self.flags)
  37.173          log.debug("vcpus          = %d", self.vm.getVCpuCount())
  37.174  
  37.175 @@ -289,9 +248,6 @@ class VmxImageHandler(ImageHandler):
  37.176                             control_evtchn = self.device_channel.port2,
  37.177                             store_evtchn   = store_evtchn,
  37.178                             memsize        = self.vm.getMemoryTarget() / 1024,
  37.179 -                           memmap         = self.memmap_value,
  37.180 -                           cmdline        = self.cmdline,
  37.181 -                           ramdisk        = self.ramdisk,
  37.182                             flags          = self.flags,
  37.183                             vcpus          = self.vm.getVCpuCount())
  37.184          if isinstance(ret, dict):
  37.185 @@ -299,18 +255,11 @@ class VmxImageHandler(ImageHandler):
  37.186              return 0
  37.187          return ret
  37.188  
  37.189 -    def parseMemmap(self):
  37.190 -        if self.memmap is None:
  37.191 -            return
  37.192 -        memmap = sxp.parse(open(self.memmap))[0]
  37.193 -        from xen.util.memmap import memmap_parse
  37.194 -        self.memmap_value = memmap_parse(memmap)
  37.195 -        
  37.196      # Return a list of cmd line args to the device models based on the
  37.197      # xm config file
  37.198      def parseDeviceModelArgs(self, imageConfig, deviceConfig):
  37.199          dmargs = [ 'cdrom', 'boot', 'fda', 'fdb',
  37.200 -                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ] 
  37.201 +                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ]
  37.202          ret = []
  37.203          for a in dmargs:
  37.204              v = sxp.child_value(imageConfig, a)
  37.205 @@ -439,3 +388,28 @@ class VmxImageHandler(ImageHandler):
  37.206              return 16 * 1024
  37.207          else:
  37.208              return (1 + ((mem_mb + 3) >> 2)) * 4
  37.209 +
  37.210 +
  37.211 +"""Table of image handler classes for virtual machine images.  Indexed by
  37.212 +image type.
  37.213 +"""
  37.214 +imageHandlerClasses = {}
  37.215 +
  37.216 +
  37.217 +for h in LinuxImageHandler, VmxImageHandler:
  37.218 +    imageHandlerClasses[h.ostype] = h
  37.219 +
  37.220 +
  37.221 +def findImageHandlerClass(image):
  37.222 +    """Find the image handler class for an image config.
  37.223 +
  37.224 +    @param image config
  37.225 +    @return ImageHandler subclass or None
  37.226 +    """
  37.227 +    ty = sxp.name(image)
  37.228 +    if ty is None:
  37.229 +        raise VmError('missing image type')
  37.230 +    imageClass = imageHandlerClasses.get(ty)
  37.231 +    if imageClass is None:
  37.232 +        raise VmError('unknown image type: ' + ty)
  37.233 +    return imageClass
    38.1 --- a/tools/python/xen/xend/server/DevController.py	Thu Sep 29 13:35:13 2005 -0600
    38.2 +++ b/tools/python/xen/xend/server/DevController.py	Thu Sep 29 16:22:02 2005 -0600
    38.3 @@ -126,20 +126,21 @@ class DevController:
    38.4          compulsory to use it; subclasses may prefer to allocate IDs based upon
    38.5          the device configuration instead.
    38.6          """
    38.7 -        path = self.frontendMiscPath()
    38.8 -        t = xstransact(path)
    38.9 -        try:
   38.10 -            result = t.read("nextDeviceID")
   38.11 -            if result:
   38.12 -                result = int(result)
   38.13 -            else:
   38.14 -                result = 1
   38.15 -            t.write("nextDeviceID", str(result + 1))
   38.16 -            t.commit()
   38.17 -            return result
   38.18 -        except:
   38.19 -            t.abort()
   38.20 -            raise
   38.21 +        while True:
   38.22 +            path = self.frontendMiscPath()
   38.23 +            t = xstransact(path)
   38.24 +            try:
   38.25 +                result = t.read("nextDeviceID")
   38.26 +                if result:
   38.27 +                    result = int(result)
   38.28 +                else:
   38.29 +                    result = 1
   38.30 +                t.write("nextDeviceID", str(result + 1))
   38.31 +                if t.commit():
   38.32 +                    return result
   38.33 +            except:
   38.34 +                t.abort()
   38.35 +                raise
   38.36  
   38.37  
   38.38      ## private:
    39.1 --- a/tools/python/xen/xend/xenstore/xsnode.py	Thu Sep 29 13:35:13 2005 -0600
    39.2 +++ b/tools/python/xen/xend/xenstore/xsnode.py	Thu Sep 29 16:22:02 2005 -0600
    39.3 @@ -280,8 +280,8 @@ class XenStore:
    39.4                                 (', while writing %s : %s' % (str(path),
    39.5                                                               str(data))))
    39.6  
    39.7 -    def begin(self, path):
    39.8 -        self.getxs().transaction_start(path)
    39.9 +    def begin(self):
   39.10 +        self.getxs().transaction_start()
   39.11  
   39.12      def commit(self, abandon=False):
   39.13          self.getxs().transaction_end(abort=abandon)
    40.1 --- a/tools/python/xen/xend/xenstore/xstransact.py	Thu Sep 29 13:35:13 2005 -0600
    40.2 +++ b/tools/python/xen/xend/xenstore/xstransact.py	Thu Sep 29 16:22:02 2005 -0600
    40.3 @@ -14,16 +14,8 @@ class xstransact:
    40.4      def __init__(self, path):
    40.5          self.in_transaction = False
    40.6          self.path = path.rstrip("/")
    40.7 -        while True:
    40.8 -            try:
    40.9 -                xshandle().transaction_start(path)
   40.10 -                self.in_transaction = True
   40.11 -                return
   40.12 -            except RuntimeError, ex:
   40.13 -                if ex.args[0] == errno.ENOENT and path != "/":
   40.14 -                    path = "/".join(path.split("/")[0:-1]) or "/"
   40.15 -                else:
   40.16 -                    raise
   40.17 +        xshandle().transaction_start()
   40.18 +        self.in_transaction = True
   40.19  
   40.20      def __del__(self):
   40.21          if self.in_transaction:
   40.22 @@ -175,14 +167,8 @@ class xstransact:
   40.23              t = cls(path)
   40.24              try:
   40.25                  v = t.read(*args)
   40.26 -                t.commit()
   40.27 -                return v
   40.28 -            except RuntimeError, ex:
   40.29                  t.abort()
   40.30 -                if ex.args[0] == errno.ETIMEDOUT:
   40.31 -                    pass
   40.32 -                else:
   40.33 -                    raise
   40.34 +                return v
   40.35              except:
   40.36                  t.abort()
   40.37                  raise
   40.38 @@ -194,14 +180,8 @@ class xstransact:
   40.39              t = cls(path)
   40.40              try:
   40.41                  t.write(*args, **opts)
   40.42 -                t.commit()
   40.43 -                return
   40.44 -            except RuntimeError, ex:
   40.45 -                t.abort()
   40.46 -                if ex.args[0] == errno.ETIMEDOUT:
   40.47 -                    pass
   40.48 -                else:
   40.49 -                    raise
   40.50 +                if t.commit():
   40.51 +                    return
   40.52              except:
   40.53                  t.abort()
   40.54                  raise
   40.55 @@ -217,14 +197,8 @@ class xstransact:
   40.56              t = cls(path)
   40.57              try:
   40.58                  t.remove(*args)
   40.59 -                t.commit()
   40.60 -                return
   40.61 -            except RuntimeError, ex:
   40.62 -                t.abort()
   40.63 -                if ex.args[0] == errno.ETIMEDOUT:
   40.64 -                    pass
   40.65 -                else:
   40.66 -                    raise
   40.67 +                if t.commit():
   40.68 +                    return
   40.69              except:
   40.70                  t.abort()
   40.71                  raise
   40.72 @@ -236,14 +210,8 @@ class xstransact:
   40.73              t = cls(path)
   40.74              try:
   40.75                  v = t.list(*args)
   40.76 -                t.commit()
   40.77 -                return v
   40.78 -            except RuntimeError, ex:
   40.79 -                t.abort()
   40.80 -                if ex.args[0] == errno.ETIMEDOUT:
   40.81 -                    pass
   40.82 -                else:
   40.83 -                    raise
   40.84 +                if t.commit():
   40.85 +                    return v
   40.86              except:
   40.87                  t.abort()
   40.88                  raise
   40.89 @@ -255,14 +223,8 @@ class xstransact:
   40.90              t = cls(path)
   40.91              try:
   40.92                  v = t.gather(*args)
   40.93 -                t.commit()
   40.94 -                return v
   40.95 -            except RuntimeError, ex:
   40.96 -                t.abort()
   40.97 -                if ex.args[0] == errno.ETIMEDOUT:
   40.98 -                    pass
   40.99 -                else:
  40.100 -                    raise
  40.101 +                if t.commit():
  40.102 +                    return v
  40.103              except:
  40.104                  t.abort()
  40.105                  raise
  40.106 @@ -274,14 +236,8 @@ class xstransact:
  40.107              t = cls(path)
  40.108              try:
  40.109                  v = t.store(*args)
  40.110 -                t.commit()
  40.111 -                return v
  40.112 -            except RuntimeError, ex:
  40.113 -                t.abort()
  40.114 -                if ex.args[0] == errno.ETIMEDOUT:
  40.115 -                    pass
  40.116 -                else:
  40.117 -                    raise
  40.118 +                if t.commit():
  40.119 +                    return v
  40.120              except:
  40.121                  t.abort()
  40.122                  raise
    41.1 --- a/tools/python/xen/xm/main.py	Thu Sep 29 13:35:13 2005 -0600
    41.2 +++ b/tools/python/xen/xm/main.py	Thu Sep 29 16:22:02 2005 -0600
    41.3 @@ -1,5 +1,6 @@
    41.4  # (C) Copyright IBM Corp. 2005
    41.5  # Copyright (C) 2004 Mike Wray
    41.6 +# Copyright (c) 2005 XenSource Ltd
    41.7  #
    41.8  # Authors:
    41.9  #     Sean Dague <sean at dague dot net>
   41.10 @@ -169,12 +170,6 @@ def handle_xend_error(cmd, dom, ex):
   41.11  #
   41.12  #########################################################################
   41.13  
   41.14 -def xm_create(args):
   41.15 -    from xen.xm import create
   41.16 -    # ugly hack because the opt parser apparently wants
   41.17 -    # the subcommand name just to throw it away!
   41.18 -    create.main(["bogus"] + args)
   41.19 -
   41.20  def xm_save(args):
   41.21      arg_check(args,2,"save")
   41.22  
   41.23 @@ -196,13 +191,6 @@ def xm_restore(args):
   41.24      if id is not None:
   41.25          server.xend_domain_unpause(domid)
   41.26  
   41.27 -def xm_migrate(args):
   41.28 -    # TODO: arg_check
   41.29 -    from xen.xm import migrate
   41.30 -    # ugly hack because the opt parser apparently wants
   41.31 -    # the subcommand name just to throw it away!
   41.32 -    migrate.main(["bogus"] + args)
   41.33 -
   41.34  def xm_list(args):
   41.35      use_long = 0
   41.36      show_vcpus = 0
   41.37 @@ -290,14 +278,6 @@ def xm_show_vcpus(domsinfo):
   41.38  def xm_vcpu_list(args):
   41.39      xm_list(["-v"] + args)
   41.40  
   41.41 -def xm_destroy(args):
   41.42 -    arg_check(args,1,"destroy")
   41.43 -
   41.44 -    from xen.xm import destroy
   41.45 -    # ugly hack because the opt parser apparently wants
   41.46 -    # the subcommand name just to throw it away!
   41.47 -    destroy.main(["bogus"] + args)
   41.48 -            
   41.49  def xm_reboot(args):
   41.50      arg_check(args,1,"reboot")
   41.51      from xen.xm import shutdown
   41.52 @@ -305,20 +285,6 @@ def xm_reboot(args):
   41.53      # the subcommand name just to throw it away!
   41.54      shutdown.main(["bogus", "-R"] + args)
   41.55  
   41.56 -def xm_shutdown(args):
   41.57 -    arg_check(args,1,"shutdown")
   41.58 -
   41.59 -    from xen.xm import shutdown
   41.60 -    # ugly hack because the opt parser apparently wants
   41.61 -    # the subcommand name just to throw it away!
   41.62 -    shutdown.main(["bogus"] + args)
   41.63 -
   41.64 -def xm_sysrq(args):
   41.65 -    from xen.xm import sysrq
   41.66 -    # ugly hack because the opt parser apparently wants
   41.67 -    # the subcommand name just to throw it away!
   41.68 -    sysrq.main(["bogus"] + args)
   41.69 -
   41.70  def xm_pause(args):
   41.71      arg_check(args, 1, "pause")
   41.72      dom = args[0]
   41.73 @@ -333,6 +299,11 @@ def xm_unpause(args):
   41.74      from xen.xend.XendClient import server
   41.75      server.xend_domain_unpause(dom)
   41.76  
   41.77 +def xm_subcommand(command, args):
   41.78 +    cmd = __import__(command, globals(), locals(), 'xen.xm')
   41.79 +    cmd.main(["bogus"] + args)
   41.80 +
   41.81 +
   41.82  #############################################################
   41.83  
   41.84  def cpu_make_map(cpulist):
   41.85 @@ -506,14 +477,6 @@ def xm_network_list(args):
   41.86          sxp.show(x)
   41.87          print
   41.88  
   41.89 -def xm_network_attach(args):
   41.90 -
   41.91 -    print "Not implemented"
   41.92 -
   41.93 -def xm_network_detach(args):
   41.94 -
   41.95 -    print "Not implemented"
   41.96 -    
   41.97  def xm_block_list(args):
   41.98      arg_check(args,1,"block-list")
   41.99      dom = args[0]
  41.100 @@ -609,11 +572,8 @@ commands = {
  41.101      # domain commands
  41.102      "domid": xm_domid,
  41.103      "domname": xm_domname,
  41.104 -    "create": xm_create,
  41.105 -    "destroy": xm_destroy,
  41.106      "restore": xm_restore,
  41.107      "save": xm_save,
  41.108 -    "shutdown": xm_shutdown,
  41.109      "reboot": xm_reboot,
  41.110      "list": xm_list,
  41.111      # memory commands
  41.112 @@ -625,10 +585,7 @@ commands = {
  41.113      "vcpu-enable": xm_vcpu_enable,
  41.114      "vcpu-disable": xm_vcpu_disable,
  41.115      "vcpu-list": xm_vcpu_list,
  41.116 -    # migration
  41.117 -    "migrate": xm_migrate,
  41.118      # special
  41.119 -    "sysrq": xm_sysrq,
  41.120      "pause": xm_pause,
  41.121      "unpause": xm_unpause,
  41.122      # host commands
  41.123 @@ -647,14 +604,24 @@ commands = {
  41.124      # network
  41.125      "network-limit": xm_network_limit,
  41.126      "network-list": xm_network_list,
  41.127 -    "network-attach": xm_network_attach,
  41.128 -    "network-detach": xm_network_detach,
  41.129      # vnet
  41.130      "vnet-list": xm_vnet_list,
  41.131      "vnet-create": xm_vnet_create,
  41.132      "vnet-delete": xm_vnet_delete,
  41.133      }
  41.134  
  41.135 +## The commands supported by a separate argument parser in xend.xm.
  41.136 +subcommands = [
  41.137 +    'create',
  41.138 +    'destroy',
  41.139 +    'migrate',
  41.140 +    'sysrq',
  41.141 +    'shutdown'
  41.142 +    ]
  41.143 +
  41.144 +for c in subcommands:
  41.145 +    commands[c] = eval('lambda args: xm_subcommand("%s", args)' % c)
  41.146 +
  41.147  aliases = {
  41.148      "balloon": "mem-set",
  41.149      "vif-list": "network-list",
  41.150 @@ -669,6 +636,7 @@ help = {
  41.151      "--long": longhelp
  41.152     }
  41.153  
  41.154 +
  41.155  def xm_lookup_cmd(cmd):
  41.156      if commands.has_key(cmd):
  41.157          return commands[cmd]
  41.158 @@ -688,9 +656,7 @@ def deprecated(old,new):
  41.159      err('Option %s is the new replacement, see "xm help %s" for more info' % (new, new))
  41.160  
  41.161  def usage(cmd=None):
  41.162 -    if cmd == "full":
  41.163 -        print fullhelp
  41.164 -    elif help.has_key(cmd):
  41.165 +    if help.has_key(cmd):
  41.166          print help[cmd]
  41.167      else:
  41.168          print shorthelp
  41.169 @@ -701,7 +667,7 @@ def main(argv=sys.argv):
  41.170          usage()
  41.171      
  41.172      if re.compile('-*help').match(argv[1]):
  41.173 -	if len(argv) > 2 and help.has_key(argv[2]):
  41.174 +	if len(argv) > 2:
  41.175  	    usage(argv[2])
  41.176  	else:
  41.177  	    usage()
    42.1 --- a/tools/xenstore/Makefile	Thu Sep 29 13:35:13 2005 -0600
    42.2 +++ b/tools/xenstore/Makefile	Thu Sep 29 16:22:02 2005 -0600
    42.3 @@ -28,11 +28,11 @@ CLIENTS := xenstore-exists xenstore-list
    42.4  CLIENTS += xenstore-write
    42.5  CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
    42.6  
    42.7 -all: libxenstore.so xenstored $(CLIENTS)
    42.8 +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump
    42.9  
   42.10  testcode: xs_test xenstored_test xs_random xs_dom0_test
   42.11  
   42.12 -xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o
   42.13 +xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o tdb.o
   42.14  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
   42.15  
   42.16  $(CLIENTS): libxenstore.so
   42.17 @@ -42,7 +42,10 @@ xenstored: xenstored_core.o xenstored_wa
   42.18  $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
   42.19  	$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
   42.20  
   42.21 -xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o
   42.22 +xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
   42.23 +	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   42.24 +
   42.25 +xs_tdb_dump: xs_tdb_dump.o utils.o tdb.o talloc.o
   42.26  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   42.27  
   42.28  xs_test: xs_test.o xs_lib.o utils.o
   42.29 @@ -50,6 +53,11 @@ xs_random: xs_random.o xs_test_lib.o xs_
   42.30  xs_stress: xs_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o
   42.31  xs_crashme: xs_crashme.o xs_lib.o talloc.o utils.o
   42.32  
   42.33 +speedtest: speedtest.o xs.o xs_lib.o utils.o talloc.o
   42.34 +
   42.35 +check-speed: speedtest xenstored_test $(TESTDIR)
   42.36 +	$(TESTENV) time ./speedtest 100
   42.37 +
   42.38  xs_test.o xs_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o xs_crashme.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS)
   42.39  
   42.40  xenstored_%_test.o: xenstored_%.c
   42.41 @@ -98,7 +106,7 @@ RANDSEED=$(shell date +%s)
   42.42  randomcheck: xs_random xenstored_test $(TESTDIR)
   42.43  	$(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) && echo
   42.44  	$(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) && echo
   42.45 -	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   42.46 +#	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   42.47  
   42.48  crashme:  xs_crashme xenstored_test $(TESTDIR)
   42.49  	rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace
    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/tools/xenstore/speedtest.c	Thu Sep 29 16:22:02 2005 -0600
    43.3 @@ -0,0 +1,130 @@
    43.4 +/* 
    43.5 +    Xen Store Daemon Speed test
    43.6 +    Copyright (C) 2005 Rusty Russell IBM Corporation
    43.7 +
    43.8 +    This program is free software; you can redistribute it and/or modify
    43.9 +    it under the terms of the GNU General Public License as published by
   43.10 +    the Free Software Foundation; either version 2 of the License, or
   43.11 +    (at your option) any later version.
   43.12 +
   43.13 +    This program is distributed in the hope that it will be useful,
   43.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   43.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   43.16 +    GNU General Public License for more details.
   43.17 +
   43.18 +    You should have received a copy of the GNU General Public License
   43.19 +    along with this program; if not, write to the Free Software
   43.20 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   43.21 +*/
   43.22 +
   43.23 +#include <stdlib.h>
   43.24 +#include <sys/types.h>
   43.25 +#include <sys/wait.h>
   43.26 +#include <stdio.h>
   43.27 +#include <stdarg.h>
   43.28 +#include <unistd.h>
   43.29 +#include <fcntl.h>
   43.30 +#include <errno.h>
   43.31 +#include "utils.h"
   43.32 +#include "xs.h"
   43.33 +#include "list.h"
   43.34 +#include "talloc.h"
   43.35 +
   43.36 +static void do_command(const char *cmd)
   43.37 +{
   43.38 +	int ret;
   43.39 +
   43.40 +	ret = system(cmd);
   43.41 +	if (ret == -1 || !WIFEXITED(ret) || WEXITSTATUS(ret) != 0)
   43.42 +		barf_perror("Failed '%s': %i", cmd, ret);
   43.43 +}
   43.44 +/* Wipe the test directory, spawn ./xenstored_test and return its pid once it reports in. */
   43.45 +static int start_daemon(void)
   43.46 +{
   43.47 +	int fds[2], pid;
   43.48 +
   43.49 +	do_command(talloc_asprintf(NULL, "rm -rf testsuite/tmp/*"));
   43.50 +	/* Start daemon: pid == -1 must never reach kill_daemon(), so fail hard here. */
   43.51 +	if (pipe(fds) != 0 || (pid = fork()) == -1)
   43.52 +		barf_perror("Failed to start daemon");
   43.53 +	if (pid) {
   43.54 +		/* Child writes PID when it's ready: EOF means it died before that. */
   43.55 +		char buffer[20];
   43.56 +		close(fds[1]);
   43.57 +		if (read(fds[0], buffer, sizeof(buffer)) <= 0)
   43.58 +			barf("Failed to summon daemon");
   43.59 +		close(fds[0]);
   43.60 +	} else {
   43.61 +		dup2(fds[1], STDOUT_FILENO);
   43.62 +		close(fds[0]);
   43.63 +#if 0
   43.64 +		execlp("valgrind", "valgrind", "-q", "--suppressions=testsuite/vg-suppressions", "xenstored_test", "--output-pid",
   43.65 +		       "--no-fork", "--trace-file=/tmp/trace", NULL);
   43.66 +#else
   43.67 +		execlp("./xenstored_test", "xenstored_test", "--output-pid", "--no-fork", NULL);
   43.68 +//		execlp("strace", "strace", "-o", "/tmp/out", "./xenstored_test", "--output-pid", "--no-fork", NULL);
   43.69 +#endif
   43.70 +		exit(1);
   43.71 +	}
   43.72 +	return pid;
   43.73 +}
   43.74 +
   43.75 +static void kill_daemon(int pid)
   43.76 +{
   43.77 +	int saved_errno = errno;
   43.78 +	kill(pid, SIGTERM);
   43.79 +	errno = saved_errno;
   43.80 +}
   43.81 +
   43.82 +#define NUM_ENTRIES 50
   43.83 +
   43.84 +/* Create <numdomains> trees of NUM_ENTRIES nodes each, one transaction
   43.85 + * per tree, against a freshly started xenstored_test daemon. */
   43.86 +int main(int argc, char *argv[])
   43.87 +{
   43.88 +	int i, j, pid, print;
   43.89 +	struct xs_handle *h;
   43.90 +
   43.91 +	if (argc != 2)
   43.92 +		barf("Usage: speedtest <numdomains>");
   43.93 +
   43.94 +	pid = start_daemon();
   43.95 +	h = xs_daemon_open();
   43.96 +	print = atoi(argv[1]) / 76;	/* progress dot every 1/76th of the run */
   43.97 +	if (!print)
   43.98 +		print = 1;
   43.99 +	for (i = 0; i < atoi(argv[1]); i ++) {
  43.100 +		char name[64];
  43.101 +
  43.102 +		if (i % print == 0)
  43.103 +			write(1, ".", 1);
  43.104 +		if (!xs_transaction_start(h, "/")) {
  43.105 +			kill_daemon(pid);
  43.106 +			barf_perror("Starting transaction");
  43.107 +		}
  43.108 +		sprintf(name, "/%i", i);
  43.109 +		if (!xs_mkdir(h, name)) {
  43.110 +			kill_daemon(pid);
  43.111 +			barf_perror("Making directory %s", name);
  43.112 +		}
  43.113 +
  43.114 +		for (j = 0; j < NUM_ENTRIES; j++) {
  43.115 +			sprintf(name, "/%i/%i", i, j);
  43.116 +			if (!xs_write(h, name, name, strlen(name))) {
  43.117 +				kill_daemon(pid);
  43.118 +				barf_perror("Writing %s", name);
  43.119 +			}
  43.120 +		}
  43.121 +		if (!xs_transaction_end(h, false)) {
  43.122 +			kill_daemon(pid);
  43.123 +			barf_perror("Ending transaction");
  43.124 +		}
  43.125 +	}
  43.126 +	write(1, "\n", 1);
  43.127 +
  43.128 +	kill_daemon(pid);
  43.129 +	wait(NULL);
  43.130 +	return 0;
  43.131 +}
  43.132 +	
  43.133 +	
    44.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    44.2 +++ b/tools/xenstore/tdb.c	Thu Sep 29 16:22:02 2005 -0600
    44.3 @@ -0,0 +1,2151 @@
    44.4 + /* 
    44.5 +   Unix SMB/CIFS implementation.
    44.6 +
    44.7 +   trivial database library
    44.8 +
    44.9 +   Copyright (C) Andrew Tridgell              1999-2004
   44.10 +   Copyright (C) Paul `Rusty' Russell		   2000
   44.11 +   Copyright (C) Jeremy Allison			   2000-2003
   44.12 +   
   44.13 +     ** NOTE! The following LGPL license applies to the tdb
   44.14 +     ** library. This does NOT imply that all of Samba is released
   44.15 +     ** under the LGPL
   44.16 +   
   44.17 +   This library is free software; you can redistribute it and/or
   44.18 +   modify it under the terms of the GNU Lesser General Public
   44.19 +   License as published by the Free Software Foundation; either
   44.20 +   version 2 of the License, or (at your option) any later version.
   44.21 +
   44.22 +   This library is distributed in the hope that it will be useful,
   44.23 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   44.24 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   44.25 +   Lesser General Public License for more details.
   44.26 +
   44.27 +   You should have received a copy of the GNU Lesser General Public
   44.28 +   License along with this library; if not, write to the Free Software
   44.29 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   44.30 +*/
   44.31 +
   44.32 +
   44.33 +#ifndef _SAMBA_BUILD_
   44.34 +#if HAVE_CONFIG_H
   44.35 +#include <config.h>
   44.36 +#endif
   44.37 +
   44.38 +#include <stdlib.h>
   44.39 +#include <stdio.h>
   44.40 +#include <stdint.h>
   44.41 +#include <fcntl.h>
   44.42 +#include <unistd.h>
   44.43 +#include <string.h>
   44.44 +#include <fcntl.h>
   44.45 +#include <errno.h>
   44.46 +#include <sys/mman.h>
   44.47 +#include <sys/stat.h>
   44.48 +#include "tdb.h"
   44.49 +#include <stdarg.h>
   44.50 +#include "talloc.h"
   44.51 +#define HAVE_MMAP
   44.52 +#else
   44.53 +#include "includes.h"
   44.54 +#include "lib/tdb/include/tdb.h"
   44.55 +#include "system/time.h"
   44.56 +#include "system/shmem.h"
   44.57 +#include "system/filesys.h"
   44.58 +#endif
   44.59 +
   44.60 +#define TDB_MAGIC_FOOD "TDB file\n"
   44.61 +#define TDB_VERSION (0x26011967 + 6)
   44.62 +#define TDB_MAGIC (0x26011999U)
   44.63 +#define TDB_FREE_MAGIC (~TDB_MAGIC)
   44.64 +#define TDB_DEAD_MAGIC (0xFEE1DEAD)
   44.65 +#define TDB_ALIGNMENT 4
   44.66 +#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
   44.67 +#define DEFAULT_HASH_SIZE 131
   44.68 +#define TDB_PAGE_SIZE 0x2000
   44.69 +#define FREELIST_TOP (sizeof(struct tdb_header))
   44.70 +#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
   44.71 +#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
   44.72 +#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
   44.73 +#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
   44.74 +#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
   44.75 +#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
   44.76 +
   44.77 +
   44.78 +/* NB assumes there is a local variable called "tdb" that is the
   44.79 + * current context, also takes doubly-parenthesized print-style
   44.80 + * argument. */
   44.81 +#define TDB_LOG(x) tdb->log_fn x
   44.82 +
   44.83 +/* lock offsets */
   44.84 +#define GLOBAL_LOCK 0
   44.85 +#define ACTIVE_LOCK 4
   44.86 +
   44.87 +#ifndef MAP_FILE
   44.88 +#define MAP_FILE 0
   44.89 +#endif
   44.90 +
   44.91 +#ifndef MAP_FAILED
   44.92 +#define MAP_FAILED ((void *)-1)
   44.93 +#endif
   44.94 +
   44.95 +#ifndef discard_const_p
   44.96 +# if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
   44.97 +#  define discard_const(ptr) ((void *)((intptr_t)(ptr)))
   44.98 +# else
   44.99 +#  define discard_const(ptr) ((void *)(ptr))
  44.100 +# endif
  44.101 +# define discard_const_p(type, ptr) ((type *)discard_const(ptr))
  44.102 +#endif
  44.103 +
  44.104 +/* free memory if the pointer is valid and zero the pointer */
  44.105 +#ifndef SAFE_FREE
  44.106 +#define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
  44.107 +#endif
  44.108 +
  44.109 +#define BUCKET(hash) ((hash) % tdb->header.hash_size)
  44.110 +TDB_DATA tdb_null;
  44.111 +
  44.112 +/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
  44.113 +static TDB_CONTEXT *tdbs = NULL;
  44.114 +
  44.115 +static int tdb_munmap(TDB_CONTEXT *tdb)
  44.116 +{
  44.117 +	if (tdb->flags & TDB_INTERNAL)
  44.118 +		return 0;
  44.119 +
  44.120 +#ifdef HAVE_MMAP
  44.121 +	if (tdb->map_ptr) {
  44.122 +		int ret = munmap(tdb->map_ptr, tdb->map_size);
  44.123 +		if (ret != 0)
  44.124 +			return ret;
  44.125 +	}
  44.126 +#endif
  44.127 +	tdb->map_ptr = NULL;
  44.128 +	return 0;
  44.129 +}
  44.130 +
  44.131 +static void tdb_mmap(TDB_CONTEXT *tdb)
  44.132 +{
  44.133 +	if (tdb->flags & TDB_INTERNAL)
  44.134 +		return;
  44.135 +
  44.136 +#ifdef HAVE_MMAP
  44.137 +	if (!(tdb->flags & TDB_NOMMAP)) {
  44.138 +		tdb->map_ptr = mmap(NULL, tdb->map_size, 
  44.139 +				    PROT_READ|(tdb->read_only? 0:PROT_WRITE), 
  44.140 +				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
  44.141 +
  44.142 +		/*
  44.143 +		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
  44.144 +		 */
  44.145 +
  44.146 +		if (tdb->map_ptr == MAP_FAILED) {
  44.147 +			tdb->map_ptr = NULL;
  44.148 +			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n", 
  44.149 +				 tdb->map_size, strerror(errno)));
  44.150 +		}
  44.151 +	} else {
  44.152 +		tdb->map_ptr = NULL;
  44.153 +	}
  44.154 +#else
  44.155 +	tdb->map_ptr = NULL;
  44.156 +#endif
  44.157 +}
  44.158 +
  44.159 +/* Endian conversion: we only ever deal with 4 byte quantities */
  44.160 +static void *convert(void *buf, u32 size)
  44.161 +{
  44.162 +	u32 i, *p = buf;
  44.163 +	for (i = 0; i < size / 4; i++)
  44.164 +		p[i] = TDB_BYTEREV(p[i]);
  44.165 +	return buf;
  44.166 +}
  44.167 +#define DOCONV() (tdb->flags & TDB_CONVERT)
  44.168 +#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
  44.169 +
  44.170 +/* the body of the database is made of one list_struct for the free space
  44.171 +   plus a separate data list for each hash value */
  44.172 +struct list_struct {
  44.173 +	tdb_off next; /* offset of the next record in the list */
  44.174 +	tdb_len rec_len; /* total byte length of record */
  44.175 +	tdb_len key_len; /* byte length of key */
  44.176 +	tdb_len data_len; /* byte length of data */
  44.177 +	u32 full_hash; /* the full 32 bit hash of the key */
  44.178 +	u32 magic;   /* try to catch errors */
  44.179 +	/* the following union is implied:
  44.180 +		union {
  44.181 +			char record[rec_len];
  44.182 +			struct {
  44.183 +				char key[key_len];
  44.184 +				char data[data_len];
  44.185 +			}
  44.186 +			u32 totalsize; (tailer)
  44.187 +		}
  44.188 +	*/
  44.189 +};
  44.190 +
  44.191 +/* a byte range locking function - return 0 on success
  44.192 +   this function locks/unlocks 1 byte at the specified offset.
  44.193 +
  44.194 +   On error, errno is also set so that errors are passed back properly
  44.195 +   through tdb_open(). */
  44.196 +static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset, 
  44.197 +		      int rw_type, int lck_type, int probe)
  44.198 +{
  44.199 +	struct flock fl;
  44.200 +	int ret;
  44.201 +
  44.202 +	if (tdb->flags & TDB_NOLOCK)
  44.203 +		return 0;
  44.204 +	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
  44.205 +		errno = EACCES;
  44.206 +		return -1;
  44.207 +	}
  44.208 +
  44.209 +	fl.l_type = rw_type;
  44.210 +	fl.l_whence = SEEK_SET;
  44.211 +	fl.l_start = offset;
  44.212 +	fl.l_len = 1;
  44.213 +	fl.l_pid = 0;
  44.214 +
  44.215 +	do {
  44.216 +		ret = fcntl(tdb->fd,lck_type,&fl);
  44.217 +	} while (ret == -1 && errno == EINTR);
  44.218 +
  44.219 +	if (ret == -1) {
  44.220 +		if (!probe && lck_type != F_SETLK) {
  44.221 +			/* Ensure error code is set for log fun to examine. */
  44.222 +			tdb->ecode = TDB_ERR_LOCK;
  44.223 +			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n", 
  44.224 +				 tdb->fd, offset, rw_type, lck_type));
  44.225 +		}
  44.226 +		/* Generic lock error. errno set by fcntl.
  44.227 +		 * EAGAIN is an expected return from non-blocking
  44.228 +		 * locks. */
  44.229 +		if (errno != EAGAIN) {
  44.230 +		TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n", 
  44.231 +				 tdb->fd, offset, rw_type, lck_type, 
  44.232 +				 strerror(errno)));
  44.233 +		}
  44.234 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
  44.235 +	}
  44.236 +	return 0;
  44.237 +}
  44.238 +
  44.239 +/* lock a list in the database. list -1 is the alloc list */
  44.240 +static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
  44.241 +{
  44.242 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  44.243 +		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n", 
  44.244 +			   list, ltype));
  44.245 +		return -1;
  44.246 +	}
  44.247 +	if (tdb->flags & TDB_NOLOCK)
  44.248 +		return 0;
  44.249 +
  44.250 +	/* Since fcntl locks don't nest, we do a lock for the first one,
  44.251 +	   and simply bump the count for future ones */
  44.252 +	if (tdb->locked[list+1].count == 0) {
  44.253 +		if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
  44.254 +			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n", 
  44.255 +					   list, ltype, strerror(errno)));
  44.256 +			return -1;
  44.257 +		}
  44.258 +		tdb->locked[list+1].ltype = ltype;
  44.259 +	}
  44.260 +	tdb->locked[list+1].count++;
  44.261 +	return 0;
  44.262 +}
  44.263 +
  44.264 +/* unlock the database: used to return void because it's too late for errors. */
  44.265 +	/* changed to return int: it may be interesting to know there
  44.266 +	   has been an error  --simo */
  44.267 +static int tdb_unlock(TDB_CONTEXT *tdb, int list,
  44.268 +		      int ltype __attribute__((unused)))
  44.269 +{
  44.270 +	int ret = -1;
  44.271 +
  44.272 +	if (tdb->flags & TDB_NOLOCK)
  44.273 +		return 0;
  44.274 +
  44.275 +	/* Sanity checks */
  44.276 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  44.277 +		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
  44.278 +		return ret;
  44.279 +	}
  44.280 +
  44.281 +	if (tdb->locked[list+1].count==0) {
  44.282 +		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
  44.283 +		return ret;
  44.284 +	}
  44.285 +
  44.286 +	if (tdb->locked[list+1].count == 1) {
  44.287 +		/* Down to last nested lock: unlock underneath */
  44.288 +		ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
  44.289 +	} else {
  44.290 +		ret = 0;
  44.291 +	}
  44.292 +	tdb->locked[list+1].count--;
  44.293 +
  44.294 +	if (ret)
  44.295 +		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n")); 
  44.296 +	return ret;
  44.297 +}
  44.298 +
  44.299 +/* This is based on the hash algorithm from gdbm */
  44.300 +static u32 default_tdb_hash(TDB_DATA *key)
  44.301 +{
  44.302 +	u32 value;	/* Used to compute the hash value.  */
  44.303 +	u32   i;	/* Used to cycle through random values. */
  44.304 +
  44.305 +	/* Set the initial value from the key size. */
  44.306 +	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
  44.307 +		value = (value + (key->dptr[i] << (i*5 % 24)));
  44.308 +
  44.309 +	return (1103515243 * value + 12345);  
  44.310 +}
  44.311 +
  44.312 +/* check for an out of bounds access - if it is out of bounds then
  44.313 +   see if the database has been expanded by someone else and expand
  44.314 +   if necessary 
  44.315 +   note that "len" is the minimum length needed for the db
  44.316 +*/
  44.317 +static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
  44.318 +{
  44.319 +	struct stat st;
  44.320 +	if (len <= tdb->map_size)
  44.321 +		return 0;
  44.322 +	if (tdb->flags & TDB_INTERNAL) {
  44.323 +		if (!probe) {
  44.324 +			/* Ensure ecode is set for log fn. */
  44.325 +			tdb->ecode = TDB_ERR_IO;
  44.326 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
  44.327 +				 (int)len, (int)tdb->map_size));
  44.328 +		}
  44.329 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.330 +	}
  44.331 +
  44.332 +	if (fstat(tdb->fd, &st) == -1)
  44.333 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.334 +
  44.335 +	if (st.st_size < (off_t)len) {
  44.336 +		if (!probe) {
  44.337 +			/* Ensure ecode is set for log fn. */
  44.338 +			tdb->ecode = TDB_ERR_IO;
  44.339 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
  44.340 +				 (int)len, (int)st.st_size));
  44.341 +		}
  44.342 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.343 +	}
  44.344 +
  44.345 +	/* Unmap, update size, remap */
  44.346 +	if (tdb_munmap(tdb) == -1)
  44.347 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.348 +	tdb->map_size = st.st_size;
  44.349 +	tdb_mmap(tdb);
  44.350 +	return 0;
  44.351 +}
  44.352 +
  44.353 +/* write a lump of data at a specified offset */
  44.354 +static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
  44.355 +{
  44.356 +	if (tdb_oob(tdb, off + len, 0) != 0)
  44.357 +		return -1;
  44.358 +
  44.359 +	if (tdb->map_ptr)
  44.360 +		memcpy(off + (char *)tdb->map_ptr, buf, len);
  44.361 +#ifdef HAVE_PWRITE
  44.362 +	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
  44.363 +#else
  44.364 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  44.365 +		 || write(tdb->fd, buf, len) != (off_t)len) {
  44.366 +#endif
  44.367 +		/* Ensure ecode is set for log fn. */
  44.368 +		tdb->ecode = TDB_ERR_IO;
  44.369 +		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
  44.370 +			   off, len, strerror(errno)));
  44.371 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.372 +	}
  44.373 +	return 0;
  44.374 +}
  44.375 +
  44.376 +/* read a lump of data at a specified offset, maybe convert */
  44.377 +static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
  44.378 +{
  44.379 +	if (tdb_oob(tdb, off + len, 0) != 0)
  44.380 +		return -1;
  44.381 +
  44.382 +	if (tdb->map_ptr)
  44.383 +		memcpy(buf, off + (char *)tdb->map_ptr, len);
  44.384 +#ifdef HAVE_PREAD
  44.385 +	else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
  44.386 +#else
  44.387 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  44.388 +		 || read(tdb->fd, buf, len) != (off_t)len) {
  44.389 +#endif
  44.390 +		/* Ensure ecode is set for log fn. */
  44.391 +		tdb->ecode = TDB_ERR_IO;
  44.392 +		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
  44.393 +			   off, len, strerror(errno)));
  44.394 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  44.395 +	}
  44.396 +	if (cv)
  44.397 +		convert(buf, len);
  44.398 +	return 0;
  44.399 +}
  44.400 +
  44.401 +/* don't allocate memory: used in tdb_delete path. */
  44.402 +static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
  44.403 +{
  44.404 +	char buf[64];
  44.405 +	u32 len;
  44.406 +
  44.407 +	if (tdb_oob(tdb, off + key.dsize, 0) != 0)
  44.408 +		return -1;
  44.409 +
  44.410 +	if (tdb->map_ptr)
  44.411 +		return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
  44.412 +
  44.413 +	while (key.dsize) {
  44.414 +		len = key.dsize;
  44.415 +		if (len > sizeof(buf))
  44.416 +			len = sizeof(buf);
  44.417 +		if (tdb_read(tdb, off, buf, len, 0) != 0)
  44.418 +			return -1;
  44.419 +		if (memcmp(buf, key.dptr, len) != 0)
  44.420 +			return 0;
  44.421 +		key.dptr += len;
  44.422 +		key.dsize -= len;
  44.423 +		off += len;
  44.424 +	}
  44.425 +	return 1;
  44.426 +}
  44.427 +
  44.428 +/* read a lump of data, allocating the space for it */
  44.429 +static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
  44.430 +{
  44.431 +	char *buf;
  44.432 +
  44.433 +	if (!(buf = talloc_size(tdb, len))) {
  44.434 +		/* Ensure ecode is set for log fn. */
  44.435 +		tdb->ecode = TDB_ERR_OOM;
  44.436 +		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
  44.437 +			   len, strerror(errno)));
  44.438 +		return TDB_ERRCODE(TDB_ERR_OOM, buf);
  44.439 +	}
  44.440 +	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
  44.441 +		SAFE_FREE(buf);
  44.442 +		return NULL;
  44.443 +	}
  44.444 +	return buf;
  44.445 +}
  44.446 +
  44.447 +/* read/write a tdb_off */
  44.448 +static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  44.449 +{
  44.450 +	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
  44.451 +}
  44.452 +static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  44.453 +{
  44.454 +	tdb_off off = *d;
  44.455 +	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
  44.456 +}
  44.457 +
  44.458 +/* read/write a record */
  44.459 +static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  44.460 +{
  44.461 +	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
  44.462 +		return -1;
  44.463 +	if (TDB_BAD_MAGIC(rec)) {
  44.464 +		/* Ensure ecode is set for log fn. */
  44.465 +		tdb->ecode = TDB_ERR_CORRUPT;
  44.466 +		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
  44.467 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  44.468 +	}
  44.469 +	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
  44.470 +}
  44.471 +static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  44.472 +{
  44.473 +	struct list_struct r = *rec;
  44.474 +	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
  44.475 +}
  44.476 +
  44.477 +/* read a freelist record at off into rec (host byte order), repairing a stale TDB_MAGIC; 0 on success, -1 on error */
  44.478 +static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
  44.479 +{
  44.480 +	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
  44.481 +		return -1;
  44.482 +
  44.483 +	if (rec->magic == TDB_MAGIC) {
  44.484 +		/* this happens when an app is shut down while deleting a record - we should not
  44.485 +		   completely fail; rec is in host order here, so rec_write re-applies any byte swap */
  44.486 +		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n", 
  44.487 +			 rec->magic, off));
  44.488 +		rec->magic = TDB_FREE_MAGIC;
  44.489 +		if (rec_write(tdb, off, rec) == -1)
  44.490 +			return -1;
  44.491 +	}
  44.492 +
  44.493 +	if (rec->magic != TDB_FREE_MAGIC) {
  44.494 +		/* Ensure ecode is set for log fn. */
  44.495 +		tdb->ecode = TDB_ERR_CORRUPT;
  44.496 +		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n", 
  44.497 +			   rec->magic, off));
  44.498 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  44.499 +	}
  44.500 +	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
  44.501 +		return -1;
  44.502 +	return 0;
  44.503 +}
  44.504 +
  44.505 +/* update a record tailer (must hold allocation lock) */
  44.506 +static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
  44.507 +			 const struct list_struct *rec)
  44.508 +{
  44.509 +	tdb_off totalsize;
  44.510 +
  44.511 +	/* Offset of tailer from record header */
  44.512 +	totalsize = sizeof(*rec) + rec->rec_len;
  44.513 +	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
  44.514 +			 &totalsize);
  44.515 +}
  44.516 +
  44.517 +static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
  44.518 +{
  44.519 +	struct list_struct rec;
  44.520 +	tdb_off tailer_ofs, tailer;
  44.521 +
  44.522 +	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
  44.523 +		printf("ERROR: failed to read record at %u\n", offset);
  44.524 +		return 0;
  44.525 +	}
  44.526 +
  44.527 +	printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
  44.528 +	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
  44.529 +
  44.530 +	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
  44.531 +	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
  44.532 +		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
  44.533 +		return rec.next;
  44.534 +	}
  44.535 +
  44.536 +	if (tailer != rec.rec_len + sizeof(rec)) {
  44.537 +		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
  44.538 +				(unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
  44.539 +	}
  44.540 +	return rec.next;
  44.541 +}
  44.542 +
  44.543 +static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
  44.544 +{
  44.545 +	tdb_off rec_ptr, top;
  44.546 +
  44.547 +	top = TDB_HASH_TOP(i);
  44.548 +
  44.549 +	if (tdb_lock(tdb, i, F_WRLCK) != 0)
  44.550 +		return -1;
  44.551 +
  44.552 +	if (ofs_read(tdb, top, &rec_ptr) == -1)
  44.553 +		return tdb_unlock(tdb, i, F_WRLCK);
  44.554 +
  44.555 +	if (rec_ptr)
  44.556 +		printf("hash=%d\n", i);
  44.557 +
  44.558 +	while (rec_ptr) {
  44.559 +		rec_ptr = tdb_dump_record(tdb, rec_ptr);
  44.560 +	}
  44.561 +
  44.562 +	return tdb_unlock(tdb, i, F_WRLCK);
  44.563 +}
  44.564 +
  44.565 +void tdb_dump_all(TDB_CONTEXT *tdb)
  44.566 +{
  44.567 +	unsigned int i;
  44.568 +	for (i=0;i<tdb->header.hash_size;i++) {
  44.569 +		tdb_dump_chain(tdb, i);
  44.570 +	}
  44.571 +	printf("freelist:\n");
  44.572 +	tdb_dump_chain(tdb, -1);
  44.573 +}
  44.574 +
  44.575 +int tdb_printfreelist(TDB_CONTEXT *tdb)
  44.576 +{
  44.577 +	int ret;
  44.578 +	long total_free = 0;
  44.579 +	tdb_off offset, rec_ptr;
  44.580 +	struct list_struct rec;
  44.581 +
  44.582 +	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
  44.583 +		return ret;
  44.584 +
  44.585 +	offset = FREELIST_TOP;
  44.586 +
  44.587 +	/* read in the freelist top */
  44.588 +	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
  44.589 +		tdb_unlock(tdb, -1, F_WRLCK);
  44.590 +		return 0;
  44.591 +	}
  44.592 +
  44.593 +	printf("freelist top=[0x%08x]\n", rec_ptr );
  44.594 +	while (rec_ptr) {
  44.595 +		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
  44.596 +			tdb_unlock(tdb, -1, F_WRLCK);
  44.597 +			return -1;
  44.598 +		}
  44.599 +
  44.600 +		if (rec.magic != TDB_FREE_MAGIC) {
  44.601 +			printf("bad magic 0x%08x in free list\n", rec.magic);
  44.602 +			tdb_unlock(tdb, -1, F_WRLCK);
  44.603 +			return -1;
  44.604 +		}
  44.605 +
  44.606 +		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n", 
  44.607 +		       rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
  44.608 +		total_free += rec.rec_len;
  44.609 +
  44.610 +		/* move to the next record */
  44.611 +		rec_ptr = rec.next;
  44.612 +	}
  44.613 +	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free, 
  44.614 +               (int)total_free);
  44.615 +
  44.616 +	return tdb_unlock(tdb, -1, F_WRLCK);
  44.617 +}
  44.618 +
  44.619 +/* Remove an element from the freelist.  Must have alloc lock. */
  44.620 +static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
  44.621 +{
  44.622 +	tdb_off last_ptr, i;
  44.623 +
  44.624 +	/* read in the freelist top */
  44.625 +	last_ptr = FREELIST_TOP;
  44.626 +	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
  44.627 +		if (i == off) {
  44.628 +			/* We've found it! */
  44.629 +			return ofs_write(tdb, last_ptr, &next);
  44.630 +		}
  44.631 +		/* Follow chain (next offset is at start of record) */
  44.632 +		last_ptr = i;
  44.633 +	}
  44.634 +	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
  44.635 +	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  44.636 +}
  44.637 +
  44.638 +/* Add an element into the freelist. Merge adjacent records if
  44.639 +   neccessary. */
  44.640 +static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  44.641 +{
  44.642 +	tdb_off right, left;
  44.643 +
  44.644 +	/* Allocation and tailer lock */
  44.645 +	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
  44.646 +		return -1;
  44.647 +
  44.648 +	/* set an initial tailer, so if we fail we don't leave a bogus record */
  44.649 +	if (update_tailer(tdb, offset, rec) != 0) {
  44.650 +		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
  44.651 +		goto fail;
  44.652 +	}
  44.653 +
  44.654 +	/* Look right first (I'm an Australian, dammit) */
  44.655 +	right = offset + sizeof(*rec) + rec->rec_len;
  44.656 +	if (right + sizeof(*rec) <= tdb->map_size) {
  44.657 +		struct list_struct r;
  44.658 +
  44.659 +		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
  44.660 +			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
  44.661 +			goto left;
  44.662 +		}
  44.663 +
  44.664 +		/* If it's free, expand to include it. */
  44.665 +		if (r.magic == TDB_FREE_MAGIC) {
  44.666 +			if (remove_from_freelist(tdb, right, r.next) == -1) {
  44.667 +				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
  44.668 +				goto left;
  44.669 +			}
  44.670 +			rec->rec_len += sizeof(r) + r.rec_len;
  44.671 +		}
  44.672 +	}
  44.673 +
  44.674 +left:
  44.675 +	/* Look left */
  44.676 +	left = offset - sizeof(tdb_off);
  44.677 +	if (left > TDB_DATA_START(tdb->header.hash_size)) {
  44.678 +		struct list_struct l;
  44.679 +		tdb_off leftsize;
  44.680 +		
  44.681 +		/* Read in tailer and jump back to header */
  44.682 +		if (ofs_read(tdb, left, &leftsize) == -1) {
  44.683 +			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
  44.684 +			goto update;
  44.685 +		}
  44.686 +		left = offset - leftsize;
  44.687 +
  44.688 +		/* Now read in record */
  44.689 +		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
  44.690 +			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
  44.691 +			goto update;
  44.692 +		}
  44.693 +
  44.694 +		/* If it's free, expand to include it. */
  44.695 +		if (l.magic == TDB_FREE_MAGIC) {
  44.696 +			if (remove_from_freelist(tdb, left, l.next) == -1) {
  44.697 +				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
  44.698 +				goto update;
  44.699 +			} else {
  44.700 +				offset = left;
  44.701 +				rec->rec_len += leftsize;
  44.702 +			}
  44.703 +		}
  44.704 +	}
  44.705 +
  44.706 +update:
  44.707 +	if (update_tailer(tdb, offset, rec) == -1) {
  44.708 +		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
  44.709 +		goto fail;
  44.710 +	}
  44.711 +
  44.712 +	/* Now, prepend to free list */
  44.713 +	rec->magic = TDB_FREE_MAGIC;
  44.714 +
  44.715 +	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
  44.716 +	    rec_write(tdb, offset, rec) == -1 ||
  44.717 +	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
  44.718 +		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
  44.719 +		goto fail;
  44.720 +	}
  44.721 +
  44.722 +	/* And we're done. */
  44.723 +	tdb_unlock(tdb, -1, F_WRLCK);
  44.724 +	return 0;
  44.725 +
  44.726 + fail:
  44.727 +	tdb_unlock(tdb, -1, F_WRLCK);
  44.728 +	return -1;
  44.729 +}
  44.730 +
  44.731 +
  44.732 +/* expand a file.  we prefer to use ftruncate, as that is what posix
  44.733 +  says to use for mmap expansion */
  44.734 +static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
  44.735 +{
  44.736 +	char buf[1024];
  44.737 +#if HAVE_FTRUNCATE_EXTEND
  44.738 +	if (ftruncate(tdb->fd, size+addition) != 0) {
  44.739 +		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n", 
  44.740 +			   size+addition, strerror(errno)));
  44.741 +		return -1;
  44.742 +	}
  44.743 +#else
  44.744 +	char b = 0;
  44.745 +
  44.746 +#ifdef HAVE_PWRITE
  44.747 +	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
  44.748 +#else
  44.749 +	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 || 
  44.750 +	    write(tdb->fd, &b, 1) != 1) {
  44.751 +#endif
  44.752 +		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n", 
  44.753 +			   size+addition, strerror(errno)));
  44.754 +		return -1;
  44.755 +	}
  44.756 +#endif
  44.757 +
  44.758 +	/* now fill the file with something. This ensures that the file isn't sparse, which would be
  44.759 +	   very bad if we ran out of disk. This must be done with write, not via mmap */
  44.760 +	memset(buf, 0x42, sizeof(buf));
  44.761 +	while (addition) {
  44.762 +		int n = addition>sizeof(buf)?sizeof(buf):addition;
  44.763 +#ifdef HAVE_PWRITE
  44.764 +		int ret = pwrite(tdb->fd, buf, n, size);
  44.765 +#else
  44.766 +		int ret;
  44.767 +		if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
  44.768 +			return -1;
  44.769 +		ret = write(tdb->fd, buf, n);
  44.770 +#endif
  44.771 +		if (ret != n) {
  44.772 +			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n", 
  44.773 +				   n, strerror(errno)));
  44.774 +			return -1;
  44.775 +		}
  44.776 +		addition -= n;
  44.777 +		size += n;
  44.778 +	}
  44.779 +	return 0;
  44.780 +}
  44.781 +
  44.782 +
  44.783 +/* expand the database at least size bytes by expanding the underlying
  44.784 +   file and doing the mmap again if necessary */
  44.785 +static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
  44.786 +{
  44.787 +	struct list_struct rec;
  44.788 +	tdb_off offset;
  44.789 +
  44.790 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
  44.791 +		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
  44.792 +		return -1;
  44.793 +	}
  44.794 +
  44.795 +	/* must know about any previous expansions by another process */
  44.796 +	tdb_oob(tdb, tdb->map_size + 1, 1);
  44.797 +
  44.798 +	/* always make room for at least 10 more records, and round
  44.799 +           the database up to a multiple of TDB_PAGE_SIZE */
  44.800 +	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
  44.801 +
  44.802 +	if (!(tdb->flags & TDB_INTERNAL))
  44.803 +		tdb_munmap(tdb);
  44.804 +
  44.805 +	/*
  44.806 +	 * We must ensure the file is unmapped before doing this
  44.807 +	 * to ensure consistency with systems like OpenBSD where
  44.808 +	 * writes and mmaps are not consistent.
  44.809 +	 */
  44.810 +
  44.811 +	/* expand the file itself */
  44.812 +	if (!(tdb->flags & TDB_INTERNAL)) {
  44.813 +		if (expand_file(tdb, tdb->map_size, size) != 0)
  44.814 +			goto fail;
  44.815 +	}
  44.816 +
  44.817 +	tdb->map_size += size;
  44.818 +
  44.819 +	if (tdb->flags & TDB_INTERNAL) {
  44.820 +		char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
  44.821 +							tdb->map_size);
  44.822 +		if (!new_map_ptr) {
  44.823 +			tdb->map_size -= size;
  44.824 +			goto fail;
  44.825 +		}
  44.826 +		tdb->map_ptr = new_map_ptr;
  44.827 +	} else {
  44.828 +		/*
  44.829 +		 * We must ensure the file is remapped before adding the space
  44.830 +		 * to ensure consistency with systems like OpenBSD where
  44.831 +		 * writes and mmaps are not consistent.
  44.832 +		 */
  44.833 +
  44.834 +		/* We're ok if the mmap fails as we'll fallback to read/write */
  44.835 +		tdb_mmap(tdb);
  44.836 +	}
  44.837 +
  44.838 +	/* form a new freelist record */
  44.839 +	memset(&rec,'\0',sizeof(rec));
  44.840 +	rec.rec_len = size - sizeof(rec);
  44.841 +
  44.842 +	/* link it into the free list */
  44.843 +	offset = tdb->map_size - size;
  44.844 +	if (tdb_free(tdb, offset, &rec) == -1)
  44.845 +		goto fail;
  44.846 +
  44.847 +	tdb_unlock(tdb, -1, F_WRLCK);
  44.848 +	return 0;
  44.849 + fail:
  44.850 +	tdb_unlock(tdb, -1, F_WRLCK);
  44.851 +	return -1;
  44.852 +}
  44.853 +
  44.854 +
  44.855 +/* 
  44.856 +   the core of tdb_allocate - called when we have decided which
  44.857 +   free list entry to use
  44.858 + */
  44.859 +static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
  44.860 +				struct list_struct *rec, tdb_off last_ptr)
  44.861 +{
  44.862 +	struct list_struct newrec;
  44.863 +	tdb_off newrec_ptr;
  44.864 +
  44.865 +	memset(&newrec, '\0', sizeof(newrec));
  44.866 +
  44.867 +	/* found it - now possibly split it up  */
  44.868 +	if (rec->rec_len > length + MIN_REC_SIZE) {
  44.869 +		/* Length of left piece */
  44.870 +		length = TDB_ALIGN(length, TDB_ALIGNMENT);
  44.871 +		
  44.872 +		/* Right piece to go on free list */
  44.873 +		newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
  44.874 +		newrec_ptr = rec_ptr + sizeof(*rec) + length;
  44.875 +		
  44.876 +		/* And left record is shortened */
  44.877 +		rec->rec_len = length;
  44.878 +	} else {
  44.879 +		newrec_ptr = 0;
  44.880 +	}
  44.881 +	
  44.882 +	/* Remove allocated record from the free list */
  44.883 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
  44.884 +		return 0;
  44.885 +	}
  44.886 +	
  44.887 +	/* Update header: do this before we drop alloc
  44.888 +	   lock, otherwise tdb_free() might try to
  44.889 +	   merge with us, thinking we're free.
  44.890 +	   (Thanks Jeremy Allison). */
  44.891 +	rec->magic = TDB_MAGIC;
  44.892 +	if (rec_write(tdb, rec_ptr, rec) == -1) {
  44.893 +		return 0;
  44.894 +	}
  44.895 +	
  44.896 +	/* Did we create new block? */
  44.897 +	if (newrec_ptr) {
  44.898 +		/* Update allocated record tailer (we
  44.899 +		   shortened it). */
  44.900 +		if (update_tailer(tdb, rec_ptr, rec) == -1) {
  44.901 +			return 0;
  44.902 +		}
  44.903 +		
  44.904 +		/* Free new record */
  44.905 +		if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
  44.906 +			return 0;
  44.907 +		}
  44.908 +	}
  44.909 +	
  44.910 +	/* all done - return the new record offset */
  44.911 +	return rec_ptr;
  44.912 +}
  44.913 +
  44.914 +/* allocate some space from the free list. The offset returned points
  44.915 +   to a unconnected list_struct within the database with room for at
  44.916 +   least length bytes of total data
  44.917 +
  44.918 +   0 is returned if the space could not be allocated
  44.919 + */
  44.920 +static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
  44.921 +			    struct list_struct *rec)
  44.922 +{
  44.923 +	tdb_off rec_ptr, last_ptr, newrec_ptr;
  44.924 +	struct {
  44.925 +		tdb_off rec_ptr, last_ptr;
  44.926 +		tdb_len rec_len;
  44.927 +	} bestfit = { 0, 0, 0 };
  44.928 +
  44.929 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
  44.930 +		return 0;
  44.931 +
  44.932 +	/* Extra bytes required for tailer */
  44.933 +	length += sizeof(tdb_off);
  44.934 +
  44.935 + again:
  44.936 +	last_ptr = FREELIST_TOP;
  44.937 +
  44.938 +	/* read in the freelist top */
  44.939 +	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
  44.940 +		goto fail;
  44.941 +
  44.942 +	bestfit.rec_ptr = 0;
  44.943 +
  44.944 +	/* 
  44.945 +	   this is a best fit allocation strategy. Originally we used
  44.946 +	   a first fit strategy, but it suffered from massive fragmentation
  44.947 +	   issues when faced with a slowly increasing record size.
  44.948 +	 */
  44.949 +	while (rec_ptr) {
  44.950 +		if (rec_free_read(tdb, rec_ptr, rec) == -1) {
  44.951 +			goto fail;
  44.952 +		}
  44.953 +
  44.954 +		if (rec->rec_len >= length) {
  44.955 +			if (bestfit.rec_ptr == 0 ||
  44.956 +			    rec->rec_len < bestfit.rec_len) {
  44.957 +				bestfit.rec_len = rec->rec_len;
  44.958 +				bestfit.rec_ptr = rec_ptr;
  44.959 +				bestfit.last_ptr = last_ptr;
  44.960 +				/* consider a fit to be good enough if we aren't wasting more than half the space */
  44.961 +				if (bestfit.rec_len < 2*length) {
  44.962 +					break;
  44.963 +				}
  44.964 +			}
  44.965 +		}
  44.966 +
  44.967 +		/* move to the next record */
  44.968 +		last_ptr = rec_ptr;
  44.969 +		rec_ptr = rec->next;
  44.970 +	}
  44.971 +
  44.972 +	if (bestfit.rec_ptr != 0) {
  44.973 +		if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
  44.974 +			goto fail;
  44.975 +		}
  44.976 +
  44.977 +		newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
  44.978 +		tdb_unlock(tdb, -1, F_WRLCK);
  44.979 +		return newrec_ptr;
  44.980 +	}
  44.981 +
  44.982 +	/* we didn't find enough space. See if we can expand the
  44.983 +	   database and if we can then try again */
  44.984 +	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
  44.985 +		goto again;
  44.986 + fail:
  44.987 +	tdb_unlock(tdb, -1, F_WRLCK);
  44.988 +	return 0;
  44.989 +}
  44.990 +
  44.991 +/* initialise a new database with a specified hash size */
  44.992 +static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
  44.993 +{
  44.994 +	struct tdb_header *newdb;
  44.995 +	int size, ret = -1;
  44.996 +
  44.997 +	/* We make it up in memory, then write it out if not internal */
  44.998 +	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
  44.999 +	if (!(newdb = talloc_zero_size(tdb, size)))
 44.1000 +		return TDB_ERRCODE(TDB_ERR_OOM, -1);
 44.1001 +
 44.1002 +	/* Fill in the header */
 44.1003 +	newdb->version = TDB_VERSION;
 44.1004 +	newdb->hash_size = hash_size;
 44.1005 +	if (tdb->flags & TDB_INTERNAL) {
 44.1006 +		tdb->map_size = size;
 44.1007 +		tdb->map_ptr = (char *)newdb;
 44.1008 +		memcpy(&tdb->header, newdb, sizeof(tdb->header));
 44.1009 +		/* Convert the `ondisk' version if asked. */
 44.1010 +		CONVERT(*newdb);
 44.1011 +		return 0;
 44.1012 +	}
 44.1013 +	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 44.1014 +		goto fail;
 44.1015 +
 44.1016 +	if (ftruncate(tdb->fd, 0) == -1)
 44.1017 +		goto fail;
 44.1018 +
 44.1019 +	/* This creates an endian-converted header, as if read from disk */
 44.1020 +	CONVERT(*newdb);
 44.1021 +	memcpy(&tdb->header, newdb, sizeof(tdb->header));
 44.1022 +	/* Don't endian-convert the magic food! */
 44.1023 +	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 44.1024 +	if (write(tdb->fd, newdb, size) != size)
 44.1025 +		ret = -1;
 44.1026 +	else
 44.1027 +		ret = 0;
 44.1028 +
 44.1029 +  fail:
 44.1030 +	SAFE_FREE(newdb);
 44.1031 +	return ret;
 44.1032 +}
 44.1033 +
 44.1034 +/* Returns 0 on fail.  On success, return offset of record, and fills
 44.1035 +   in rec */
 44.1036 +static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 44.1037 +			struct list_struct *r)
 44.1038 +{
 44.1039 +	tdb_off rec_ptr;
 44.1040 +	
 44.1041 +	/* read in the hash top */
 44.1042 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 44.1043 +		return 0;
 44.1044 +
 44.1045 +	/* keep looking until we find the right record */
 44.1046 +	while (rec_ptr) {
 44.1047 +		if (rec_read(tdb, rec_ptr, r) == -1)
 44.1048 +			return 0;
 44.1049 +
 44.1050 +		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 44.1051 +			/* a very likely hit - read the key */
 44.1052 +			int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
 44.1053 +			if (cmp < 0)
 44.1054 +				return 0;
 44.1055 +			else if (cmp > 0)
 44.1056 +				return rec_ptr;
 44.1057 +		}
 44.1058 +		rec_ptr = r->next;
 44.1059 +	}
 44.1060 +	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 44.1061 +}
 44.1062 +
 44.1063 +/* As tdb_find, but if you succeed, keep the lock */
 44.1064 +static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
 44.1065 +			     struct list_struct *rec)
 44.1066 +{
 44.1067 +	u32 rec_ptr;
 44.1068 +
 44.1069 +	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 44.1070 +		return 0;
 44.1071 +	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 44.1072 +		tdb_unlock(tdb, BUCKET(hash), locktype);
 44.1073 +	return rec_ptr;
 44.1074 +}
 44.1075 +
 44.1076 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
 44.1077 +{
 44.1078 +	return tdb->ecode;
 44.1079 +}
 44.1080 +
 44.1081 +static struct tdb_errname {
 44.1082 +	enum TDB_ERROR ecode; const char *estring;
 44.1083 +} emap[] = { {TDB_SUCCESS, "Success"},
 44.1084 +	     {TDB_ERR_CORRUPT, "Corrupt database"},
 44.1085 +	     {TDB_ERR_IO, "IO Error"},
 44.1086 +	     {TDB_ERR_LOCK, "Locking error"},
 44.1087 +	     {TDB_ERR_OOM, "Out of memory"},
 44.1088 +	     {TDB_ERR_EXISTS, "Record exists"},
 44.1089 +	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
 44.1090 +	     {TDB_ERR_NOEXIST, "Record does not exist"} };
 44.1091 +
 44.1092 +/* Error string for the last tdb error */
 44.1093 +const char *tdb_errorstr(TDB_CONTEXT *tdb)
 44.1094 +{
 44.1095 +	u32 i;
 44.1096 +	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
 44.1097 +		if (tdb->ecode == emap[i].ecode)
 44.1098 +			return emap[i].estring;
 44.1099 +	return "Invalid error code";
 44.1100 +}
 44.1101 +
 44.1102 +/* update an entry in place - this only works if the new data size
 44.1103 +   is <= the old data size and the key exists.
 44.1104 +   on failure return -1.
 44.1105 +*/
 44.1106 +
 44.1107 +static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
 44.1108 +{
 44.1109 +	struct list_struct rec;
 44.1110 +	tdb_off rec_ptr;
 44.1111 +
 44.1112 +	/* find entry */
 44.1113 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 44.1114 +		return -1;
 44.1115 +
 44.1116 +	/* must be long enough key, data and tailer */
 44.1117 +	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
 44.1118 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 44.1119 +		return -1;
 44.1120 +	}
 44.1121 +
 44.1122 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 44.1123 +		      dbuf.dptr, dbuf.dsize) == -1)
 44.1124 +		return -1;
 44.1125 +
 44.1126 +	if (dbuf.dsize != rec.data_len) {
 44.1127 +		/* update size */
 44.1128 +		rec.data_len = dbuf.dsize;
 44.1129 +		return rec_write(tdb, rec_ptr, &rec);
 44.1130 +	}
 44.1131 + 
 44.1132 +	return 0;
 44.1133 +}
 44.1134 +
 44.1135 +/* find an entry in the database given a key */
 44.1136 +/* If an entry doesn't exist tdb_err will be set to
 44.1137 + * TDB_ERR_NOEXIST. If a key has no data attached
 44.1138 + * then the TDB_DATA will have zero length but
 44.1139 + * a non-zero pointer
 44.1140 + */
 44.1141 +
 44.1142 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
 44.1143 +{
 44.1144 +	tdb_off rec_ptr;
 44.1145 +	struct list_struct rec;
 44.1146 +	TDB_DATA ret;
 44.1147 +	u32 hash;
 44.1148 +
 44.1149 +	/* find which hash bucket it is in */
 44.1150 +	hash = tdb->hash_fn(&key);
 44.1151 +	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 44.1152 +		return tdb_null;
 44.1153 +
 44.1154 +	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 44.1155 +				  rec.data_len);
 44.1156 +	ret.dsize = rec.data_len;
 44.1157 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 44.1158 +	return ret;
 44.1159 +}
 44.1160 +
 44.1161 +/* check if an entry in the database exists 
 44.1162 +
 44.1163 +   note that 1 is returned if the key is found and 0 is returned if not found
 44.1164 +   this doesn't match the conventions in the rest of this module, but is
 44.1165 +   compatible with gdbm
 44.1166 +*/
 44.1167 +static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 44.1168 +{
 44.1169 +	struct list_struct rec;
 44.1170 +	
 44.1171 +	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 44.1172 +		return 0;
 44.1173 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 44.1174 +	return 1;
 44.1175 +}
 44.1176 +
 44.1177 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
 44.1178 +{
 44.1179 +	u32 hash = tdb->hash_fn(&key);
 44.1180 +	return tdb_exists_hash(tdb, key, hash);
 44.1181 +}
 44.1182 +
 44.1183 +/* record lock stops delete underneath */
 44.1184 +static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
 44.1185 +{
 44.1186 +	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
 44.1187 +}
 44.1188 +/*
 44.1189 +  Write locks override our own fcntl readlocks, so check it here.
 44.1190 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 44.1191 +  an error to fail to get the lock here.
 44.1192 +*/
 44.1193 + 
 44.1194 +static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
 44.1195 +{
 44.1196 +	struct tdb_traverse_lock *i;
 44.1197 +	for (i = &tdb->travlocks; i; i = i->next)
 44.1198 +		if (i->off == off)
 44.1199 +			return -1;
 44.1200 +	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
 44.1201 +}
 44.1202 +
 44.1203 +/*
 44.1204 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 44.1205 +  an error to fail to get the lock here.
 44.1206 +*/
 44.1207 +
 44.1208 +static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 44.1209 +{
 44.1210 +	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
 44.1211 +}
 44.1212 +/* fcntl locks don't stack: avoid unlocking someone else's */
 44.1213 +static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 44.1214 +{
 44.1215 +	struct tdb_traverse_lock *i;
 44.1216 +	u32 count = 0;
 44.1217 +
 44.1218 +	if (off == 0)
 44.1219 +		return 0;
 44.1220 +	for (i = &tdb->travlocks; i; i = i->next)
 44.1221 +		if (i->off == off)
 44.1222 +			count++;
 44.1223 +	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
 44.1224 +}
 44.1225 +
 44.1226 +/* actually delete an entry in the database given the offset */
 44.1227 +static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
 44.1228 +{
 44.1229 +	tdb_off last_ptr, i;
 44.1230 +	struct list_struct lastrec;
 44.1231 +
 44.1232 +	if (tdb->read_only) return -1;
 44.1233 +
 44.1234 +	if (write_lock_record(tdb, rec_ptr) == -1) {
 44.1235 +		/* Someone traversing here: mark it as dead */
 44.1236 +		rec->magic = TDB_DEAD_MAGIC;
 44.1237 +		return rec_write(tdb, rec_ptr, rec);
 44.1238 +	}
 44.1239 +	if (write_unlock_record(tdb, rec_ptr) != 0)
 44.1240 +		return -1;
 44.1241 +
 44.1242 +	/* find previous record in hash chain */
 44.1243 +	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 44.1244 +		return -1;
 44.1245 +	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 44.1246 +		if (rec_read(tdb, i, &lastrec) == -1)
 44.1247 +			return -1;
 44.1248 +
 44.1249 +	/* unlink it: next ptr is at start of record. */
 44.1250 +	if (last_ptr == 0)
 44.1251 +		last_ptr = TDB_HASH_TOP(rec->full_hash);
 44.1252 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 44.1253 +		return -1;
 44.1254 +
 44.1255 +	/* recover the space */
 44.1256 +	if (tdb_free(tdb, rec_ptr, rec) == -1)
 44.1257 +		return -1;
 44.1258 +	return 0;
 44.1259 +}
 44.1260 +
 44.1261 +/* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
 44.1262 +static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
 44.1263 +			 struct list_struct *rec)
 44.1264 +{
 44.1265 +	int want_next = (tlock->off != 0);
 44.1266 +
 44.1267 +	/* Lock each chain from the start one. */
 44.1268 +	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
 44.1269 +
 44.1270 +		/* this is an optimisation for the common case where
 44.1271 +		   the hash chain is empty, which is particularly
 44.1272 +		   common for the use of tdb with ldb, where large
 44.1273 +		   hashes are used. In that case we spend most of our
 44.1274 +		   time in tdb_brlock(), locking empty hash chains.
 44.1275 +
 44.1276 +		   To avoid this, we do an unlocked pre-check to see
 44.1277 +		   if the hash chain is empty before starting to look
 44.1278 +		   inside it. If it is empty then we can avoid that
 44.1279 +		   hash chain. If it isn't empty then we can't believe
 44.1280 +		   the value we get back, as we read it without a
 44.1281 +		   lock, so instead we get the lock and re-fetch the
 44.1282 +		   value below.
 44.1283 +
 44.1284 +		   Notice that not doing this optimisation on the
 44.1285 +		   first hash chain is critical. We must guarantee
 44.1286 +		   that we have done at least one fcntl lock at the
 44.1287 +		   start of a search to guarantee that memory is
 44.1288 +		   coherent on SMP systems. If records are added by
 44.1289 +		   others during the search then thats OK, and we
 44.1290 +		   could possibly miss those with this trick, but we
 44.1291 +		   could miss them anyway without this trick, so the
 44.1292 +		   semantics don't change.
 44.1293 +
 44.1294 +		   With a non-indexed ldb search this trick gains us a
 44.1295 +		   factor of around 80 in speed on a linux 2.6.x
 44.1296 +		   system (testing using ldbtest).
 44.1297 +		 */
 44.1298 +		if (!tlock->off && tlock->hash != 0) {
 44.1299 +			u32 off;
 44.1300 +			if (tdb->map_ptr) {
 44.1301 +				for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {
 44.1302 +					if (0 != *(u32 *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
 44.1303 +						break;
 44.1304 +					}
 44.1305 +				}
 44.1306 +				if (tlock->hash == tdb->header.hash_size) {
 44.1307 +					continue;
 44.1308 +				}
 44.1309 +			} else {
 44.1310 +				if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
 44.1311 +				    off == 0) {
 44.1312 +					continue;
 44.1313 +				}
 44.1314 +			}
 44.1315 +		}
 44.1316 +
 44.1317 +		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
 44.1318 +			return -1;
 44.1319 +
 44.1320 +		/* No previous record?  Start at top of chain. */
 44.1321 +		if (!tlock->off) {
 44.1322 +			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
 44.1323 +				     &tlock->off) == -1)
 44.1324 +				goto fail;
 44.1325 +		} else {
 44.1326 +			/* Otherwise unlock the previous record. */
 44.1327 +			if (unlock_record(tdb, tlock->off) != 0)
 44.1328 +				goto fail;
 44.1329 +		}
 44.1330 +
 44.1331 +		if (want_next) {
 44.1332 +			/* We have offset of old record: grab next */
 44.1333 +			if (rec_read(tdb, tlock->off, rec) == -1)
 44.1334 +				goto fail;
 44.1335 +			tlock->off = rec->next;
 44.1336 +		}
 44.1337 +
 44.1338 +		/* Iterate through chain */
 44.1339 +		while( tlock->off) {
 44.1340 +			tdb_off current;
 44.1341 +			if (rec_read(tdb, tlock->off, rec) == -1)
 44.1342 +				goto fail;
 44.1343 +
 44.1344 +			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
 44.1345 +			if (tlock->off == rec->next) {
 44.1346 +				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
 44.1347 +				goto fail;
 44.1348 +			}
 44.1349 +
 44.1350 +			if (!TDB_DEAD(rec)) {
 44.1351 +				/* Woohoo: we found one! */
 44.1352 +				if (lock_record(tdb, tlock->off) != 0)
 44.1353 +					goto fail;
 44.1354 +				return tlock->off;
 44.1355 +			}
 44.1356 +
 44.1357 +			/* Try to clean dead ones from old traverses */
 44.1358 +			current = tlock->off;
 44.1359 +			tlock->off = rec->next;
 44.1360 +			if (!tdb->read_only && 
 44.1361 +			    do_delete(tdb, current, rec) != 0)
 44.1362 +				goto fail;
 44.1363 +		}
 44.1364 +		tdb_unlock(tdb, tlock->hash, F_WRLCK);
 44.1365 +		want_next = 0;
 44.1366 +	}
 44.1367 +	/* We finished iteration without finding anything */
 44.1368 +	return TDB_ERRCODE(TDB_SUCCESS, 0);
 44.1369 +
 44.1370 + fail:
 44.1371 +	tlock->off = 0;
 44.1372 +	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
 44.1373 +		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
 44.1374 +	return -1;
 44.1375 +}
 44.1376 +
 44.1377 +/* traverse the entire database - calling fn(tdb, key, data) on each element.
 44.1378 +   return -1 on error or the record count traversed
 44.1379 +   if fn is NULL then it is not called
 44.1380 +   a non-zero return value from fn() indicates that the traversal should stop
 44.1381 +  */
 44.1382 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
 44.1383 +{
 44.1384 +	TDB_DATA key, dbuf;
 44.1385 +	struct list_struct rec;
 44.1386 +	struct tdb_traverse_lock tl = { NULL, 0, 0 };
 44.1387 +	int ret, count = 0;
 44.1388 +
 44.1389 +	/* This was in the initializaton, above, but the IRIX compiler
 44.1390 +	 * did not like it.  crh
 44.1391 +	 */
 44.1392 +	tl.next = tdb->travlocks.next;
 44.1393 +
 44.1394 +	/* fcntl locks don't stack: beware traverse inside traverse */
 44.1395 +	tdb->travlocks.next = &tl;
 44.1396 +
 44.1397 +	/* tdb_next_lock places locks on the record returned, and its chain */
 44.1398 +	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
 44.1399 +		count++;
 44.1400 +		/* now read the full record */
 44.1401 +		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec), 
 44.1402 +					  rec.key_len + rec.data_len);
 44.1403 +		if (!key.dptr) {
 44.1404 +			ret = -1;
 44.1405 +			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
 44.1406 +				goto out;
 44.1407 +			if (unlock_record(tdb, tl.off) != 0)
 44.1408 +				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
 44.1409 +			goto out;
 44.1410 +		}
 44.1411 +		key.dsize = rec.key_len;
 44.1412 +		dbuf.dptr = key.dptr + rec.key_len;
 44.1413 +		dbuf.dsize = rec.data_len;
 44.1414 +
 44.1415 +		/* Drop chain lock, call out */
 44.1416 +		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
 44.1417 +			ret = -1;
 44.1418 +			goto out;
 44.1419 +		}
 44.1420 +		if (fn && fn(tdb, key, dbuf, private)) {
 44.1421 +			/* They want us to terminate traversal */
 44.1422 +			ret = count;
 44.1423 +			if (unlock_record(tdb, tl.off) != 0) {
 44.1424 +				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
 44.1425 +				ret = -1;
 44.1426 +			}
 44.1427 +			tdb->travlocks.next = tl.next;
 44.1428 +			SAFE_FREE(key.dptr);
 44.1429 +			return count;
 44.1430 +		}
 44.1431 +		SAFE_FREE(key.dptr);
 44.1432 +	}
 44.1433 +out:
 44.1434 +	tdb->travlocks.next = tl.next;
 44.1435 +	if (ret < 0)
 44.1436 +		return -1;
 44.1437 +	else
 44.1438 +		return count;
 44.1439 +}
 44.1440 +
 44.1441 +/* find the first entry in the database and return its key */
 44.1442 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
 44.1443 +{
 44.1444 +	TDB_DATA key;
 44.1445 +	struct list_struct rec;
 44.1446 +
 44.1447 +	/* release any old lock */
 44.1448 +	if (unlock_record(tdb, tdb->travlocks.off) != 0)
 44.1449 +		return tdb_null;
 44.1450 +	tdb->travlocks.off = tdb->travlocks.hash = 0;
 44.1451 +
 44.1452 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
 44.1453 +		return tdb_null;
 44.1454 +	/* now read the key */
 44.1455 +	key.dsize = rec.key_len;
 44.1456 +	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
 44.1457 +	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
 44.1458 +		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
 44.1459 +	return key;
 44.1460 +}
 44.1461 +
 44.1462 +/* find the next entry in the database, returning its key */
 44.1463 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
 44.1464 +{
 44.1465 +	u32 oldhash;
 44.1466 +	TDB_DATA key = tdb_null;
 44.1467 +	struct list_struct rec;
 44.1468 +	char *k = NULL;
 44.1469 +
 44.1470 +	/* Is locked key the old key?  If so, traverse will be reliable. */
 44.1471 +	if (tdb->travlocks.off) {
 44.1472 +		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
 44.1473 +			return tdb_null;
 44.1474 +		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
 44.1475 +		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
 44.1476 +					    rec.key_len))
 44.1477 +		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
 44.1478 +			/* No, it wasn't: unlock it and start from scratch */
 44.1479 +			if (unlock_record(tdb, tdb->travlocks.off) != 0)
 44.1480 +				return tdb_null;
 44.1481 +			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 44.1482 +				return tdb_null;
 44.1483 +			tdb->travlocks.off = 0;
 44.1484 +		}
 44.1485 +
 44.1486 +		SAFE_FREE(k);
 44.1487 +	}
 44.1488 +
 44.1489 +	if (!tdb->travlocks.off) {
 44.1490 +		/* No previous element: do normal find, and lock record */
 44.1491 +		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
 44.1492 +		if (!tdb->travlocks.off)
 44.1493 +			return tdb_null;
 44.1494 +		tdb->travlocks.hash = BUCKET(rec.full_hash);
 44.1495 +		if (lock_record(tdb, tdb->travlocks.off) != 0) {
 44.1496 +			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
 44.1497 +			return tdb_null;
 44.1498 +		}
 44.1499 +	}
 44.1500 +	oldhash = tdb->travlocks.hash;
 44.1501 +
 44.1502 +	/* Grab next record: locks chain and returned record,
 44.1503 +	   unlocks old record */
 44.1504 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
 44.1505 +		key.dsize = rec.key_len;
 44.1506 +		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
 44.1507 +					  key.dsize);
 44.1508 +		/* Unlock the chain of this new record */
 44.1509 +		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 44.1510 +			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 44.1511 +	}
 44.1512 +	/* Unlock the chain of old record */
 44.1513 +	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
 44.1514 +		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 44.1515 +	return key;
 44.1516 +}
 44.1517 +
 44.1518 +/* delete an entry in the database given a key */
 44.1519 +static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 44.1520 +{
 44.1521 +	tdb_off rec_ptr;
 44.1522 +	struct list_struct rec;
 44.1523 +	int ret;
 44.1524 +
 44.1525 +	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
 44.1526 +		return -1;
 44.1527 +	ret = do_delete(tdb, rec_ptr, &rec);
 44.1528 +	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
 44.1529 +		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
 44.1530 +	return ret;
 44.1531 +}
 44.1532 +
 44.1533 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
 44.1534 +{
 44.1535 +	u32 hash = tdb->hash_fn(&key);
 44.1536 +	return tdb_delete_hash(tdb, key, hash);
 44.1537 +}
 44.1538 +
 44.1539 +/* store an element in the database, replacing any existing element
 44.1540 +   with the same key 
 44.1541 +
 44.1542 +   return 0 on success, -1 on failure
 44.1543 +*/
 44.1544 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 44.1545 +{
 44.1546 +	struct list_struct rec;
 44.1547 +	u32 hash;
 44.1548 +	tdb_off rec_ptr;
 44.1549 +	char *p = NULL;
 44.1550 +	int ret = 0;
 44.1551 +
 44.1552 +	/* find which hash bucket it is in */
 44.1553 +	hash = tdb->hash_fn(&key);
 44.1554 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 44.1555 +		return -1;
 44.1556 +
 44.1557 +	/* check for it existing, on insert. */
 44.1558 +	if (flag == TDB_INSERT) {
 44.1559 +		if (tdb_exists_hash(tdb, key, hash)) {
 44.1560 +			tdb->ecode = TDB_ERR_EXISTS;
 44.1561 +			goto fail;
 44.1562 +		}
 44.1563 +	} else {
 44.1564 +		/* first try in-place update, on modify or replace. */
 44.1565 +		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
 44.1566 +			goto out;
 44.1567 +		if (tdb->ecode == TDB_ERR_NOEXIST &&
 44.1568 +		    flag == TDB_MODIFY) {
 44.1569 +			/* if the record doesn't exist and we are in TDB_MODIFY mode then
 44.1570 +			 we should fail the store */
 44.1571 +			goto fail;
 44.1572 +		}
 44.1573 +	}
 44.1574 +	/* reset the error code potentially set by tdb_update_hash() */
 44.1575 +	tdb->ecode = TDB_SUCCESS;
 44.1576 +
 44.1577 +	/* delete any existing record - if it doesn't exist we don't
 44.1578 +           care.  Doing this first reduces fragmentation, and avoids
 44.1579 +           coalescing with `allocated' block before it's updated. */
 44.1580 +	if (flag != TDB_INSERT)
 44.1581 +		tdb_delete_hash(tdb, key, hash);
 44.1582 +
 44.1583 +	/* Copy key+value *before* allocating free space in case malloc
 44.1584 +	   fails and we are left with a dead spot in the tdb. */
 44.1585 +
 44.1586 +	if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
 44.1587 +		tdb->ecode = TDB_ERR_OOM;
 44.1588 +		goto fail;
 44.1589 +	}
 44.1590 +
 44.1591 +	memcpy(p, key.dptr, key.dsize);
 44.1592 +	if (dbuf.dsize)
 44.1593 +		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 44.1594 +
 44.1595 +	/* we have to allocate some space */
 44.1596 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
 44.1597 +		goto fail;
 44.1598 +
 44.1599 +	/* Read hash top into next ptr */
 44.1600 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 44.1601 +		goto fail;
 44.1602 +
 44.1603 +	rec.key_len = key.dsize;
 44.1604 +	rec.data_len = dbuf.dsize;
 44.1605 +	rec.full_hash = hash;
 44.1606 +	rec.magic = TDB_MAGIC;
 44.1607 +
 44.1608 +	/* write out and point the top of the hash chain at it */
 44.1609 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 44.1610 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 44.1611 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 44.1612 +		/* Need to tdb_unallocate() here */
 44.1613 +		goto fail;
 44.1614 +	}
 44.1615 + out:
 44.1616 +	SAFE_FREE(p); 
 44.1617 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 44.1618 +	return ret;
 44.1619 +fail:
 44.1620 +	ret = -1;
 44.1621 +	goto out;
 44.1622 +}
 44.1623 +
 44.1624 +/* Attempt to append data to an entry in place - this only works if the key
 44.1625 +   exists and the record has room for the key, old data, new data and tailer.
 44.1626 +   On failure return -1. Record must be locked before calling.
 44.1627 +*/
 44.1628 +static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
 44.1629 +{
 44.1630 +	struct list_struct rec;
 44.1631 +	tdb_off rec_ptr;
 44.1632 +
 44.1633 +	/* find entry */
 44.1634 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 44.1635 +		return -1;
 44.1636 +
 44.1637 +	/* Append of 0 is always ok. */
 44.1638 +	if (new_dbuf.dsize == 0)
 44.1639 +		return 0;
 44.1640 +
 44.1641 +	/* must be long enough for key, old data + new data and tailer */
 44.1642 +	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
 44.1643 +		/* No room. */
 44.1644 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 44.1645 +		return -1;
 44.1646 +	}
 44.1647 +
 44.1648 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
 44.1649 +		      new_dbuf.dptr, new_dbuf.dsize) == -1)
 44.1650 +		return -1;
 44.1651 +
 44.1652 +	/* update size */
 44.1653 +	rec.data_len += new_dbuf.dsize;
 44.1654 +	return rec_write(tdb, rec_ptr, &rec);
 44.1655 +}
 44.1656 +
 44.1657 +/* Append to an entry. Create it if it does not exist. */
 44.1658 +
 44.1659 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 44.1660 +{
 44.1661 +	struct list_struct rec;
 44.1662 +	u32 hash;
 44.1663 +	tdb_off rec_ptr;
 44.1664 +	char *p = NULL;
 44.1665 +	int ret = 0;
 44.1666 +	size_t new_data_size = 0;
 44.1667 +
 44.1668 +	/* find which hash bucket it is in */
 44.1669 +	hash = tdb->hash_fn(&key);
 44.1670 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 44.1671 +		return -1;
 44.1672 +
 44.1673 +	/* first try in-place. */
 44.1674 +	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
 44.1675 +		goto out;
 44.1676 +
 44.1677 +	/* reset the error code potentially set by the tdb_append_inplace() */
 44.1678 +	tdb->ecode = TDB_SUCCESS;
 44.1679 +
 44.1680 +	/* find entry */
 44.1681 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
 44.1682 +		if (tdb->ecode != TDB_ERR_NOEXIST)
 44.1683 +			goto fail;
 44.1684 +
 44.1685 +		/* Not found - create. */
 44.1686 +
 44.1687 +		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
 44.1688 +		goto out;
 44.1689 +	}
 44.1690 +
 44.1691 +	new_data_size = rec.data_len + new_dbuf.dsize;
 44.1692 +
 44.1693 +	/* Copy key+old_value+value *before* allocating free space in case malloc
 44.1694 +	   fails and we are left with a dead spot in the tdb. */
 44.1695 +
 44.1696 +	if (!(p = (char *)talloc_size(tdb, key.dsize + new_data_size))) {
 44.1697 +		tdb->ecode = TDB_ERR_OOM;
 44.1698 +		goto fail;
 44.1699 +	}
 44.1700 +
 44.1701 +	/* Copy the key in place. */
 44.1702 +	memcpy(p, key.dptr, key.dsize);
 44.1703 +
 44.1704 +	/* Now read the old data into place. */
 44.1705 +	if (rec.data_len &&
 44.1706 +		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
 44.1707 +			goto fail;
 44.1708 +
 44.1709 +	/* Finally append the new data. */
 44.1710 +	if (new_dbuf.dsize)
 44.1711 +		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
 44.1712 +
 44.1713 +	/* delete any existing record - if it doesn't exist we don't
 44.1714 +           care.  Doing this first reduces fragmentation, and avoids
 44.1715 +           coalescing with `allocated' block before it's updated. */
 44.1716 +
 44.1717 +	tdb_delete_hash(tdb, key, hash);
 44.1718 +
 44.1719 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
 44.1720 +		goto fail;
 44.1721 +
 44.1722 +	/* Read hash top into next ptr */
 44.1723 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 44.1724 +		goto fail;
 44.1725 +
 44.1726 +	rec.key_len = key.dsize;
 44.1727 +	rec.data_len = new_data_size;
 44.1728 +	rec.full_hash = hash;
 44.1729 +	rec.magic = TDB_MAGIC;
 44.1730 +
 44.1731 +	/* write out and point the top of the hash chain at it */
 44.1732 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 44.1733 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
 44.1734 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 44.1735 +		/* Need to tdb_unallocate() here */
 44.1736 +		goto fail;
 44.1737 +	}
 44.1738 +
 44.1739 + out:
 44.1740 +	SAFE_FREE(p); 
 44.1741 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 44.1742 +	return ret;
 44.1743 +
 44.1744 +fail:
 44.1745 +	ret = -1;
 44.1746 +	goto out;
 44.1747 +}
 44.1748 +
 44.1749 +static int tdb_already_open(dev_t device,
 44.1750 +			    ino_t ino)
 44.1751 +{
 44.1752 +	TDB_CONTEXT *i;
 44.1753 +	
 44.1754 +	for (i = tdbs; i; i = i->next) {
 44.1755 +		if (i->device == device && i->inode == ino) {
 44.1756 +			return 1;
 44.1757 +		}
 44.1758 +	}
 44.1759 +
 44.1760 +	return 0;
 44.1761 +}
 44.1762 +
 44.1763 +/* open the database, creating it if necessary 
 44.1764 +
 44.1765 +   The open_flags and mode are passed straight to the open call on the
 44.1766 +   database file. A flags value of O_WRONLY is invalid. The hash size
 44.1767 +   is advisory, use zero for a default value.
 44.1768 +
 44.1769 +   Return is NULL on error, in which case errno is also set.  Don't 
 44.1770 +   try to call tdb_error or tdb_errname, just do strerror(errno).
 44.1771 +
 44.1772 +   @param name may be NULL for internal databases. */
 44.1773 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
 44.1774 +		      int open_flags, mode_t mode)
 44.1775 +{
 44.1776 +	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
 44.1777 +}
 44.1778 +
 44.1779 +/* a default logging function */
 44.1780 +static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
 44.1781 +			int level __attribute__((unused)),
 44.1782 +			const char *fmt __attribute__((unused)), ...)
 44.1783 +{
 44.1784 +}
 44.1785 +
 44.1786 +
 44.1787 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
 44.1788 +			 int open_flags, mode_t mode,
 44.1789 +			 tdb_log_func log_fn,
 44.1790 +			 tdb_hash_func hash_fn)
 44.1791 +{
 44.1792 +	TDB_CONTEXT *tdb;
 44.1793 +	struct stat st;
 44.1794 +	int rev = 0, locked = 0;
 44.1795 +	uint8_t *vp;
 44.1796 +	u32 vertest;
 44.1797 +
 44.1798 +	if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
 44.1799 +		/* Can't log this */
 44.1800 +		errno = ENOMEM;
 44.1801 +		goto fail;
 44.1802 +	}
 44.1803 +	tdb->fd = -1;
 44.1804 +	tdb->name = NULL;
 44.1805 +	tdb->map_ptr = NULL;
 44.1806 +	tdb->flags = tdb_flags;
 44.1807 +	tdb->open_flags = open_flags;
 44.1808 +	tdb->log_fn = log_fn?log_fn:null_log_fn;
 44.1809 +	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
 44.1810 +
 44.1811 +	if ((open_flags & O_ACCMODE) == O_WRONLY) {
 44.1812 +		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
 44.1813 +			 name));
 44.1814 +		errno = EINVAL;
 44.1815 +		goto fail;
 44.1816 +	}
 44.1817 +	
 44.1818 +	if (hash_size == 0)
 44.1819 +		hash_size = DEFAULT_HASH_SIZE;
 44.1820 +	if ((open_flags & O_ACCMODE) == O_RDONLY) {
 44.1821 +		tdb->read_only = 1;
 44.1822 +		/* read only databases don't do locking or clear if first */
 44.1823 +		tdb->flags |= TDB_NOLOCK;
 44.1824 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 44.1825 +	}
 44.1826 +
 44.1827 +	/* internal databases don't mmap or lock, and start off cleared */
 44.1828 +	if (tdb->flags & TDB_INTERNAL) {
 44.1829 +		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 44.1830 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 44.1831 +		if (tdb_new_database(tdb, hash_size) != 0) {
 44.1832 +			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
 44.1833 +			goto fail;
 44.1834 +		}
 44.1835 +		goto internal;
 44.1836 +	}
 44.1837 +
 44.1838 +	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
 44.1839 +		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
 44.1840 +			 name, strerror(errno)));
 44.1841 +		goto fail;	/* errno set by open(2) */
 44.1842 +	}
 44.1843 +
 44.1844 +	/* ensure there is only one process initialising at once */
 44.1845 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
 44.1846 +		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
 44.1847 +			 name, strerror(errno)));
 44.1848 +		goto fail;	/* errno set by tdb_brlock */
 44.1849 +	}
 44.1850 +
 44.1851 +	/* we need to zero database if we are the only one with it open */
 44.1852 +	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
 44.1853 +		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
 44.1854 +		open_flags |= O_CREAT;
 44.1855 +		if (ftruncate(tdb->fd, 0) == -1) {
 44.1856 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 44.1857 +				 "failed to truncate %s: %s\n",
 44.1858 +				 name, strerror(errno)));
 44.1859 +			goto fail; /* errno set by ftruncate */
 44.1860 +		}
 44.1861 +	}
 44.1862 +
 44.1863 +	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
 44.1864 +	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
 44.1865 +	    || (tdb->header.version != TDB_VERSION
 44.1866 +		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
 44.1867 +		/* it's not a valid database - possibly initialise it */
 44.1868 +		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
 44.1869 +			errno = EIO; /* ie bad format or something */
 44.1870 +			goto fail;
 44.1871 +		}
 44.1872 +		rev = (tdb->flags & TDB_CONVERT);
 44.1873 +	}
 44.1874 +	vp = (uint8_t *)&tdb->header.version;
 44.1875 +	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
 44.1876 +		  (((u32)vp[2]) << 8) | (u32)vp[3];
 44.1877 +	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
 44.1878 +	if (!rev)
 44.1879 +		tdb->flags &= ~TDB_CONVERT;
 44.1880 +	else {
 44.1881 +		tdb->flags |= TDB_CONVERT;
 44.1882 +		convert(&tdb->header, sizeof(tdb->header));
 44.1883 +	}
 44.1884 +	if (fstat(tdb->fd, &st) == -1)
 44.1885 +		goto fail;
 44.1886 +
 44.1887 +	/* Is it already in the open list?  If so, fail. */
 44.1888 +	if (tdb_already_open(st.st_dev, st.st_ino)) {
 44.1889 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 44.1890 +			 "%s (%d,%d) is already open in this process\n",
 44.1891 +			 name, (int)st.st_dev, (int)st.st_ino));
 44.1892 +		errno = EBUSY;
 44.1893 +		goto fail;
 44.1894 +	}
 44.1895 +
 44.1896 +	if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
 44.1897 +		errno = ENOMEM;
 44.1898 +		goto fail;
 44.1899 +	}
 44.1900 +
 44.1901 +	tdb->map_size = st.st_size;
 44.1902 +	tdb->device = st.st_dev;
 44.1903 +	tdb->inode = st.st_ino;
 44.1904 +	tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
 44.1905 +					tdb->header.hash_size+1);
 44.1906 +	if (!tdb->locked) {
 44.1907 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 44.1908 +			 "failed to allocate lock structure for %s\n",
 44.1909 +			 name));
 44.1910 +		errno = ENOMEM;
 44.1911 +		goto fail;
 44.1912 +	}
 44.1913 +	tdb_mmap(tdb);
 44.1914 +	if (locked) {
 44.1915 +		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
 44.1916 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 44.1917 +				 "failed to take ACTIVE_LOCK on %s: %s\n",
 44.1918 +				 name, strerror(errno)));
 44.1919 +			goto fail;
 44.1920 +		}
 44.1921 +
 44.1922 +	}
 44.1923 +
 44.1924 +	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
 44.1925 +	   we didn't get the initial exclusive lock as we need to let all other
 44.1926 +	   users know we're using it. */
 44.1927 +
 44.1928 +	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
 44.1929 +	/* leave this lock in place to indicate it's in use */
 44.1930 +	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
 44.1931 +		goto fail;
 44.1932 +	}
 44.1933 +
 44.1934 +
 44.1935 + internal:
 44.1936 +	/* Internal (memory-only) databases skip all the code above to
 44.1937 +	 * do with disk files, and resume here by releasing their
 44.1938 +	 * global lock and hooking into the active list. */
 44.1939 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
 44.1940 +		goto fail;
 44.1941 +	tdb->next = tdbs;
 44.1942 +	tdbs = tdb;
 44.1943 +	return tdb;
 44.1944 +
 44.1945 + fail:
 44.1946 +	{ int save_errno = errno;
 44.1947 +
 44.1948 +	if (!tdb)
 44.1949 +		return NULL;
 44.1950 +	
 44.1951 +	if (tdb->map_ptr) {
 44.1952 +		if (tdb->flags & TDB_INTERNAL)
 44.1953 +			SAFE_FREE(tdb->map_ptr);
 44.1954 +		else
 44.1955 +			tdb_munmap(tdb);
 44.1956 +	}
 44.1957 +	SAFE_FREE(tdb->name);
 44.1958 +	if (tdb->fd != -1)
 44.1959 +		if (close(tdb->fd) != 0)
 44.1960 +			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
 44.1961 +	SAFE_FREE(tdb->locked);
 44.1962 +	SAFE_FREE(tdb);
 44.1963 +	errno = save_errno;
 44.1964 +	return NULL;
 44.1965 +	}
 44.1966 +}
 44.1967 +
 44.1968 +/**
 44.1969 + * Close a database.
 44.1970 + *
 44.1971 + * @returns -1 for error; 0 for success.
 44.1972 + **/
 44.1973 +int tdb_close(TDB_CONTEXT *tdb)
 44.1974 +{
 44.1975 +	TDB_CONTEXT **i;
 44.1976 +	int ret = 0;
 44.1977 +
 44.1978 +	if (tdb->map_ptr) {
 44.1979 +		if (tdb->flags & TDB_INTERNAL)
 44.1980 +			SAFE_FREE(tdb->map_ptr);
 44.1981 +		else
 44.1982 +			tdb_munmap(tdb);
 44.1983 +	}
 44.1984 +	SAFE_FREE(tdb->name);
 44.1985 +	if (tdb->fd != -1)
 44.1986 +		ret = close(tdb->fd);
 44.1987 +	SAFE_FREE(tdb->locked);
 44.1988 +
 44.1989 +	/* Remove from contexts list */
 44.1990 +	for (i = &tdbs; *i; i = &(*i)->next) {
 44.1991 +		if (*i == tdb) {
 44.1992 +			*i = tdb->next;
 44.1993 +			break;
 44.1994 +		}
 44.1995 +	}
 44.1996 +
 44.1997 +	memset(tdb, 0, sizeof(*tdb));
 44.1998 +	SAFE_FREE(tdb);
 44.1999 +
 44.2000 +	return ret;
 44.2001 +}
 44.2002 +
 44.2003 +/* lock/unlock entire database */
 44.2004 +int tdb_lockall(TDB_CONTEXT *tdb)
 44.2005 +{
 44.2006 +	u32 i;
 44.2007 +
 44.2008 +	/* There are no locks on read-only dbs */
 44.2009 +	if (tdb->read_only)
 44.2010 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 44.2011 +	for (i = 0; i < tdb->header.hash_size; i++) 
 44.2012 +		if (tdb_lock(tdb, i, F_WRLCK))
 44.2013 +			break;
 44.2014 +
 44.2015 +	/* If error, release locks we have... */
 44.2016 +	if (i < tdb->header.hash_size) {
 44.2017 +		u32 j;
 44.2018 +
 44.2019 +		for ( j = 0; j < i; j++)
 44.2020 +			tdb_unlock(tdb, j, F_WRLCK);
 44.2021 +		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
 44.2022 +	}
 44.2023 +
 44.2024 +	return 0;
 44.2025 +}
 44.2026 +void tdb_unlockall(TDB_CONTEXT *tdb)
 44.2027 +{
 44.2028 +	u32 i;
 44.2029 +	for (i=0; i < tdb->header.hash_size; i++)
 44.2030 +		tdb_unlock(tdb, i, F_WRLCK);
 44.2031 +}
 44.2032 +
 44.2033 +/* lock/unlock one hash chain. This is meant to be used to reduce
 44.2034 +   contention - it cannot guarantee how many records will be locked */
 44.2035 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
 44.2036 +{
 44.2037 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 44.2038 +}
 44.2039 +
 44.2040 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
 44.2041 +{
 44.2042 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 44.2043 +}
 44.2044 +
 44.2045 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 44.2046 +{
 44.2047 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 44.2048 +}
 44.2049 +
 44.2050 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 44.2051 +{
 44.2052 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 44.2053 +}
 44.2054 +
 44.2055 +
 44.2056 +/* register a logging function */
 44.2057 +void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
 44.2058 +{
 44.2059 +	tdb->log_fn = fn?fn:null_log_fn;
 44.2060 +}
 44.2061 +
 44.2062 +
 44.2063 +/* reopen a tdb - this can be used after a fork to ensure that we have an independent
 44.2064 +   seek pointer from our parent and to re-establish locks */
 44.2065 +int tdb_reopen(TDB_CONTEXT *tdb)
 44.2066 +{
 44.2067 +	struct stat st;
 44.2068 +
 44.2069 +	if (tdb->flags & TDB_INTERNAL)
 44.2070 +		return 0; /* Nothing to do. */
 44.2071 +	if (tdb_munmap(tdb) != 0) {
 44.2072 +		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
 44.2073 +		goto fail;
 44.2074 +	}
 44.2075 +	if (close(tdb->fd) != 0)
 44.2076 +		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
 44.2077 +	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
 44.2078 +	if (tdb->fd == -1) {
 44.2079 +		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
 44.2080 +		goto fail;
 44.2081 +	}
 44.2082 +	if (fstat(tdb->fd, &st) != 0) {
 44.2083 +		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
 44.2084 +		goto fail;
 44.2085 +	}
 44.2086 +	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
 44.2087 +		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
 44.2088 +		goto fail;
 44.2089 +	}
 44.2090 +	tdb_mmap(tdb);
 44.2091 +	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
 44.2092 +		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
 44.2093 +		goto fail;
 44.2094 +	}
 44.2095 +
 44.2096 +	return 0;
 44.2097 +
 44.2098 +fail:
 44.2099 +	tdb_close(tdb);
 44.2100 +	return -1;
 44.2101 +}
 44.2102 +
 44.2103 +/* Not general: only works if single writer. */
 44.2104 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile)
 44.2105 +{
 44.2106 +	int fd, saved_errno;
 44.2107 +	TDB_CONTEXT *copy;
 44.2108 +
 44.2109 +	fd = open(outfile, O_TRUNC|O_CREAT|O_WRONLY, 0640);
 44.2110 +	if (fd < 0)
 44.2111 +		return NULL;
 44.2112 +	if (tdb->map_ptr) {
 44.2113 +		if (write(fd,tdb->map_ptr,tdb->map_size) != (int)tdb->map_size)
 44.2114 +			goto fail;
 44.2115 +	} else {
 44.2116 +		char buf[65536];
 44.2117 +		int r;
 44.2118 +
 44.2119 +		lseek(tdb->fd, 0, SEEK_SET);
 44.2120 +		while ((r = read(tdb->fd, buf, sizeof(buf))) > 0) {
 44.2121 +			if (write(fd, buf, r) != r)
 44.2122 +				goto fail;
 44.2123 +		}
 44.2124 +		if (r < 0)
 44.2125 +			goto fail;
 44.2126 +	}
 44.2127 +	copy = tdb_open(outfile, 0, 0, O_RDWR, 0);
 44.2128 +	if (!copy)
 44.2129 +		goto fail;
 44.2130 +	close(fd);
 44.2131 +	return copy;
 44.2132 +
 44.2133 +fail:
 44.2134 +	saved_errno = errno;
 44.2135 +	close(fd);
 44.2136 +	unlink(outfile);
 44.2137 +	errno = saved_errno;
 44.2138 +	return NULL;
 44.2139 +}
 44.2140 +
 44.2141 +/* reopen all tdb's */
 44.2142 +int tdb_reopen_all(void)
 44.2143 +{
 44.2144 +	TDB_CONTEXT *tdb;
 44.2145 +
 44.2146 +	for (tdb=tdbs; tdb; tdb = tdb->next) {
 44.2147 +		/* Ensure no clear-if-first. */
 44.2148 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 44.2149 +		if (tdb_reopen(tdb) != 0)
 44.2150 +			return -1;
 44.2151 +	}
 44.2152 +
 44.2153 +	return 0;
 44.2154 +}
    45.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    45.2 +++ b/tools/xenstore/tdb.h	Thu Sep 29 16:22:02 2005 -0600
    45.3 @@ -0,0 +1,157 @@
    45.4 +#ifndef __TDB_H__
    45.5 +#define __TDB_H__
    45.6 +
    45.7 +/* 
    45.8 +   Unix SMB/CIFS implementation.
    45.9 +
   45.10 +   trivial database library
   45.11 +
   45.12 +   Copyright (C) Andrew Tridgell 1999-2004
   45.13 +   
   45.14 +     ** NOTE! The following LGPL license applies to the tdb
   45.15 +     ** library. This does NOT imply that all of Samba is released
   45.16 +     ** under the LGPL
   45.17 +   
   45.18 +   This library is free software; you can redistribute it and/or
   45.19 +   modify it under the terms of the GNU Lesser General Public
   45.20 +   License as published by the Free Software Foundation; either
   45.21 +   version 2 of the License, or (at your option) any later version.
   45.22 +
   45.23 +   This library is distributed in the hope that it will be useful,
   45.24 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   45.25 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   45.26 +   Lesser General Public License for more details.
   45.27 +
   45.28 +   You should have received a copy of the GNU Lesser General Public
   45.29 +   License along with this library; if not, write to the Free Software
   45.30 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   45.31 +*/
   45.32 +
   45.33 +#ifdef  __cplusplus
   45.34 +extern "C" {
   45.35 +#endif
   45.36 +
   45.37 +
   45.38 +/* flags to tdb_store() */
   45.39 +#define TDB_REPLACE 1
   45.40 +#define TDB_INSERT 2
   45.41 +#define TDB_MODIFY 3
   45.42 +
   45.43 +/* flags for tdb_open() */
   45.44 +#define TDB_DEFAULT 0 /* just a readability place holder */
   45.45 +#define TDB_CLEAR_IF_FIRST 1
   45.46 +#define TDB_INTERNAL 2 /* don't store on disk */
   45.47 +#define TDB_NOLOCK   4 /* don't do any locking */
   45.48 +#define TDB_NOMMAP   8 /* don't use mmap */
   45.49 +#define TDB_CONVERT 16 /* convert endian (internal use) */
   45.50 +#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
   45.51 +
   45.52 +#define TDB_ERRCODE(code, ret) ((tdb->ecode = (code)), ret)
   45.53 +
   45.54 +/* error codes */
   45.55 +enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
   45.56 +		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
   45.57 +		TDB_ERR_NOEXIST};
   45.58 +
   45.59 +#ifndef u32
   45.60 +#define u32 unsigned
   45.61 +#endif
   45.62 +
   45.63 +typedef struct TDB_DATA {
   45.64 +	char *dptr;
   45.65 +	size_t dsize;
   45.66 +} TDB_DATA;
   45.67 +
   45.68 +typedef u32 tdb_len;
   45.69 +typedef u32 tdb_off;
   45.70 +
   45.71 +/* this is stored at the front of every database */
   45.72 +struct tdb_header {
   45.73 +	char magic_food[32]; /* for /etc/magic */
   45.74 +	u32 version; /* version of the code */
   45.75 +	u32 hash_size; /* number of hash entries */
   45.76 +	tdb_off rwlocks;
   45.77 +	tdb_off reserved[31];
   45.78 +};
   45.79 +
   45.80 +struct tdb_lock_type {
   45.81 +	u32 count;
   45.82 +	u32 ltype;
   45.83 +};
   45.84 +
   45.85 +struct tdb_traverse_lock {
   45.86 +	struct tdb_traverse_lock *next;
   45.87 +	u32 off;
   45.88 +	u32 hash;
   45.89 +};
   45.90 +
   45.91 +#ifndef PRINTF_ATTRIBUTE
   45.92 +#define PRINTF_ATTRIBUTE(a,b)
   45.93 +#endif
   45.94 +
   45.95 +/* this is the context structure that is returned from a db open */
   45.96 +typedef struct tdb_context {
   45.97 +	char *name; /* the name of the database */
   45.98 +	void *map_ptr; /* where it is currently mapped */
   45.99 +	int fd; /* open file descriptor for the database */
  45.100 +	tdb_len map_size; /* how much space has been mapped */
  45.101 +	int read_only; /* opened read-only */
  45.102 +	struct tdb_lock_type *locked; /* array of chain locks */
  45.103 +	enum TDB_ERROR ecode; /* error code for last tdb error */
  45.104 +	struct tdb_header header; /* a cached copy of the header */
  45.105 +	u32 flags; /* the flags passed to tdb_open */
  45.106 +	struct tdb_traverse_lock travlocks; /* current traversal locks */
  45.107 +	struct tdb_context *next; /* all tdbs to avoid multiple opens */
  45.108 +	dev_t device;	/* uniquely identifies this tdb */
  45.109 +	ino_t inode;	/* uniquely identifies this tdb */
  45.110 +	void (*log_fn)(struct tdb_context *tdb, int level, const char *, ...) PRINTF_ATTRIBUTE(3,4); /* logging function */
  45.111 +	u32 (*hash_fn)(TDB_DATA *key);
  45.112 +	int open_flags; /* flags used in the open - needed by reopen */
  45.113 +} TDB_CONTEXT;
  45.114 +
  45.115 +typedef int (*tdb_traverse_func)(TDB_CONTEXT *, TDB_DATA, TDB_DATA, void *);
  45.116 +typedef void (*tdb_log_func)(TDB_CONTEXT *, int , const char *, ...);
  45.117 +typedef u32 (*tdb_hash_func)(TDB_DATA *key);
  45.118 +
  45.119 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
  45.120 +		      int open_flags, mode_t mode);
  45.121 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
  45.122 +			 int open_flags, mode_t mode,
  45.123 +			 tdb_log_func log_fn,
  45.124 +			 tdb_hash_func hash_fn);
  45.125 +
  45.126 +int tdb_reopen(TDB_CONTEXT *tdb);
  45.127 +int tdb_reopen_all(void);
  45.128 +void tdb_logging_function(TDB_CONTEXT *tdb, tdb_log_func);
  45.129 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb);
  45.130 +const char *tdb_errorstr(TDB_CONTEXT *tdb);
  45.131 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key);
  45.132 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key);
  45.133 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
  45.134 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf);
  45.135 +int tdb_close(TDB_CONTEXT *tdb);
  45.136 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb);
  45.137 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key);
  45.138 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *);
  45.139 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);
  45.140 +int tdb_lockall(TDB_CONTEXT *tdb);
  45.141 +void tdb_unlockall(TDB_CONTEXT *tdb);
  45.142 +
  45.143 +/* Low level locking functions: use with care */
  45.144 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key);
  45.145 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key);
  45.146 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  45.147 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  45.148 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile);
  45.149 +
  45.150 +/* Debug functions. Not used in production. */
  45.151 +void tdb_dump_all(TDB_CONTEXT *tdb);
  45.152 +int tdb_printfreelist(TDB_CONTEXT *tdb);
  45.153 +
  45.154 +extern TDB_DATA tdb_null;
  45.155 +
  45.156 +#ifdef  __cplusplus
  45.157 +}
  45.158 +#endif
  45.159 +
  45.160 +#endif /* tdb.h */
    46.1 --- a/tools/xenstore/testsuite/04rm.test	Thu Sep 29 13:35:13 2005 -0600
    46.2 +++ b/tools/xenstore/testsuite/04rm.test	Thu Sep 29 16:22:02 2005 -0600
    46.3 @@ -6,6 +6,8 @@ rm /dir/test
    46.4  # Create file and remove it
    46.5  write /test contents
    46.6  rm /test
    46.7 +expect tool
    46.8 +dir /
    46.9  
   46.10  # Create directory and remove it.
   46.11  mkdir /dir
   46.12 @@ -15,3 +17,4 @@ rm /dir
   46.13  mkdir /dir
   46.14  write /dir/test contents
   46.15  rm /dir
   46.16 +
    47.1 --- a/tools/xenstore/testsuite/08transaction.slowtest	Thu Sep 29 13:35:13 2005 -0600
    47.2 +++ b/tools/xenstore/testsuite/08transaction.slowtest	Thu Sep 29 16:22:02 2005 -0600
    47.3 @@ -1,21 +1,43 @@
    47.4 -# Test transaction timeouts.  Take a second each.
    47.5 +# Test transaction clashes.
    47.6  
    47.7  mkdir /test
    47.8  write /test/entry1 contents
    47.9  
   47.10 -# Transactions can take as long as the want...
   47.11 -start /test
   47.12 -sleep 1100
   47.13 -rm /test/entry1
   47.14 -commit
   47.15 -dir /test
   47.16 +# Start transaction, do read-only op, transaction succeeds
   47.17 +1 start
   47.18 +1 write /test/entry1 contents2
   47.19 +expect contents
   47.20 +read /test/entry1
   47.21 +1 commit
   47.22 +expect contents2
   47.23 +read /test/entry1
   47.24  
   47.25 -# ... as long as noone is waiting.
   47.26 -1 start /test
   47.27 -notimeout
   47.28 -2 mkdir /test/dir
   47.29 -1 mkdir /test/dir
   47.30 -expect 1:dir
   47.31 -1 dir /test
   47.32 -expect 1: commit failed: Connection timed out
   47.33 +# Start transaction, abort other transaction, transaction succeeds.
   47.34 +1 start
   47.35 +1 write /test/entry1 contents3
   47.36 +start
   47.37 +write /test/entry1 contents
   47.38 +abort
   47.39  1 commit
   47.40 +expect contents3
   47.41 +read /test/entry1
   47.42 +
   47.43 +# Start transaction, do write op, transaction fails
   47.44 +1 start
   47.45 +1 write /test/entry1 contents4
   47.46 +write /test/entry1 contents
   47.47 +expect 1: commit failed: Resource temporarily unavailable
   47.48 +1 commit
   47.49 +expect contents
   47.50 +read /test/entry1
   47.51 +
   47.52 +# Start transaction, do other transaction, transaction fails
   47.53 +1 start
   47.54 +1 write /test/entry1 contents4
   47.55 +start
   47.56 +write /test/entry1 contents5
   47.57 +commit
   47.58 +expect 1: commit failed: Resource temporarily unavailable
   47.59 +1 commit
   47.60 +expect contents5
   47.61 +read /test/entry1
    48.1 --- a/tools/xenstore/testsuite/08transaction.test	Thu Sep 29 13:35:13 2005 -0600
    48.2 +++ b/tools/xenstore/testsuite/08transaction.test	Thu Sep 29 16:22:02 2005 -0600
    48.3 @@ -3,7 +3,7 @@
    48.4  mkdir /test
    48.5  
    48.6  # Simple transaction: create a file inside transaction.
    48.7 -1 start /test
    48.8 +1 start
    48.9  1 write /test/entry1 contents
   48.10  2 dir /test
   48.11  expect 1:entry1
   48.12 @@ -15,7 +15,7 @@ 2 read /test/entry1
   48.13  rm /test/entry1
   48.14  
   48.15  # Create a file and abort transaction.
   48.16 -1 start /test
   48.17 +1 start
   48.18  1 write /test/entry1 contents
   48.19  2 dir /test
   48.20  expect 1:entry1
   48.21 @@ -25,7 +25,7 @@ 2 dir /test
   48.22  
   48.23  write /test/entry1 contents
   48.24  # Delete in transaction, commit
   48.25 -1 start /test
   48.26 +1 start
   48.27  1 rm /test/entry1
   48.28  expect 2:entry1
   48.29  2 dir /test
   48.30 @@ -35,7 +35,7 @@ 2 dir /test
   48.31  
   48.32  # Delete in transaction, abort.
   48.33  write /test/entry1 contents
   48.34 -1 start /test
   48.35 +1 start
   48.36  1 rm /test/entry1
   48.37  expect 2:entry1
   48.38  2 dir /test
   48.39 @@ -47,7 +47,7 @@ 2 dir /test
   48.40  # Events inside transactions don't trigger watches until (successful) commit.
   48.41  mkdir /test/dir
   48.42  1 watch /test token
   48.43 -2 start /test
   48.44 +2 start
   48.45  2 mkdir /test/dir/sub
   48.46  expect 1: waitwatch failed: Connection timed out
   48.47  1 waitwatch
   48.48 @@ -55,7 +55,7 @@ 2 close
   48.49  1 close
   48.50  
   48.51  1 watch /test token
   48.52 -2 start /test
   48.53 +2 start
   48.54  2 mkdir /test/dir/sub
   48.55  2 abort
   48.56  expect 1: waitwatch failed: Connection timed out
   48.57 @@ -63,7 +63,7 @@ 1 waitwatch
   48.58  1 close
   48.59  
   48.60  1 watch /test token
   48.61 -2 start /test
   48.62 +2 start
   48.63  2 mkdir /test/dir/sub
   48.64  2 commit
   48.65  expect 1:/test/dir/sub:token
   48.66 @@ -73,7 +73,7 @@ 1 close
   48.67  
   48.68  # Rm inside transaction works like rm outside: children get notified.
   48.69  1 watch /test/dir/sub token
   48.70 -2 start /test
   48.71 +2 start
   48.72  2 rm /test/dir
   48.73  2 commit
   48.74  expect 1:/test/dir/sub:token
   48.75 @@ -83,7 +83,7 @@ 1 close
   48.76  
   48.77  # Multiple events from single transaction don't trigger assert
   48.78  1 watch /test token
   48.79 -2 start /test
   48.80 +2 start
   48.81  2 write /test/1 contents
   48.82  2 write /test/2 contents
   48.83  2 commit
    49.1 --- a/tools/xenstore/testsuite/12readonly.test	Thu Sep 29 13:35:13 2005 -0600
    49.2 +++ b/tools/xenstore/testsuite/12readonly.test	Thu Sep 29 16:22:02 2005 -0600
    49.3 @@ -13,23 +13,23 @@ expect 0 READ
    49.4  getperm /test
    49.5  watch /test token
    49.6  unwatch /test token 
    49.7 -start /
    49.8 +start
    49.9  commit
   49.10 -start /
   49.11 +start
   49.12  abort
   49.13  
   49.14  # These don't work
   49.15 -expect write failed: Read-only file system
   49.16 +expect write failed: Permission denied
   49.17  write /test2 contents
   49.18 -expect write failed: Read-only file system
   49.19 +expect write failed: Permission denied
   49.20  write /test contents
   49.21 -expect setperm failed: Read-only file system
   49.22 +expect setperm failed: Permission denied
   49.23  setperm /test 100 NONE
   49.24 -expect setperm failed: Read-only file system
   49.25 +expect setperm failed: Permission denied
   49.26  setperm /test 100 NONE
   49.27 -expect shutdown failed: Read-only file system
   49.28 +expect shutdown failed: Permission denied
   49.29  shutdown
   49.30 -expect introduce failed: Read-only file system
   49.31 +expect introduce failed: Permission denied
   49.32  introduce 1 100 7 /home
   49.33  
   49.34  # Check that watches work like normal.
    50.1 --- a/tools/xenstore/testsuite/14complexperms.test	Thu Sep 29 13:35:13 2005 -0600
    50.2 +++ b/tools/xenstore/testsuite/14complexperms.test	Thu Sep 29 16:22:02 2005 -0600
    50.3 @@ -33,14 +33,6 @@ unwatch /dir/file token
    50.4  expect *No such file or directory
    50.5  unwatch /dir/file token 
    50.6  expect *Permission denied
    50.7 -start /dir/file
    50.8 -expect *No such file or directory
    50.9 -abort
   50.10 -expect *Permission denied
   50.11 -start /dir/file
   50.12 -expect *No such file or directory
   50.13 -commit
   50.14 -expect *Permission denied
   50.15  introduce 2 100 7 /dir/file
   50.16  
   50.17  # Now it exists
   50.18 @@ -73,12 +65,4 @@ unwatch /dir/file token
   50.19  expect *No such file or directory
   50.20  unwatch /dir/file token 
   50.21  expect *Permission denied
   50.22 -start /dir/file
   50.23 -expect *No such file or directory
   50.24 -abort
   50.25 -expect *Permission denied
   50.26 -start /dir/file
   50.27 -expect *No such file or directory
   50.28 -commit
   50.29 -expect *Permission denied
   50.30  introduce 2 100 7 /dir/file
    51.1 --- a/tools/xenstore/testsuite/16block-watch-crash.test	Thu Sep 29 13:35:13 2005 -0600
    51.2 +++ b/tools/xenstore/testsuite/16block-watch-crash.test	Thu Sep 29 16:22:02 2005 -0600
    51.3 @@ -1,13 +1,14 @@
    51.4  # Test case where blocked connection gets sent watch.
    51.5  
    51.6 -mkdir /test
    51.7 -watch /test token
    51.8 -1 start /test
    51.9 -# This will block on above
   51.10 -noackwrite /test/entry contents
   51.11 -1 write /test/entry2 contents
   51.12 -1 commit
   51.13 -readack
   51.14 -expect /test/entry2:token
   51.15 -waitwatch
   51.16 -ackwatch token
   51.17 +# FIXME: We no longer block connections, so this test is disabled.
   51.18 +# mkdir /test
   51.19 +# watch /test token
   51.20 +# 1 start
   51.21 +# # This will block on above
   51.22 +# noackwrite /test/entry contents
   51.23 +# 1 write /test/entry2 contents
   51.24 +# 1 commit
   51.25 +# readack
   51.26 +# expect /test/entry2:token
   51.27 +# waitwatch
   51.28 +# ackwatch token
    52.1 --- a/tools/xenstore/xenstore_client.c	Thu Sep 29 13:35:13 2005 -0600
    52.2 +++ b/tools/xenstore/xenstore_client.c	Thu Sep 29 16:22:02 2005 -0600
    52.3 @@ -14,6 +14,7 @@
    52.4  #include <stdlib.h>
    52.5  #include <string.h>
    52.6  #include <xs.h>
    52.7 +#include <errno.h>
    52.8  
    52.9  static void
   52.10  usage(const char *progname)
   52.11 @@ -82,8 +83,8 @@ main(int argc, char **argv)
   52.12      }
   52.13  #endif
   52.14  
   52.15 -    /* XXX maybe find longest common prefix */
   52.16 -    success = xs_transaction_start(xsh, "/");
   52.17 +  again:
   52.18 +    success = xs_transaction_start(xsh);
   52.19      if (!success)
   52.20  	errx(1, "couldn't start transaction");
   52.21  
   52.22 @@ -145,8 +146,10 @@ main(int argc, char **argv)
   52.23  
   52.24   out:
   52.25      success = xs_transaction_end(xsh, ret ? true : false);
   52.26 -    if (!success)
   52.27 +    if (!success) {
   52.28 +	if (ret == 0 && errno == EAGAIN)
   52.29 +	    goto again;
   52.30  	errx(1, "couldn't end transaction");
   52.31 -
   52.32 +    }
   52.33      return ret;
   52.34  }
    53.1 --- a/tools/xenstore/xenstored.h	Thu Sep 29 13:35:13 2005 -0600
    53.2 +++ b/tools/xenstore/xenstored.h	Thu Sep 29 16:22:02 2005 -0600
    53.3 @@ -75,7 +75,7 @@ static struct xsd_errors xsd_errors[] __
    53.4  	XSD_ERROR(ENOSYS),
    53.5  	XSD_ERROR(EROFS),
    53.6  	XSD_ERROR(EBUSY),
    53.7 -	XSD_ERROR(ETIMEDOUT),
    53.8 +	XSD_ERROR(EAGAIN),
    53.9  	XSD_ERROR(EISCONN),
   53.10  };
   53.11  struct xsd_sockmsg
    54.1 --- a/tools/xenstore/xenstored_core.c	Thu Sep 29 13:35:13 2005 -0600
    54.2 +++ b/tools/xenstore/xenstored_core.c	Thu Sep 29 16:22:02 2005 -0600
    54.3 @@ -50,10 +50,12 @@
    54.4  #include "xenstored_transaction.h"
    54.5  #include "xenstored_domain.h"
    54.6  #include "xenctrl.h"
    54.7 +#include "tdb.h"
    54.8  
    54.9  static bool verbose;
   54.10  LIST_HEAD(connections);
   54.11  static int tracefd = -1;
   54.12 +static TDB_CONTEXT *tdb_ctx;
   54.13  
   54.14  #ifdef TESTING
   54.15  static bool failtest = false;
   54.16 @@ -126,6 +128,23 @@ void __attribute__((noreturn)) corrupt(s
   54.17  	_exit(2);
   54.18  }
   54.19  
   54.20 +TDB_CONTEXT *tdb_context(struct connection *conn)
   54.21 +{
   54.22 +	/* Setup code (manual_node) passes conn == NULL: use the main db. */
   54.23 +	if (conn && conn->transaction)
   54.24 +		return tdb_transaction_context(conn->transaction);
   54.25 +	return tdb_ctx;
   54.26 +}
   54.27 +
   54.28 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
   54.29 +{
   54.30 +	if (rename(newname, xs_daemon_tdb()))	/* nonzero: rename failed */
   54.31 +		return false;			/* current db stays in use */
   54.32 +	tdb_close(tdb_ctx);			/* drop the replaced handle */
   54.33 +	tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
   54.34 +	return true;
   54.35 +}
   54.36 +
   54.37  static char *sockmsg_string(enum xsd_sockmsg_type type)
   54.38  {
   54.39  	switch (type) {
   54.40 @@ -202,37 +221,6 @@ void trace_destroy(const void *data, con
   54.41  	write(tracefd, string, strlen(string));
   54.42  }
   54.43  
   54.44 -void trace_watch_timeout(const struct connection *conn, const char *node, const char *token)
   54.45 -{
   54.46 -	char string[64];
   54.47 -	if (tracefd < 0)
   54.48 -		return;
   54.49 -	write(tracefd, "WATCH_TIMEOUT ", strlen("WATCH_TIMEOUT "));
   54.50 -	sprintf(string, " %p ", conn);
   54.51 -	write(tracefd, string, strlen(string));
   54.52 -	write(tracefd, " (", 2);
   54.53 -	write(tracefd, node, strlen(node));
   54.54 -	write(tracefd, " ", 1);
   54.55 -	write(tracefd, token, strlen(token));
   54.56 -	write(tracefd, ")\n", 2);
   54.57 -}
   54.58 -
   54.59 -static void trace_blocked(const struct connection *conn,
   54.60 -			  const struct buffered_data *data)
   54.61 -{
   54.62 -	char string[64];
   54.63 -
   54.64 -	if (tracefd < 0)
   54.65 -		return;
   54.66 -
   54.67 -	write(tracefd, "BLOCKED", strlen("BLOCKED"));
   54.68 -	sprintf(string, " %p (", conn);
   54.69 -	write(tracefd, string, strlen(string));
   54.70 -	write(tracefd, sockmsg_string(data->hdr.msg.type),
   54.71 -	      strlen(sockmsg_string(data->hdr.msg.type)));
   54.72 -	write(tracefd, ")\n", 2);
   54.73 -}
   54.74 -
   54.75  void trace(const char *fmt, ...)
   54.76  {
   54.77  	va_list arglist;
   54.78 @@ -253,7 +241,6 @@ static bool write_message(struct connect
   54.79  	int ret;
   54.80  	struct buffered_data *out = conn->out;
   54.81  
   54.82 -	assert(conn->state != BLOCKED);
   54.83  	if (out->inhdr) {
   54.84  		if (verbose)
   54.85  			xprintf("Writing msg %s (%s) out to %p\n",
   54.86 @@ -351,24 +338,6 @@ static int initialize_set(fd_set *inset,
   54.87  	return max;
   54.88  }
   54.89  
   54.90 -/* Read everything from a talloc_open'ed fd. */
   54.91 -void *read_all(int *fd, unsigned int *size)
   54.92 -{
   54.93 -	unsigned int max = 4;
   54.94 -	int ret;
   54.95 -	void *buffer = talloc_size(fd, max);
   54.96 -
   54.97 -	*size = 0;
   54.98 -	while ((ret = read(*fd, buffer + *size, max - *size)) > 0) {
   54.99 -		*size += ret;
  54.100 -		if (*size == max)
  54.101 -			buffer = talloc_realloc_size(fd, buffer, max *= 2);
  54.102 -	}
  54.103 -	if (ret < 0)
  54.104 -		return NULL;
  54.105 -	return buffer;
  54.106 -}
  54.107 -
  54.108  static int destroy_fd(void *_fd)
  54.109  {
  54.110  	int *fd = _fd;
  54.111 @@ -409,42 +378,167 @@ bool is_child(const char *child, const c
  54.112  	return child[len] == '/' || child[len] == '\0';
  54.113  }
  54.114  
  54.115 -/* Answer never ends in /. */
  54.116 -char *node_dir_outside_transaction(const char *node)
  54.117 +/* If it fails, returns NULL and sets errno. */
  54.118 +static struct node *read_node(struct connection *conn, const char *name)
  54.119  {
  54.120 -	if (streq(node, "/"))
  54.121 -		return talloc_strdup(node, xs_daemon_store());
  54.122 -	return talloc_asprintf(node, "%s%s", xs_daemon_store(), node);
  54.123 -}
  54.124 +	TDB_DATA key, data;
  54.125 +	u32 *p;
  54.126 +	struct node *node;
  54.127  
  54.128 -static char *node_dir(struct transaction *trans, const char *node)
  54.129 -{
  54.130 -	if (!trans || !within_transaction(trans, node))
  54.131 -		return node_dir_outside_transaction(node);
  54.132 -	return node_dir_inside_transaction(trans, node);
  54.133 +	key.dptr = (void *)name;
  54.134 +	key.dsize = strlen(name);
  54.135 +	data = tdb_fetch(tdb_context(conn), key);
  54.136 +
  54.137 +	if (data.dptr == NULL) {
  54.138 +		if (tdb_error(tdb_context(conn)) == TDB_ERR_NOEXIST)
  54.139 +			errno = ENOENT;
  54.140 +		else
  54.141 +			errno = EIO;
  54.142 +		return NULL;
  54.143 +	}
  54.144 +
  54.145 +	node = talloc(name, struct node);
  54.146 +	node->name = talloc_strdup(node, name);
  54.147 +	node->parent = NULL;
  54.148 +	node->tdb = tdb_context(conn);
  54.149 +	talloc_steal(node, data.dptr);
  54.150 +
  54.151 +	/* Header is three u32s: number of permissions, datalen, childlen. */
  54.152 +	p = (u32 *)data.dptr;
  54.153 +	node->num_perms = p[0];
  54.154 +	node->datalen = p[1];
  54.155 +	node->childlen = p[2];
  54.156 +
  54.157 +	/* Permissions are struct xs_permissions. */
  54.158 +	node->perms = (void *)&p[3];
  54.159 +	/* Data is binary blob (usually ascii, no nul). */
  54.160 +	node->data = node->perms + node->num_perms;
  54.161 +	/* Children is strings, nul separated. */
  54.162 +	node->children = node->data + node->datalen;
  54.163 +
  54.164 +	return node;
  54.165  }
  54.166  
  54.167 -static char *datafile(const char *dir)
  54.168 +static bool write_node(struct connection *conn, const struct node *node)
  54.169  {
  54.170 -	return talloc_asprintf(dir, "%s/.data", dir);
  54.171 -}
  54.172 +	TDB_DATA key, data;
  54.173 +	void *p;
  54.174  
  54.175 -static char *node_datafile(struct transaction *trans, const char *node)
  54.176 -{
  54.177 -	return datafile(node_dir(trans, node));
  54.178 +	key.dptr = (void *)node->name;
  54.179 +	key.dsize = strlen(node->name);
  54.180 +
  54.181 +	data.dsize = 3*sizeof(u32)
  54.182 +		+ node->num_perms*sizeof(node->perms[0])
  54.183 +		+ node->datalen + node->childlen;
  54.184 +	data.dptr = talloc_size(node, data.dsize);
  54.185 +	((u32 *)data.dptr)[0] = node->num_perms;
  54.186 +	((u32 *)data.dptr)[1] = node->datalen;
  54.187 +	((u32 *)data.dptr)[2] = node->childlen;
  54.188 +	p = data.dptr + 3 * sizeof(u32);
  54.189 +
  54.190 +	memcpy(p, node->perms, node->num_perms*sizeof(node->perms[0]));
  54.191 +	p += node->num_perms*sizeof(node->perms[0]);
  54.192 +	memcpy(p, node->data, node->datalen);
  54.193 +	p += node->datalen;
  54.194 +	memcpy(p, node->children, node->childlen);
  54.195 +
  54.196 +	/* TDB should set errno, but doesn't even set ecode AFAICT. */
  54.197 +	if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
  54.198 +		errno = ENOSPC;
  54.199 +		return false;
  54.200 +	}
  54.201 +	return true;
  54.202  }
  54.203  
  54.204 -static char *permfile(const char *dir)
  54.205 +static enum xs_perm_type perm_for_conn(struct connection *conn,
  54.206 +				       struct xs_permissions *perms,
  54.207 +				       unsigned int num)
  54.208  {
  54.209 -	return talloc_asprintf(dir, "%s/.perms", dir);
  54.210 +	unsigned int i;
  54.211 +	enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  54.212 +
  54.213 +	if (!conn->can_write)
  54.214 +		mask &= ~XS_PERM_WRITE;
  54.215 +
  54.216 +	/* Owners and tools get it all... */
  54.217 +	if (!conn->id || perms[0].id == conn->id)
  54.218 +		return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
  54.219 +
  54.220 +	for (i = 1; i < num; i++)
  54.221 +		if (perms[i].id == conn->id)
  54.222 +			return perms[i].perms & mask;
  54.223 +
  54.224 +	return perms[0].perms & mask;
  54.225  }
  54.226  
  54.227 -static char *node_permfile(struct transaction *trans, const char *node)
  54.228 +static char *get_parent(const char *node)
  54.229  {
  54.230 -	return permfile(node_dir(trans, node));
  54.231 +	char *slash = strrchr(node + 1, '/');
  54.232 +	if (!slash)
  54.233 +		return talloc_strdup(node, "/");
  54.234 +	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  54.235  }
  54.236  
  54.237 -struct buffered_data *new_buffer(void *ctx)
  54.238 +/* Permissions granted by the nearest ancestor of name that can be read. */
  54.239 +static enum xs_perm_type ask_parents(struct connection *conn, const char *name)
  54.240 +{
  54.241 +	struct node *ancestor = NULL;
  54.242 +
  54.243 +	/* Climb one level at a time; "/" is the last candidate tried. */
  54.244 +	while (!ancestor) {
  54.245 +		name = get_parent(name);
  54.246 +		ancestor = read_node(conn, name);
  54.247 +		if (!ancestor && streq(name, "/"))
  54.248 +			break;
  54.249 +	}
  54.250 +
  54.251 +	/* Not even the root could be read: report corruption (no return). */
  54.252 +	if (!ancestor)
  54.253 +		corrupt(conn, "No permissions file at root");
  54.254 +	return perm_for_conn(conn, ancestor->perms, ancestor->num_perms);
  54.255 +}
  54.256 +
  54.257 +/* Access to a node can be granted without granting access to its
  54.258 + * parents.  When an operation on node fails, errnum is only passed on
  54.259 + * if ask_parents() grants some bit of perm; otherwise EACCES is
  54.260 + * returned so the errno reveals nothing about the node. */
  54.261 +static int errno_from_parents(struct connection *conn, const char *node,
  54.262 +			      int errnum, enum xs_perm_type perm)
  54.263 +{
  54.264 +	enum xs_perm_type granted;
  54.265 +
  54.266 +	/* Out-of-memory is always reported as such. */
  54.267 +	if (errnum == ENOMEM)
  54.268 +		return ENOMEM;
  54.269 +	granted = ask_parents(conn, node);
  54.270 +	return (granted & perm) ? errnum : EACCES;
  54.271 +}
  54.272 +
  54.273 +/* Fetch name requiring all of perm; on failure returns NULL, sets errno. */
  54.274 +struct node *get_node(struct connection *conn,
  54.275 +		      const char *name,
  54.276 +		      enum xs_perm_type perm)
  54.277 +{
  54.278 +	struct node *node;
  54.279 +
  54.280 +	if (!name || !is_valid_nodename(name)) {
  54.281 +		errno = EINVAL;
  54.282 +		return NULL;
  54.283 +	}
  54.284 +	node = read_node(conn, name);
  54.285 +	/* No permission means no node; a successful read left errno stale. */
  54.286 +	if (node && (perm_for_conn(conn, node->perms, node->num_perms) & perm)
  54.287 +	    != perm) {
  54.288 +		errno = EACCES;
  54.289 +		node = NULL;
  54.290 +	}
  54.291 +	/* Clean up errno if they weren't supposed to know. */
  54.292 +	if (!node)
  54.293 +		errno = errno_from_parents(conn, name, errno, perm);
  54.294 +	return node;
  54.295 +}
  54.296 +
  54.297 +static struct buffered_data *new_buffer(void *ctx)
  54.298  {
  54.299  	struct buffered_data *data;
  54.300  
  54.301 @@ -457,7 +551,8 @@ struct buffered_data *new_buffer(void *c
  54.302  }
  54.303  
  54.304  /* Return length of string (including nul) at this offset. */
  54.305 -unsigned int get_string(const struct buffered_data *data, unsigned int offset)
  54.306 +static unsigned int get_string(const struct buffered_data *data,
  54.307 +			       unsigned int offset)
  54.308  {
  54.309  	const char *nul;
  54.310  
  54.311 @@ -508,7 +603,6 @@ void send_reply(struct connection *conn,
  54.312  		conn->waiting_reply = bdata;
  54.313  	} else
  54.314  		conn->out = bdata;
  54.315 -	assert(conn->state != BLOCKED);
  54.316  	conn->state = BUSY;
  54.317  }
  54.318  
  54.319 @@ -567,29 +661,6 @@ static const char *onearg(struct buffere
  54.320  	return in->buffer;
  54.321  }
  54.322  
  54.323 -/* If it fails, returns NULL and sets errno. */
  54.324 -static struct xs_permissions *get_perms(const char *dir, unsigned int *num)
  54.325 -{
  54.326 -	unsigned int size;
  54.327 -	char *strings;
  54.328 -	struct xs_permissions *ret;
  54.329 -	int *fd;
  54.330 -
  54.331 -	fd = talloc_open(permfile(dir), O_RDONLY, 0);
  54.332 -	if (!fd)
  54.333 -		return NULL;
  54.334 -	strings = read_all(fd, &size);
  54.335 -	if (!strings)
  54.336 -		return NULL;
  54.337 -
  54.338 -	*num = xs_count_strings(strings, size);
  54.339 -	ret = talloc_array(dir, struct xs_permissions, *num);
  54.340 -	if (!xs_strings_to_perms(ret, *num, strings))
  54.341 -		corrupt(NULL, "Permissions corrupt for %s", dir);
  54.342 -
  54.343 -	return ret;
  54.344 -}
  54.345 -
  54.346  static char *perms_to_strings(const void *ctx,
  54.347  			      struct xs_permissions *perms, unsigned int num,
  54.348  			      unsigned int *len)
  54.349 @@ -610,173 +681,6 @@ static char *perms_to_strings(const void
  54.350  	return strings;
  54.351  }
  54.352  
  54.353 -/* Destroy this, and its children, and its children's children. */
  54.354 -int destroy_path(void *path)
  54.355 -{
  54.356 -	DIR *dir;
  54.357 -	struct dirent *dirent;
  54.358 -
  54.359 -	dir = opendir(path);
  54.360 -	if (!dir) {
  54.361 -		if (unlink(path) == 0 || errno == ENOENT)
  54.362 -			return 0;
  54.363 -		corrupt(NULL, "Destroying path %s", path);
  54.364 -	}
  54.365 -
  54.366 -	while ((dirent = readdir(dir)) != NULL) {
  54.367 -		char fullpath[strlen(path) + 1 + strlen(dirent->d_name) + 1];
  54.368 -		sprintf(fullpath, "%s/%s", (char *)path, dirent->d_name);
  54.369 -		if (!streq(dirent->d_name,".") && !streq(dirent->d_name,".."))
  54.370 -			destroy_path(fullpath);
  54.371 -	}
  54.372 -	closedir(dir);
  54.373 -	if (rmdir(path) != 0)
  54.374 -		corrupt(NULL, "Destroying directory %s", path);
  54.375 -	return 0;
  54.376 -}
  54.377 -
  54.378 -/* Create a self-destructing temporary path */
  54.379 -static char *temppath(const char *path)
  54.380 -{
  54.381 -	char *tmppath = talloc_asprintf(path, "%s.tmp", path);
  54.382 -	talloc_set_destructor(tmppath, destroy_path);
  54.383 -	return tmppath;
  54.384 -}
  54.385 -
  54.386 -/* Create a self-destructing temporary file */
  54.387 -static char *tempfile(const char *path, void *contents, unsigned int len)
  54.388 -{
  54.389 -	int *fd;
  54.390 -	char *tmppath = temppath(path);
  54.391 -
  54.392 -	fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  54.393 -	if (!fd)
  54.394 -		return NULL;
  54.395 -	if (!xs_write_all(*fd, contents, len))
  54.396 -		return NULL;
  54.397 -
  54.398 -	return tmppath;
  54.399 -}
  54.400 -
  54.401 -static int destroy_opendir(void *_dir)
  54.402 -{
  54.403 -	DIR **dir = _dir;
  54.404 -	closedir(*dir);
  54.405 -	return 0;
  54.406 -}
  54.407 -
  54.408 -/* Return a pointer to a DIR*, self-closing and attached to this pathname. */
  54.409 -DIR **talloc_opendir(const char *pathname)
  54.410 -{
  54.411 -	DIR **dir;
  54.412 -
  54.413 -	dir = talloc(pathname, DIR *);
  54.414 -	*dir = opendir(pathname);
  54.415 -	if (!*dir) {
  54.416 -		int saved_errno = errno;
  54.417 -		talloc_free(dir);
  54.418 -		errno = saved_errno;
  54.419 -		return NULL;
  54.420 -	}
  54.421 -	talloc_set_destructor(dir, destroy_opendir);
  54.422 -	return dir;
  54.423 -}
  54.424 -
  54.425 -/* We assume rename() doesn't fail on moves in same dir. */
  54.426 -static void commit_tempfile(const char *path)
  54.427 -{
  54.428 -	char realname[strlen(path) + 1];
  54.429 -	unsigned int len = strrchr(path, '.') - path;
  54.430 -
  54.431 -	memcpy(realname, path, len);
  54.432 -	realname[len] = '\0';
  54.433 -	if (rename(path, realname) != 0)
  54.434 -		corrupt(NULL, "Committing %s", realname);
  54.435 -	talloc_set_destructor(path, NULL);
  54.436 -}
  54.437 -
  54.438 -static bool set_perms(struct transaction *transaction,
  54.439 -		      const char *node,
  54.440 -		      struct xs_permissions *perms, unsigned int num)
  54.441 -{
  54.442 -	unsigned int len;
  54.443 -	char *permpath, *strings;
  54.444 -
  54.445 -	strings = perms_to_strings(node, perms, num, &len);
  54.446 -	if (!strings)
  54.447 -		return false;
  54.448 -
  54.449 -	/* Create then move. */
  54.450 -	permpath = tempfile(node_permfile(transaction, node), strings, len);
  54.451 -	if (!permpath)
  54.452 -		return false;
  54.453 -
  54.454 -	commit_tempfile(permpath);
  54.455 -	return true;
  54.456 -}
  54.457 -
  54.458 -static char *get_parent(const char *node)
  54.459 -{
  54.460 -	char *slash = strrchr(node + 1, '/');
  54.461 -	if (!slash)
  54.462 -		return talloc_strdup(node, "/");
  54.463 -	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  54.464 -}
  54.465 -
  54.466 -static enum xs_perm_type perm_for_id(domid_t id,
  54.467 -				     struct xs_permissions *perms,
  54.468 -				     unsigned int num)
  54.469 -{
  54.470 -	unsigned int i;
  54.471 -
  54.472 -	/* Owners and tools get it all... */
  54.473 -	if (!id || perms[0].id == id)
  54.474 -		return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  54.475 -
  54.476 -	for (i = 1; i < num; i++)
  54.477 -		if (perms[i].id == id)
  54.478 -			return perms[i].perms;
  54.479 -
  54.480 -	return perms[0].perms;
  54.481 -}
  54.482 -
  54.483 -/* What do parents say? */
  54.484 -static enum xs_perm_type ask_parents(struct connection *conn,
  54.485 -				     const char *node)
  54.486 -{
  54.487 -	struct xs_permissions *perms;
  54.488 -	unsigned int num;
  54.489 -
  54.490 -	do {
  54.491 -		node = get_parent(node);
  54.492 -		perms = get_perms(node_dir(conn->transaction, node), &num);
  54.493 -		if (perms)
  54.494 -			break;
  54.495 -	} while (!streq(node, "/"));
  54.496 -
  54.497 -	/* No permission at root?  We're in trouble. */
  54.498 -	if (!perms)
  54.499 -		corrupt(conn, "No permissions file at root");
  54.500 -
  54.501 -	return perm_for_id(conn->id, perms, num);
  54.502 -}
  54.503 -
  54.504 -/* We have a weird permissions system.  You can allow someone into a
  54.505 - * specific node without allowing it in the parents.  If it's going to
  54.506 - * fail, however, we don't want the errno to indicate any information
  54.507 - * about the node. */
  54.508 -static int errno_from_parents(struct connection *conn, const char *node,
  54.509 -			      int errnum)
  54.510 -{
  54.511 -	/* We always tell them about memory failures. */
  54.512 -	if (errnum == ENOMEM)
  54.513 -		return errnum;
  54.514 -
  54.515 -	if (ask_parents(conn, node) & XS_PERM_READ)
  54.516 -		return errnum;
  54.517 -	return EACCES;
  54.518 -}
  54.519 -
  54.520  char *canonicalize(struct connection *conn, const char *node)
  54.521  {
  54.522  	const char *prefix;
  54.523 @@ -789,46 +693,6 @@ char *canonicalize(struct connection *co
  54.524  	return (char *)node;
  54.525  }
  54.526  
  54.527 -bool check_node_perms(struct connection *conn, const char *node,
  54.528 -		      enum xs_perm_type perm)
  54.529 -{
  54.530 -	struct xs_permissions *perms;
  54.531 -	unsigned int num;
  54.532 -
  54.533 -	if (!node || !is_valid_nodename(node)) {
  54.534 -		errno = EINVAL;
  54.535 -		return false;
  54.536 -	}
  54.537 -
  54.538 -	if (!conn->can_write && (perm & XS_PERM_WRITE)) {
  54.539 -		errno = EROFS;
  54.540 -		return false;
  54.541 -	}
  54.542 -
  54.543 -	perms = get_perms(node_dir(conn->transaction, node), &num);
  54.544 -
  54.545 -	if (perms) {
  54.546 -		if (perm_for_id(conn->id, perms, num) & perm)
  54.547 -			return true;
  54.548 -		errno = EACCES;
  54.549 -		return false;
  54.550 -	}
  54.551 -
  54.552 -	/* If it's OK not to exist, we consult parents. */
  54.553 -	if (errno == ENOENT && (perm & XS_PERM_ENOENT_OK)) {
  54.554 -		if (ask_parents(conn, node) & perm)
  54.555 -			return true;
  54.556 -		/* Parents say they should not know. */
  54.557 -		errno = EACCES;
  54.558 -		return false;
  54.559 -	}
  54.560 -
  54.561 -	/* They might not have permission to even *see* this node, in
  54.562 -	 * which case we return EACCES even if it's ENOENT or EIO. */
  54.563 -	errno = errno_from_parents(conn, node, errno);
  54.564 -	return false;
  54.565 -}
  54.566 -
  54.567  bool check_event_node(const char *node)
  54.568  {
  54.569  	if (!node || !strstarts(node, "@")) {
  54.570 @@ -838,142 +702,144 @@ bool check_event_node(const char *node)
  54.571  	return true;
  54.572  }
  54.573  
  54.574 -static void send_directory(struct connection *conn, const char *node)
  54.575 +static void send_directory(struct connection *conn, const char *name)
  54.576  {
  54.577 -	char *path, *reply;
  54.578 -	unsigned int reply_len = 0;
  54.579 -	DIR **dir;
  54.580 -	struct dirent *dirent;
  54.581 -
  54.582 -	node = canonicalize(conn, node);
  54.583 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  54.584 -		send_error(conn, errno);
  54.585 -		return;
  54.586 -	}
  54.587 -
  54.588 -	path = node_dir(conn->transaction, node);
  54.589 -	dir = talloc_opendir(path);
  54.590 -	if (!dir) {
  54.591 -		send_error(conn, errno);
  54.592 -		return;
  54.593 -	}
  54.594 -
  54.595 -	reply = talloc_strdup(node, "");
  54.596 -	while ((dirent = readdir(*dir)) != NULL) {
  54.597 -		int len = strlen(dirent->d_name) + 1;
  54.598 +	struct node *node;
  54.599  
  54.600 -		if (!valid_chars(dirent->d_name))
  54.601 -			continue;
  54.602 -
  54.603 -		reply = talloc_realloc(path, reply, char, reply_len + len);
  54.604 -		strcpy(reply + reply_len, dirent->d_name);
  54.605 -		reply_len += len;
  54.606 -	}
  54.607 -
  54.608 -	send_reply(conn, XS_DIRECTORY, reply, reply_len);
  54.609 -}
  54.610 -
  54.611 -static void do_read(struct connection *conn, const char *node)
  54.612 -{
  54.613 -	char *value;
  54.614 -	unsigned int size;
  54.615 -	int *fd;
  54.616 -
  54.617 -	node = canonicalize(conn, node);
  54.618 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  54.619 -		send_error(conn, errno);
  54.620 -		return;
  54.621 -	}
  54.622 -
  54.623 -	fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
  54.624 -	if (!fd) {
  54.625 -		/* Data file doesn't exist?  We call that a directory */
  54.626 -		if (errno == ENOENT)
  54.627 -			errno = EISDIR;
  54.628 +	name = canonicalize(conn, name);
  54.629 +	node = get_node(conn, name, XS_PERM_READ);
  54.630 +	if (!node) {
  54.631  		send_error(conn, errno);
  54.632  		return;
  54.633  	}
  54.634  
  54.635 -	value = read_all(fd, &size);
  54.636 -	if (!value)
  54.637 -		send_error(conn, errno);
  54.638 -	else
  54.639 -		send_reply(conn, XS_READ, value, size);
  54.640 -}
  54.641 -
  54.642 -/* Commit this directory, eg. comitting a/b.tmp/c causes a/b.tmp -> a.b */
  54.643 -static bool commit_dir(char *dir)
  54.644 -{
  54.645 -	char *dot, *slash, *dest;
  54.646 -
  54.647 -	dot = strrchr(dir, '.');
  54.648 -	slash = strchr(dot, '/');
  54.649 -	if (slash)
  54.650 -		*slash = '\0';
  54.651 -
  54.652 -	dest = talloc_asprintf(dir, "%.*s", (int)(dot - dir), dir);
  54.653 -	return rename(dir, dest) == 0;
  54.654 +	send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
  54.655  }
  54.656  
  54.657 -/* Create a temporary directory.  Put data in it (if data != NULL) */
  54.658 -static char *tempdir(struct connection *conn,
  54.659 -		     const char *node, void *data, unsigned int datalen)
  54.660 +static void do_read(struct connection *conn, const char *name)
  54.661  {
  54.662 -	struct xs_permissions *perms;
  54.663 -	char *permstr;
  54.664 -	unsigned int num, len;
  54.665 -	int *fd;
  54.666 -	char *dir;
  54.667 +	struct node *node;
  54.668  
  54.669 -	dir = temppath(node_dir(conn->transaction, node));
  54.670 -	if (mkdir(dir, 0750) != 0) {
  54.671 -		if (errno != ENOENT)
  54.672 -			return NULL;
  54.673 -
  54.674 -		dir = tempdir(conn, get_parent(node), NULL, 0);
  54.675 -		if (!dir)
  54.676 -			return NULL;
  54.677 -
  54.678 -		dir = talloc_asprintf(dir, "%s%s", dir, strrchr(node, '/'));
  54.679 -		if (mkdir(dir, 0750) != 0)
  54.680 -			return NULL;
  54.681 -		talloc_set_destructor(dir, destroy_path);
  54.682 +	name = canonicalize(conn, name);
  54.683 +	node = get_node(conn, name, XS_PERM_READ);
  54.684 +	if (!node) {
  54.685 +		send_error(conn, errno);
  54.686 +		return;
  54.687  	}
  54.688  
  54.689 -	perms = get_perms(get_parent(dir), &num);
  54.690 -	assert(perms);
  54.691 -	/* Domains own what they create. */
  54.692 +	send_reply(conn, XS_READ, node->data, node->datalen);
  54.693 +}
  54.694 +
  54.695 +static void delete_node_single(struct connection *conn, struct node *node)
  54.696 +{
  54.697 +	TDB_DATA key;
  54.698 +
  54.699 +	key.dptr = (void *)node->name;
  54.700 +	key.dsize = strlen(node->name);
  54.701 +
  54.702 +	if (tdb_delete(tdb_context(conn), key) != 0)
  54.703 +		corrupt(conn, "Could not delete '%s'", node->name);
  54.704 +}
  54.705 +
  54.706 +/* Must not be / */
  54.707 +static char *basename(const char *name)
  54.708 +{
  54.709 +	return strrchr(name, '/') + 1;
  54.710 +}
  54.711 +
  54.712 +static struct node *construct_node(struct connection *conn, const char *name)
  54.713 +{
  54.714 +	const char *base;
  54.715 +	unsigned int baselen;
  54.716 +	struct node *parent, *node;
  54.717 +	char *children, *parentname = get_parent(name);
  54.718 +
  54.719 +	/* If parent doesn't exist, create it. */
  54.720 +	parent = read_node(conn, parentname);
  54.721 +	if (!parent)
  54.722 +		parent = construct_node(conn, parentname);
  54.723 +	if (!parent)
  54.724 +		return NULL;
  54.725 +	
  54.726 +	/* Add child to parent. */
  54.727 +	base = basename(name);
  54.728 +	baselen = strlen(base) + 1;
  54.729 +	children = talloc_array(name, char, parent->childlen + baselen);
  54.730 +	memcpy(children, parent->children, parent->childlen);
  54.731 +	memcpy(children + parent->childlen, base, baselen);
  54.732 +	parent->children = children;
  54.733 +	parent->childlen += baselen;
  54.734 +
  54.735 +	/* Allocate node */
  54.736 +	node = talloc(name, struct node);
  54.737 +	node->tdb = tdb_context(conn);
  54.738 +	node->name = talloc_strdup(node, name);
  54.739 +
  54.740 +	/* Inherit permissions, except domains own what they create */
  54.741 +	node->num_perms = parent->num_perms;
  54.742 +	node->perms = talloc_memdup(node, parent->perms,
  54.743 +				    node->num_perms * sizeof(node->perms[0]));
  54.744  	if (conn->id)
  54.745 -		perms->id = conn->id;
  54.746 +		node->perms[0].id = conn->id;
  54.747  
  54.748 -	permstr = perms_to_strings(dir, perms, num, &len);
  54.749 -	fd = talloc_open(permfile(dir), O_WRONLY|O_CREAT|O_EXCL, 0640);
  54.750 -	if (!fd || !xs_write_all(*fd, permstr, len))
  54.751 +	/* No children, no data */
  54.752 +	node->children = node->data = NULL;
  54.753 +	node->childlen = node->datalen = 0;
  54.754 +	node->parent = parent;
  54.755 +	return node;
  54.756 +}
  54.757 +
  54.758 +static int destroy_node(void *_node)
  54.759 +{
  54.760 +	struct node *node = _node;
  54.761 +	TDB_DATA key;
  54.762 +
  54.763 +	if (streq(node->name, "/"))
  54.764 +		corrupt(NULL, "Destroying root node!");
  54.765 +
  54.766 +	key.dptr = (void *)node->name;
  54.767 +	key.dsize = strlen(node->name);
  54.768 +
  54.769 +	tdb_delete(node->tdb, key);
  54.770 +	return 0;
  54.771 +}
  54.772 +
  54.773 +/* Be careful: create hierarchy, put entry in existing parent *last*.
  54.774 + * This helps fsck if we die during this. */
  54.775 +static struct node *create_node(struct connection *conn, 
  54.776 +				const char *name,
  54.777 +				void *data, unsigned int datalen)
  54.778 +{
  54.779 +	struct node *node, *i;
  54.780 +
  54.781 +	node = construct_node(conn, name);
  54.782 +	if (!node)
  54.783  		return NULL;
  54.784  
  54.785 -	if (data) {
  54.786 -		char *datapath = datafile(dir);
  54.787 +	node->data = data;
  54.788 +	node->datalen = datalen;
  54.789  
  54.790 -		fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  54.791 -		if (!fd || !xs_write_all(*fd, data, datalen))
  54.792 +	/* We write the nodes out from the new node up through its parents,
  54.793 +	 * setting a destructor on each in case something goes wrong. */
  54.794 +	for (i = node; i; i = i->parent) {
  54.795 +		if (!write_node(conn, i))
  54.796  			return NULL;
  54.797 +		talloc_set_destructor(i, destroy_node);
  54.798  	}
  54.799 -	return dir;
  54.800 -}
  54.801  
  54.802 -static bool node_exists(struct connection *conn, const char *node)
  54.803 -{
  54.804 -	struct stat st;
  54.805 -
  54.806 -	return lstat(node_dir(conn->transaction, node), &st) == 0;
  54.807 +	/* OK, now remove destructors so they stay around */
  54.808 +	for (i = node; i; i = i->parent)
  54.809 +		talloc_set_destructor(i, NULL);
  54.810 +	return node;
  54.811  }
  54.812  
  54.813  /* path, data... */
  54.814  static void do_write(struct connection *conn, struct buffered_data *in)
  54.815  {
  54.816  	unsigned int offset, datalen;
  54.817 +	struct node *node;
  54.818  	char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
  54.819 -	char *node, *tmppath;
  54.820 +	char *name;
  54.821  
  54.822  	/* Extra "strings" can be created by binary data. */
  54.823  	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) {
  54.824 @@ -981,99 +847,115 @@ static void do_write(struct connection *
  54.825  		return;
  54.826  	}
  54.827  
  54.828 -	node = canonicalize(conn, vec[0]);
  54.829 -	if (!within_transaction(conn->transaction, node)) {
  54.830 -		send_error(conn, EROFS);
  54.831 -		return;
  54.832 -	}
  54.833 -
  54.834 -	if (transaction_block(conn, node))
  54.835 -		return;
  54.836 -
  54.837  	offset = strlen(vec[0]) + 1;
  54.838  	datalen = in->used - offset;
  54.839  
  54.840 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  54.841 -		send_error(conn, errno);
  54.842 -		return;
  54.843 -	}
  54.844 -
  54.845 -	if (!node_exists(conn, node)) {
  54.846 -		char *dir;
  54.847 -
  54.848 -		/* Does not exist... */
  54.849 +	name = canonicalize(conn, vec[0]);
  54.850 +	node = get_node(conn, name, XS_PERM_WRITE);
  54.851 +	if (!node) {
  54.852 +		/* No permissions, invalid input? */
  54.853  		if (errno != ENOENT) {
  54.854  			send_error(conn, errno);
  54.855  			return;
  54.856  		}
  54.857 -
  54.858 -		dir = tempdir(conn, node, in->buffer + offset, datalen);
  54.859 -		if (!dir || !commit_dir(dir)) {
  54.860 +		node = create_node(conn, name, in->buffer + offset, datalen);
  54.861 +		if (!node) {
  54.862  			send_error(conn, errno);
  54.863  			return;
  54.864  		}
  54.865 -		
  54.866  	} else {
  54.867 -		/* Exists... */
  54.868 -		tmppath = tempfile(node_datafile(conn->transaction, node),
  54.869 -				   in->buffer + offset, datalen);
  54.870 -		if (!tmppath) {
  54.871 +		node->data = in->buffer + offset;
  54.872 +		node->datalen = datalen;
  54.873 +		if (!write_node(conn, node)){
  54.874  			send_error(conn, errno);
  54.875  			return;
  54.876  		}
  54.877 -
  54.878 -		commit_tempfile(tmppath);
  54.879  	}
  54.880  
  54.881 -	add_change_node(conn->transaction, node, false);
  54.882 -	fire_watches(conn, node, false);
  54.883 +	add_change_node(conn->transaction, name, false);
  54.884 +	fire_watches(conn, name, false);
  54.885  	send_ack(conn, XS_WRITE);
  54.886  }
  54.887  
  54.888 -static void do_mkdir(struct connection *conn, const char *node)
  54.889 +static void do_mkdir(struct connection *conn, const char *name)
  54.890  {
  54.891 -	char *dir;
  54.892 +	struct node *node;
  54.893  
  54.894 -	node = canonicalize(conn, node);
  54.895 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  54.896 -		send_error(conn, errno);
  54.897 -		return;
  54.898 -	}
  54.899 -
  54.900 -	if (!within_transaction(conn->transaction, node)) {
  54.901 -		send_error(conn, EROFS);
  54.902 -		return;
  54.903 -	}
  54.904 -
  54.905 -	if (transaction_block(conn, node))
  54.906 -		return;
  54.907 +	name = canonicalize(conn, name);
  54.908 +	node = get_node(conn, name, XS_PERM_WRITE);
  54.909  
  54.910  	/* If it already exists, fine. */
  54.911 -	if (node_exists(conn, node)) {
  54.912 -		send_ack(conn, XS_MKDIR);
  54.913 -		return;
  54.914 +	if (!node) {
  54.915 +		/* No permissions? */
  54.916 +		if (errno != ENOENT) {
  54.917 +			send_error(conn, errno);
  54.918 +			return;
  54.919 +		}
  54.920 +		node = create_node(conn, name, NULL, 0);
  54.921 +		if (!node) {
  54.922 +			send_error(conn, errno);
  54.923 +			return;
  54.924 +		}
  54.925 +		add_change_node(conn->transaction, name, false);
  54.926 +		fire_watches(conn, name, false);
  54.927  	}
  54.928 -
  54.929 -	dir = tempdir(conn, node, NULL, 0);
  54.930 -	if (!dir || !commit_dir(dir)) {
  54.931 -		send_error(conn, errno);
  54.932 -		return;
  54.933 -	}
  54.934 -
  54.935 -	add_change_node(conn->transaction, node, false);
  54.936 -	fire_watches(conn, node, false);
  54.937  	send_ack(conn, XS_MKDIR);
  54.938  }
  54.939  
  54.940 -static void do_rm(struct connection *conn, const char *node)
  54.941 +static void delete_node(struct connection *conn, struct node *node)
  54.942  {
  54.943 -	char *tmppath, *path;
  54.944 +	unsigned int i;
  54.945  
  54.946 -	node = canonicalize(conn, node);
  54.947 -	if (!check_node_perms(conn, node, XS_PERM_WRITE)) {
  54.948 +	/* Delete self, then delete children.  If something goes wrong,
  54.949 +	 * consistency check will clean up this way. */
  54.950 +	delete_node_single(conn, node);
  54.951 +
  54.952 +	/* Delete children, too. */
  54.953 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  54.954 +		struct node *child;
  54.955 +
  54.956 +		child = read_node(conn, 
  54.957 +				  talloc_asprintf(node, "%s/%s", node->name,
  54.958 +						  node->children + i));
  54.959 +		if (!child)
  54.960 +			corrupt(conn, "No child '%s' found", child);
  54.961 +		delete_node(conn, child);
  54.962 +	}
  54.963 +}
  54.964 +
  54.965 +/* Delete memory using memmove. */
  54.966 +static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
  54.967 +{
  54.968 +	memmove(mem + off, mem + off + len, total - off - len);
  54.969 +}
  54.970 +
  54.971 +static bool delete_child(struct connection *conn,
  54.972 +			 struct node *node, const char *childname)
  54.973 +{
  54.974 +	unsigned int i;
  54.975 +
  54.976 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  54.977 +		if (streq(node->children+i, childname)) {
  54.978 +			memdel(node->children, i, strlen(childname) + 1,
  54.979 +			       node->childlen);
  54.980 +			node->childlen -= strlen(childname) + 1;
  54.981 +			return write_node(conn, node);
  54.982 +		}
  54.983 +	}
  54.984 +	corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
  54.985 +}
  54.986 +
  54.987 +static void do_rm(struct connection *conn, const char *name)
  54.988 +{
  54.989 +	struct node *node, *parent;
  54.990 +
  54.991 +	name = canonicalize(conn, name);
  54.992 +	node = get_node(conn, name, XS_PERM_WRITE);
  54.993 +	if (!node) {
  54.994  		/* Didn't exist already?  Fine, if parent exists. */
  54.995  		if (errno == ENOENT) {
  54.996 -			if (node_exists(conn, get_parent(node))) {
  54.997 +			node = read_node(conn, get_parent(name));
  54.998 +			if (node) {
  54.999  				send_ack(conn, XS_RM);
 54.1000  				return;
 54.1001  			}
 54.1002 @@ -1084,53 +966,43 @@ static void do_rm(struct connection *con
 54.1003  		return;
 54.1004  	}
 54.1005  
 54.1006 -	if (!within_transaction(conn->transaction, node)) {
 54.1007 -		send_error(conn, EROFS);
 54.1008 -		return;
 54.1009 -	}
 54.1010 -
 54.1011 -	if (transaction_block(conn, node))
 54.1012 -		return;
 54.1013 -
 54.1014 -	if (streq(node, "/")) {
 54.1015 +	if (streq(name, "/")) {
 54.1016  		send_error(conn, EINVAL);
 54.1017  		return;
 54.1018  	}
 54.1019  
 54.1020 -	/* We move the directory to temporary name, destructor cleans up. */
 54.1021 -	path = node_dir(conn->transaction, node);
 54.1022 -	tmppath = talloc_asprintf(node, "%s.tmp", path);
 54.1023 -	talloc_set_destructor(tmppath, destroy_path);
 54.1024 +	/* Delete from parent first, then if something explodes fsck cleans. */
 54.1025 +	parent = read_node(conn, get_parent(name));
 54.1026 +	if (!parent) {
 54.1027 +		send_error(conn, EINVAL);
 54.1028 +		return;
 54.1029 +	}
 54.1030  
 54.1031 -	if (rename(path, tmppath) != 0) {
 54.1032 +	if (!delete_child(conn, parent, basename(name))) {
 54.1033 +		send_error(conn, EINVAL);
 54.1034 +		return;
 54.1035 +	}
 54.1036 +
 54.1037 +	delete_node(conn, node);
 54.1038 +	add_change_node(conn->transaction, name, true);
 54.1039 +	fire_watches(conn, name, true);
 54.1040 +	send_ack(conn, XS_RM);
 54.1041 +}
 54.1042 +
 54.1043 +static void do_get_perms(struct connection *conn, const char *name)
 54.1044 +{
 54.1045 +	struct node *node;
 54.1046 +	char *strings;
 54.1047 +	unsigned int len;
 54.1048 +
 54.1049 +	name = canonicalize(conn, name);
 54.1050 +	node = get_node(conn, name, XS_PERM_READ);
 54.1051 +	if (!node) {
 54.1052  		send_error(conn, errno);
 54.1053  		return;
 54.1054  	}
 54.1055  
 54.1056 -	add_change_node(conn->transaction, node, true);
 54.1057 -	fire_watches(conn, node, true);
 54.1058 -	send_ack(conn, XS_RM);
 54.1059 -}
 54.1060 -
 54.1061 -static void do_get_perms(struct connection *conn, const char *node)
 54.1062 -{
 54.1063 -	struct xs_permissions *perms;
 54.1064 -	char *strings;
 54.1065 -	unsigned int len, num;
 54.1066 -
 54.1067 -	node = canonicalize(conn, node);
 54.1068 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
 54.1069 -		send_error(conn, errno);
 54.1070 -		return;
 54.1071 -	}
 54.1072 -
 54.1073 -	perms = get_perms(node_dir(conn->transaction, node), &num);
 54.1074 -	if (!perms) {
 54.1075 -		send_error(conn, errno);
 54.1076 -		return;
 54.1077 -	}
 54.1078 -
 54.1079 -	strings = perms_to_strings(node, perms, num, &len);
 54.1080 +	strings = perms_to_strings(node, node->perms, node->num_perms, &len);
 54.1081  	if (!strings)
 54.1082  		send_error(conn, errno);
 54.1083  	else
 54.1084 @@ -1140,8 +1012,8 @@ static void do_get_perms(struct connecti
 54.1085  static void do_set_perms(struct connection *conn, struct buffered_data *in)
 54.1086  {
 54.1087  	unsigned int num;
 54.1088 -	char *node, *permstr;
 54.1089 -	struct xs_permissions *perms;
 54.1090 +	char *name, *permstr;
 54.1091 +	struct node *node;
 54.1092  
 54.1093  	num = xs_count_strings(in->buffer, in->used);
 54.1094  	if (num < 2) {
 54.1095 @@ -1150,37 +1022,30 @@ static void do_set_perms(struct connecti
 54.1096  	}
 54.1097  
 54.1098  	/* First arg is node name. */
 54.1099 -	node = canonicalize(conn, in->buffer);
 54.1100 +	name = canonicalize(conn, in->buffer);
 54.1101  	permstr = in->buffer + strlen(in->buffer) + 1;
 54.1102  	num--;
 54.1103  
 54.1104 -	if (!within_transaction(conn->transaction, node)) {
 54.1105 -		send_error(conn, EROFS);
 54.1106 -		return;
 54.1107 -	}
 54.1108 -
 54.1109 -	if (transaction_block(conn, node))
 54.1110 -		return;
 54.1111 -
 54.1112  	/* We must own node to do this (tools can do this too). */
 54.1113 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER)) {
 54.1114 +	node = get_node(conn, name, XS_PERM_WRITE|XS_PERM_OWNER);
 54.1115 +	if (!node) {
 54.1116  		send_error(conn, errno);
 54.1117  		return;
 54.1118  	}
 54.1119  
 54.1120 -	perms = talloc_array(node, struct xs_permissions, num);
 54.1121 -	if (!xs_strings_to_perms(perms, num, permstr)) {
 54.1122 +	node->perms = talloc_array(node, struct xs_permissions, num);
 54.1123 +	node->num_perms = num;
 54.1124 +	if (!xs_strings_to_perms(node->perms, num, permstr)) {
 54.1125 +		send_error(conn, errno);
 54.1126 +		return;
 54.1127 +	}
 54.1128 +	if (!write_node(conn, node)) {
 54.1129  		send_error(conn, errno);
 54.1130  		return;
 54.1131  	}
 54.1132  
 54.1133 -	if (!set_perms(conn->transaction, node, perms, num)) {
 54.1134 -		send_error(conn, errno);
 54.1135 -		return;
 54.1136 -	}
 54.1137 -
 54.1138 -	add_change_node(conn->transaction, node, false);
 54.1139 -	fire_watches(conn, node, false);
 54.1140 +	add_change_node(conn->transaction, name, false);
 54.1141 +	fire_watches(conn, name, false);
 54.1142  	send_ack(conn, XS_SET_PERMS);
 54.1143  }
 54.1144  
 54.1145 @@ -1221,14 +1086,10 @@ static void process_message(struct conne
 54.1146  	case XS_SHUTDOWN:
 54.1147  		/* FIXME: Implement gentle shutdown too. */
 54.1148  		/* Only tools can do this. */
 54.1149 -		if (conn->id != 0) {
 54.1150 +		if (conn->id != 0 || !conn->can_write) {
 54.1151  			send_error(conn, EACCES);
 54.1152  			break;
 54.1153  		}
 54.1154 -		if (!conn->can_write) {
 54.1155 -			send_error(conn, EROFS);
 54.1156 -			break;
 54.1157 -		}
 54.1158  		send_ack(conn, XS_SHUTDOWN);
 54.1159  		/* Everything hangs off auto-free context, freed at exit. */
 54.1160  		exit(0);
 54.1161 @@ -1263,7 +1124,7 @@ static void process_message(struct conne
 54.1162  		break;
 54.1163  
 54.1164  	case XS_TRANSACTION_START:
 54.1165 -		do_transaction_start(conn, onearg(in));
 54.1166 +		do_transaction_start(conn, in);
 54.1167  		break;
 54.1168  
 54.1169  	case XS_TRANSACTION_END:
 54.1170 @@ -1309,6 +1170,8 @@ static void consider_message(struct conn
 54.1171  	/* For simplicity, we kill the connection on OOM. */
 54.1172  	talloc_set_fail_handler(out_of_mem, &talloc_fail);
 54.1173  	if (setjmp(talloc_fail)) {
 54.1174 +		/* Free in before conn, in case it needs something. */
 54.1175 +		talloc_free(in);
 54.1176  		talloc_free(conn);
 54.1177  		goto end;
 54.1178  	}
 54.1179 @@ -1330,16 +1193,8 @@ static void consider_message(struct conn
 54.1180  	conn->in = new_buffer(conn);
 54.1181  	process_message(conn, in);
 54.1182  
 54.1183 -	if (conn->state == BLOCKED) {
 54.1184 -		/* Blocked by transaction: queue for re-xmit. */
 54.1185 -		talloc_free(conn->in);
 54.1186 -		conn->in = in;
 54.1187 -		in = NULL;
 54.1188 -		trace_blocked(conn, conn->in);
 54.1189 -	}
 54.1190 -
 54.1191 +	talloc_free(in);
 54.1192  end:
 54.1193 -	talloc_free(in);
 54.1194  	talloc_set_fail_handler(NULL, NULL);
 54.1195  	if (talloc_total_blocks(NULL)
 54.1196  	    != talloc_total_blocks(talloc_autofree_context()) + 1) {
 54.1197 @@ -1350,7 +1205,7 @@ end:
 54.1198  
 54.1199  /* Errors in reading or allocating here mean we get out of sync, so we
 54.1200   * drop the whole client connection. */
 54.1201 -void handle_input(struct connection *conn)
 54.1202 +static void handle_input(struct connection *conn)
 54.1203  {
 54.1204  	int bytes;
 54.1205  	struct buffered_data *in;
 54.1206 @@ -1402,41 +1257,12 @@ bad_client:
 54.1207  	talloc_free(conn);
 54.1208  }
 54.1209  
 54.1210 -void handle_output(struct connection *conn)
 54.1211 +static void handle_output(struct connection *conn)
 54.1212  {
 54.1213  	if (!write_message(conn))
 54.1214  		talloc_free(conn);
 54.1215  }
 54.1216  
 54.1217 -/* If a transaction has ended, see if we can unblock any connections. */
 54.1218 -static void unblock_connections(void)
 54.1219 -{
 54.1220 -	struct connection *i, *tmp;
 54.1221 -
 54.1222 -	list_for_each_entry_safe(i, tmp, &connections, list) {
 54.1223 -		switch (i->state) {
 54.1224 -		case BLOCKED:
 54.1225 -			if (!transaction_covering_node(i->blocked_by)) {
 54.1226 -				talloc_free(i->blocked_by);
 54.1227 -				i->blocked_by = NULL;
 54.1228 -				i->state = OK;
 54.1229 -				consider_message(i);
 54.1230 -			}
 54.1231 -			break;
 54.1232 -		case BUSY:
 54.1233 -		case OK:
 54.1234 -			break;
 54.1235 -		}
 54.1236 -	}
 54.1237 -
 54.1238 -	/* To balance bias, move first entry to end. */
 54.1239 -	if (!list_empty(&connections)) {
 54.1240 -		i = list_top(&connections, struct connection, list);
 54.1241 -		list_del(&i->list);
 54.1242 -		list_add_tail(&i->list, &connections);
 54.1243 -	}
 54.1244 -}
 54.1245 -
 54.1246  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
 54.1247  {
 54.1248  	/*
 54.1249 @@ -1451,7 +1277,6 @@ struct connection *new_connection(connwr
 54.1250  		return NULL;
 54.1251  
 54.1252  	new->state = OK;
 54.1253 -	new->blocked_by = NULL;
 54.1254  	new->out = new->waiting_reply = NULL;
 54.1255  	new->waiting_for_ack = NULL;
 54.1256  	new->fd = -1;
 54.1257 @@ -1504,25 +1329,9 @@ static void accept_connection(int sock, 
 54.1258  		close(fd);
 54.1259  }
 54.1260  
 54.1261 -/* Calc timespan from now to absolute time. */
 54.1262 -static void time_relative_to_now(struct timeval *tv)
 54.1263 -{
 54.1264 -	struct timeval now;
 54.1265 -
 54.1266 -	gettimeofday(&now, NULL);
 54.1267 -	if (timercmp(&now, tv, >))
 54.1268 -		timerclear(tv);
 54.1269 -	else {
 54.1270 -		tv->tv_sec -= now.tv_sec;
 54.1271 -		if (now.tv_usec > tv->tv_usec) {
 54.1272 -			tv->tv_sec--;
 54.1273 -			tv->tv_usec += 1000000;
 54.1274 -		}
 54.1275 -		tv->tv_usec -= now.tv_usec;
 54.1276 -	}
 54.1277 -}
 54.1278 -
 54.1279  #ifdef TESTING
 54.1280 +/* Valgrind can check our writes better if we don't use mmap */
 54.1281 +#define TDB_FLAGS TDB_NOMMAP
 54.1282  /* Useful for running under debugger. */
 54.1283  void dump_connection(void)
 54.1284  {
 54.1285 @@ -1532,13 +1341,10 @@ void dump_connection(void)
 54.1286  		printf("Connection %p:\n", i);
 54.1287  		printf("    state = %s\n",
 54.1288  		       i->state == OK ? "OK"
 54.1289 -		       : i->state == BLOCKED ? "BLOCKED"
 54.1290  		       : i->state == BUSY ? "BUSY"
 54.1291  		       : "INVALID");
 54.1292  		if (i->id)
 54.1293  			printf("    id = %i\n", i->id);
 54.1294 -		if (i->blocked_by)
 54.1295 -			printf("    blocked on = %s\n", i->blocked_by);
 54.1296  		if (!i->in->inhdr || i->in->used)
 54.1297  			printf("    got %i bytes of %s\n",
 54.1298  			       i->in->used, i->in->inhdr ? "header" : "data");
 54.1299 @@ -1559,44 +1365,53 @@ void dump_connection(void)
 54.1300  		dump_watches(i);
 54.1301  	}
 54.1302  }
 54.1303 +#else
 54.1304 +#define TDB_FLAGS 0
 54.1305  #endif
 54.1306  
 54.1307 +/* We create initial nodes manually. */
 54.1308 +static void manual_node(const char *name, const char *child)
 54.1309 +{
 54.1310 +	struct node *node;
 54.1311 +	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 54.1312 +
 54.1313 +	node = talloc(NULL, struct node);
 54.1314 +	node->name = name;
 54.1315 +	node->perms = &perms;
 54.1316 +	node->num_perms = 1;
 54.1317 +	node->data = NULL;
 54.1318 +	node->datalen = 0;
 54.1319 +	node->children = (char *)child;
 54.1320 +	if (child)
 54.1321 +		node->childlen = strlen(child) + 1;
 54.1322 +	else
 54.1323 +		node->childlen = 0;
 54.1324 +
 54.1325 +	if (!write_node(NULL, node))
 54.1326 +		barf_perror("Could not create initial node %s", name);
 54.1327 +	talloc_free(node);
 54.1328 +}
 54.1329 +
 54.1330 +#
 54.1331 +
 54.1332  static void setup_structure(void)
 54.1333  {
 54.1334 -	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 54.1335 -	char *root, *dir, *permfile;
 54.1336 -
 54.1337 -	/* Create root directory, with permissions. */
 54.1338 -	if (mkdir(xs_daemon_store(), 0750) != 0) {
 54.1339 -		if (errno != EEXIST)
 54.1340 -			barf_perror("Could not create root %s",
 54.1341 -				    xs_daemon_store());
 54.1342 -		return;
 54.1343 -	}
 54.1344 -	root = talloc_strdup(talloc_autofree_context(), "/");
 54.1345 -	if (!set_perms(NULL, root, &perms, 1))
 54.1346 -		barf_perror("Could not create permissions in root");
 54.1347 +	char *tdbname;
 54.1348 +	tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
 54.1349 +	tdb_ctx = tdb_open(tdbname, 0, TDB_FLAGS, O_RDWR, 0);
 54.1350  
 54.1351 -	/* Create tool directory, with xenstored subdir. */
 54.1352 -	dir = talloc_asprintf(root, "%s/%s", xs_daemon_store(), "tool");
 54.1353 -	if (mkdir(dir, 0750) != 0)
 54.1354 -		barf_perror("Making dir %s", dir);
 54.1355 -	
 54.1356 -	permfile = talloc_strdup(root, "/tool");
 54.1357 -	if (!set_perms(NULL, permfile, &perms, 1))
 54.1358 -		barf_perror("Could not create permissions on %s", permfile);
 54.1359 +	if (!tdb_ctx) {
 54.1360 +		tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
 54.1361 +				   0640);
 54.1362 +		if (!tdb_ctx)
 54.1363 +			barf_perror("Could not create tdb file %s", tdbname);
 54.1364  
 54.1365 -	dir = talloc_asprintf(root, "%s/%s", dir, "xenstored");
 54.1366 -	if (mkdir(dir, 0750) != 0)
 54.1367 -		barf_perror("Making dir %s", dir);
 54.1368 -	
 54.1369 -	permfile = talloc_strdup(root, "/tool/xenstored");
 54.1370 -	if (!set_perms(NULL, permfile, &perms, 1))
 54.1371 -		barf_perror("Could not create permissions on %s", permfile);
 54.1372 -	talloc_free(root);
 54.1373 -	if (mkdir(xs_daemon_transactions(), 0750) != 0)
 54.1374 -		barf_perror("Could not create transaction dir %s",
 54.1375 -			    xs_daemon_transactions());
 54.1376 +		manual_node("/", "tool");
 54.1377 +		manual_node("/tool", "xenstored");
 54.1378 +		manual_node("/tool/xenstored", NULL);
 54.1379 +	}
 54.1380 +
 54.1381 +	/* FIXME: Fsck */
 54.1382  }
 54.1383  
 54.1384  static void write_pidfile(const char *pidfile)
 54.1385 @@ -1759,17 +1574,8 @@ int main(int argc, char *argv[])
 54.1386  	/* FIXME: Rewrite so noone can starve. */
 54.1387  	for (;;) {
 54.1388  		struct connection *i;
 54.1389 -		struct timeval *tvp = NULL, tv;
 54.1390  
 54.1391 -		timerclear(&tv);
 54.1392 -		shortest_transaction_timeout(&tv);
 54.1393 -		shortest_watch_ack_timeout(&tv);
 54.1394 -		if (timerisset(&tv)) {
 54.1395 -			time_relative_to_now(&tv);
 54.1396 -			tvp = &tv;
 54.1397 -		}
 54.1398 -
 54.1399 -		if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
 54.1400 +		if (select(max+1, &inset, &outset, NULL, NULL) < 0) {
 54.1401  			if (errno == EINTR)
 54.1402  				continue;
 54.1403  			barf_perror("Select failed");
 54.1404 @@ -1818,14 +1624,6 @@ int main(int argc, char *argv[])
 54.1405  			}
 54.1406  		}
 54.1407  
 54.1408 -		if (tvp) {
 54.1409 -			check_transaction_timeout();
 54.1410 -			check_watch_ack_timeout();
 54.1411 -		}
 54.1412 -
 54.1413 -		/* If transactions ended, we might be able to do more work. */
 54.1414 -		unblock_connections();
 54.1415 -
 54.1416  		max = initialize_set(&inset, &outset, *sock, *ro_sock,
 54.1417  				     event_fd);
 54.1418  	}
    55.1 --- a/tools/xenstore/xenstored_core.h	Thu Sep 29 13:35:13 2005 -0600
    55.2 +++ b/tools/xenstore/xenstored_core.h	Thu Sep 29 16:22:02 2005 -0600
    55.3 @@ -28,6 +28,7 @@
    55.4  #include "xs_lib.h"
    55.5  #include "xenstored.h"
    55.6  #include "list.h"
    55.7 +#include "tdb.h"
    55.8  
    55.9  struct buffered_data
   55.10  {
   55.11 @@ -49,8 +50,6 @@ typedef int connreadfn_t(struct connecti
   55.12  
   55.13  enum state
   55.14  {
   55.15 -	/* Blocked by transaction. */
   55.16 -	BLOCKED,
   55.17  	/* Doing action, not listening */
   55.18  	BUSY,
   55.19  	/* Completed */
   55.20 @@ -70,9 +69,6 @@ struct connection
   55.21  	/* Blocked on transaction?  Busy? */
   55.22  	enum state state;
   55.23  
   55.24 -	/* Node we are waiting for (if state == BLOCKED) */
   55.25 -	char *blocked_by;
   55.26 -
   55.27  	/* Is this a read-only connection? */
   55.28  	bool can_write;
   55.29  
   55.30 @@ -103,9 +99,27 @@ struct connection
   55.31  };
   55.32  extern struct list_head connections;
   55.33  
   55.34 -/* Return length of string (including nul) at this offset. */
   55.35 -unsigned int get_string(const struct buffered_data *data,
   55.36 -			unsigned int offset);
   55.37 +struct node {
   55.38 +	const char *name;
   55.39 +
   55.40 +	/* Database I came from */
   55.41 +	TDB_CONTEXT *tdb;
   55.42 +
   55.43 +	/* Parent (optional) */
   55.44 +	struct node *parent;
   55.45 +
   55.46 +	/* Permissions. */
   55.47 +	unsigned int num_perms;
   55.48 +	struct xs_permissions *perms;
   55.49 +
   55.50 +	/* Contents. */
   55.51 +	unsigned int datalen;
   55.52 +	void *data;
   55.53 +
   55.54 +	/* Children, each nul-terminated. */
   55.55 +	unsigned int childlen;
   55.56 +	char *children;
   55.57 +};
   55.58  
   55.59  /* Break input into vectors, return the number, fill in up to num of them. */
   55.60  unsigned int get_strings(struct buffered_data *data,
   55.61 @@ -114,9 +128,6 @@ unsigned int get_strings(struct buffered
   55.62  /* Is child node a child or equal to parent node? */
   55.63  bool is_child(const char *child, const char *parent);
   55.64  
   55.65 -/* Create a new buffer with lifetime of context. */
   55.66 -struct buffered_data *new_buffer(void *ctx);
   55.67 -
   55.68  void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
   55.69  		const void *data, unsigned int len);
   55.70  
   55.71 @@ -129,15 +140,22 @@ void send_error(struct connection *conn,
   55.72  /* Canonicalize this path if possible. */
   55.73  char *canonicalize(struct connection *conn, const char *node);
   55.74  
   55.75 -/* Check permissions on this node. */
   55.76 -bool check_node_perms(struct connection *conn, const char *node,
   55.77 -		      enum xs_perm_type perm);
   55.78 -
   55.79  /* Check if node is an event node. */
   55.80  bool check_event_node(const char *node);
   55.81  
   55.82 -/* Path to this node outside transaction. */
   55.83 -char *node_dir_outside_transaction(const char *node);
   55.84 +/* Get this node, checking we have permissions. */
   55.85 +struct node *get_node(struct connection *conn,
   55.86 +		      const char *name,
   55.87 +		      enum xs_perm_type perm);
   55.88 +
   55.89 +/* Get TDB context for this connection */
   55.90 +TDB_CONTEXT *tdb_context(struct connection *conn);
   55.91 +
   55.92 +/* Destructor for tdbs: required for transaction code */
   55.93 +int destroy_tdb(void *_tdb);
   55.94 +
   55.95 +/* Replace the tdb: required for transaction code */
   55.96 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
   55.97  
   55.98  /* Fail due to excessive corruption, capitalist pigdogs! */
   55.99  void __attribute__((noreturn)) corrupt(struct connection *conn,
  55.100 @@ -145,23 +163,9 @@ void __attribute__((noreturn)) corrupt(s
  55.101  
  55.102  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
  55.103  
  55.104 -void handle_input(struct connection *conn);
  55.105 -void handle_output(struct connection *conn);
  55.106 -
  55.107  /* Is this a valid node name? */
  55.108  bool is_valid_nodename(const char *node);
  55.109  
  55.110 -/* Return a pointer to an open dir, self-closig and attached to pathname. */
  55.111 -DIR **talloc_opendir(const char *pathname);
  55.112 -
  55.113 -/* Return a pointer to an fd, self-closing and attached to this pathname. */
  55.114 -int *talloc_open(const char *pathname, int flags, int mode);
  55.115 -
  55.116 -/* Convenient talloc-style destructor for paths. */
  55.117 -int destroy_path(void *path);
  55.118 -
  55.119 -/* Read entire contents of a talloced fd. */
  55.120 -void *read_all(int *fd, unsigned int *size);
  55.121  
  55.122  /* Tracing infrastructure. */
  55.123  void trace_create(const void *data, const char *type);
    56.1 --- a/tools/xenstore/xenstored_domain.c	Thu Sep 29 13:35:13 2005 -0600
    56.2 +++ b/tools/xenstore/xenstored_domain.c	Thu Sep 29 16:22:02 2005 -0600
    56.3 @@ -309,16 +309,11 @@ void do_introduce(struct connection *con
    56.4  		return;
    56.5  	}
    56.6  
    56.7 -	if (conn->id != 0) {
    56.8 +	if (conn->id != 0 || !conn->can_write) {
    56.9  		send_error(conn, EACCES);
   56.10  		return;
   56.11  	}
   56.12  
   56.13 -	if (!conn->can_write) {
   56.14 -		send_error(conn, EROFS);
   56.15 -		return;
   56.16 -	}
   56.17 -
   56.18  	/* Sanity check args. */
   56.19  	if ((atoi(vec[2]) <= 0) || !is_valid_nodename(vec[3])) {
   56.20  		send_error(conn, EINVAL);
   56.21 @@ -386,7 +381,7 @@ void do_release(struct connection *conn,
   56.22  
   56.23  	talloc_free(domain->conn);
   56.24  
   56.25 -	fire_watches(NULL, "@releaseDomain", false);
   56.26 +	fire_watches(conn, "@releaseDomain", false);
   56.27  
   56.28  	send_ack(conn, XS_RELEASE);
   56.29  }
    57.1 --- a/tools/xenstore/xenstored_transaction.c	Thu Sep 29 13:35:13 2005 -0600
    57.2 +++ b/tools/xenstore/xenstored_transaction.c	Thu Sep 29 16:22:02 2005 -0600
    57.3 @@ -26,6 +26,7 @@
    57.4  #include <stdarg.h>
    57.5  #include <stdlib.h>
    57.6  #include <fcntl.h>
    57.7 +#include <unistd.h>
    57.8  #include "talloc.h"
    57.9  #include "list.h"
   57.10  #include "xenstored_transaction.h"
   57.11 @@ -51,74 +52,26 @@ struct transaction
   57.12  	/* Global list of transactions. */
   57.13  	struct list_head list;
   57.14  
   57.15 +	/* Generation when transaction started. */
   57.16 +	unsigned int generation;
   57.17 +
   57.18  	/* My owner (conn->transaction == me). */
   57.19  	struct connection *conn;
   57.20  
   57.21 -	/* Subtree this transaction covers */
   57.22 -	char *node;
   57.23 -
   57.24 -	/* Base for this transaction. */
   57.25 -	char *divert;
   57.26 +	/* TDB to work on, and filename */
   57.27 +	TDB_CONTEXT *tdb;
   57.28 +	char *tdb_name;
   57.29  
   57.30  	/* List of changed nodes. */
   57.31  	struct list_head changes;
   57.32 -
   57.33 -	/* Someone's waiting: time limit. */
   57.34 -	struct timeval timeout;
   57.35 -
   57.36 -	/* We've timed out. */
   57.37 -	bool destined_to_fail;
   57.38  };
   57.39  static LIST_HEAD(transactions);
   57.40 -
   57.41 -bool within_transaction(struct transaction *trans, const char *node)
   57.42 -{
   57.43 -	if (!trans)
   57.44 -		return true;
   57.45 -	return is_child(node, trans->node);
   57.46 -}
   57.47 -
   57.48 -/* You are on notice: this transaction is blocking someone. */
   57.49 -static void start_transaction_timeout(struct transaction *trans)
   57.50 -{
   57.51 -	if (timerisset(&trans->timeout))
   57.52 -		return;
   57.53 -
   57.54 -	/* One second timeout. */
   57.55 -	gettimeofday(&trans->timeout, NULL);
   57.56 -	trans->timeout.tv_sec += 1;
   57.57 -}
   57.58 -
   57.59 -struct transaction *transaction_covering_node(const char *node)
   57.60 -{
   57.61 -	struct transaction *i;
   57.62 +static unsigned int generation;
   57.63  
   57.64 -	list_for_each_entry(i, &transactions, list) {
   57.65 -		if (i->destined_to_fail)
   57.66 -			continue;
   57.67 -		if (is_child(i->node, node) || is_child(node, i->node))
   57.68 -			return i;
   57.69 -	}
   57.70 -	return NULL;
   57.71 -}
   57.72 -
   57.73 -bool transaction_block(struct connection *conn, const char *node)
    57.74 +/* Return the tdb context this transaction works on. */
   57.75 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans)
   57.76  {
   57.77 -	struct transaction *trans;
   57.78 -
   57.79 -	/* Transactions don't overlap, so we can't be blocked by
   57.80 -	 * others if we're in one. */
   57.81 -	if (conn->transaction)
   57.82 -		return false;
   57.83 -
   57.84 -	trans = transaction_covering_node(node);
   57.85 -	if (trans) {
   57.86 -		start_transaction_timeout(trans);
   57.87 -		conn->state = BLOCKED;
   57.88 -		conn->blocked_by = talloc_strdup(conn, node);
   57.89 -		return true;
   57.90 -	}
   57.91 -	return false;
   57.92 +	return trans->tdb;
   57.93  }
   57.94  
   57.95  /* Callers get a change node (which can fail) and only commit after they've
   57.96 @@ -127,8 +80,11 @@ void add_change_node(struct transaction 
   57.97  {
   57.98  	struct changed_node *i;
   57.99  
  57.100 -	if (!trans)
  57.101 +	if (!trans) {
  57.102 +		/* They're changing the global database. */
  57.103 +		generation++;
  57.104  		return;
  57.105 +	}
  57.106  
  57.107  	list_for_each_entry(i, &trans->changes, list)
  57.108  		if (streq(i->node, node))
  57.109 @@ -140,167 +96,47 @@ void add_change_node(struct transaction 
  57.110  	list_add_tail(&i->list, &trans->changes);
  57.111  }
  57.112  
  57.113 -char *node_dir_inside_transaction(struct transaction *trans, const char *node)
  57.114 -{
  57.115 -	return talloc_asprintf(node, "%s/%s", trans->divert,
  57.116 -			       node + strlen(trans->node));
  57.117 -}
  57.118 -
  57.119 -void shortest_transaction_timeout(struct timeval *tv)
  57.120 -{
  57.121 -	struct transaction *i;
  57.122 -
  57.123 -	list_for_each_entry(i, &transactions, list) {
  57.124 -		if (!timerisset(&i->timeout))
  57.125 -			continue;
  57.126 -
  57.127 -		if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
  57.128 -			*tv = i->timeout;
  57.129 -	}
  57.130 -}	
  57.131 -
  57.132 -void check_transaction_timeout(void)
  57.133 -{
  57.134 -	struct transaction *i;
  57.135 -	struct timeval now;
  57.136 -
  57.137 -	gettimeofday(&now, NULL);
  57.138 -
  57.139 -	list_for_each_entry(i, &transactions, list) {
  57.140 -		if (!timerisset(&i->timeout))
  57.141 -			continue;
  57.142 -
  57.143 -		if (timercmp(&i->timeout, &now, <))
  57.144 -			i->destined_to_fail = true;
  57.145 -	}
  57.146 -}
  57.147 -
  57.148  static int destroy_transaction(void *_transaction)
  57.149  {
  57.150  	struct transaction *trans = _transaction;
  57.151  
  57.152  	list_del(&trans->list);
  57.153  	trace_destroy(trans, "transaction");
  57.154 -	return destroy_path(trans->divert);
  57.155 -}
  57.156 -
  57.157 -static bool copy_file(const char *src, const char *dst)
  57.158 -{
  57.159 -	int *infd, *outfd;
  57.160 -	void *data;
  57.161 -	unsigned int size;
  57.162 -
  57.163 -	infd = talloc_open(src, O_RDONLY, 0);
  57.164 -	if (!infd)
  57.165 -		return false;
  57.166 -	outfd = talloc_open(dst, O_WRONLY|O_CREAT|O_EXCL, 0640);
  57.167 -	if (!outfd)
  57.168 -		return false;
  57.169 -	data = read_all(infd, &size);
  57.170 -	if (!data)
  57.171 -		return false;
  57.172 -	return xs_write_all(*outfd, data, size);
  57.173 +	if (trans->tdb)
  57.174 +		tdb_close(trans->tdb);
  57.175 +	unlink(trans->tdb_name);
  57.176 +	return 0;
  57.177  }
  57.178  
  57.179 -static bool copy_dir(const char *src, const char *dst)
  57.180 +void do_transaction_start(struct connection *conn, struct buffered_data *in)
  57.181  {
  57.182 -	DIR **dir;
  57.183 -	struct dirent *dirent;
  57.184 -
  57.185 -	if (mkdir(dst, 0750) != 0)
  57.186 -		return false;
  57.187 -
  57.188 -	dir = talloc_opendir(src);
  57.189 -	if (!dir)
  57.190 -		return false;
  57.191 -
  57.192 -	while ((dirent = readdir(*dir)) != NULL) {
  57.193 -		struct stat st;
  57.194 -		char *newsrc, *newdst;
  57.195 -
  57.196 -		if (streq(dirent->d_name, ".") || streq(dirent->d_name, ".."))
  57.197 -			continue;
  57.198 -
  57.199 -		newsrc = talloc_asprintf(src, "%s/%s", src, dirent->d_name);
  57.200 -		newdst = talloc_asprintf(src, "%s/%s", dst, dirent->d_name);
  57.201 -		if (stat(newsrc, &st) != 0)
  57.202 -			return false;
  57.203 -		
  57.204 -		if (S_ISDIR(st.st_mode)) {
  57.205 -			if (!copy_dir(newsrc, newdst))
  57.206 -				return false;
  57.207 -		} else {
  57.208 -			if (!copy_file(newsrc, newdst))
  57.209 -				return false;
  57.210 -		}
  57.211 -		/* Free now so we don't run out of file descriptors */
  57.212 -		talloc_free(newsrc);
  57.213 -		talloc_free(newdst);
  57.214 -	}
  57.215 -	return true;
  57.216 -}
  57.217 -
  57.218 -void do_transaction_start(struct connection *conn, const char *node)
  57.219 -{
  57.220 -	struct transaction *transaction;
  57.221 -	char *dir;
  57.222 +	struct transaction *trans;
  57.223  
  57.224  	if (conn->transaction) {
  57.225  		send_error(conn, EBUSY);
  57.226  		return;
  57.227  	}
  57.228  
  57.229 -	node = canonicalize(conn, node);
  57.230 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  57.231 +	/* Attach transaction to input for autofree until it's complete */
  57.232 +	trans = talloc(in, struct transaction);
  57.233 +	INIT_LIST_HEAD(&trans->changes);
  57.234 +	trans->conn = conn;
  57.235 +	trans->generation = generation;
  57.236 +	trans->tdb_name = talloc_asprintf(trans, "%s.%p",
  57.237 +					  xs_daemon_tdb(), trans);
  57.238 +	trans->tdb = tdb_copy(tdb_context(conn), trans->tdb_name);
  57.239 +	if (!trans->tdb) {
  57.240  		send_error(conn, errno);
  57.241  		return;
  57.242  	}
  57.243 -
  57.244 -	if (transaction_block(conn, node))
  57.245 -		return;
  57.246 -
  57.247 -	dir = node_dir_outside_transaction(node);
  57.248 -
  57.249 -	/* Attach transaction to node for autofree until it's complete */
  57.250 -	transaction = talloc(node, struct transaction);
  57.251 -	transaction->node = talloc_strdup(transaction, node);
  57.252 -	transaction->divert = talloc_asprintf(transaction, "%s/%p", 
  57.253 -					      xs_daemon_transactions(),
  57.254 -					      transaction);
  57.255 -	INIT_LIST_HEAD(&transaction->changes);
  57.256 -	transaction->conn = conn;
  57.257 -	timerclear(&transaction->timeout);
  57.258 -	transaction->destined_to_fail = false;
  57.259 -	list_add_tail(&transaction->list, &transactions);
  57.260 -	talloc_set_destructor(transaction, destroy_transaction);
  57.261 -	trace_create(transaction, "transaction");
  57.262 +	/* Make it close if we go away. */
  57.263 +	talloc_steal(trans, trans->tdb);
  57.264  
  57.265 -	if (!copy_dir(dir, transaction->divert)) {
  57.266 -		send_error(conn, errno);
  57.267 -		return;
  57.268 -	}
  57.269 -
  57.270 -	talloc_steal(conn, transaction);
  57.271 -	conn->transaction = transaction;
  57.272 -	send_ack(transaction->conn, XS_TRANSACTION_START);
  57.273 -}
  57.274 -
  57.275 -static bool commit_transaction(struct transaction *trans)
  57.276 -{
  57.277 -	char *tmp, *dir;
  57.278 -
  57.279 -	/* Move: orig -> .old, repl -> orig.  Cleanup deletes .old. */
  57.280 -	dir = node_dir_outside_transaction(trans->node);
  57.281 -	tmp = talloc_asprintf(trans, "%s.old", dir);
  57.282 -
  57.283 -	if (rename(dir, tmp) != 0)
  57.284 -		return false;
  57.285 -	if (rename(trans->divert, dir) != 0)
  57.286 -		corrupt(trans->conn, "Failed rename %s to %s",
  57.287 -			trans->divert, dir);
  57.288 -
  57.289 -	trans->divert = tmp;
  57.290 -	return true;
  57.291 +	/* Now we own it. */
  57.292 +	conn->transaction = talloc_steal(conn, trans);
  57.293 +	list_add_tail(&trans->list, &transactions);
  57.294 +	talloc_set_destructor(trans, destroy_transaction);
  57.295 +	send_ack(conn, XS_TRANSACTION_START);
  57.296  }
  57.297  
  57.298  void do_transaction_end(struct connection *conn, const char *arg)
  57.299 @@ -318,25 +154,29 @@ void do_transaction_end(struct connectio
  57.300  		return;
  57.301  	}
  57.302  
  57.303 -	/* Set to NULL so fire_watches sends events. */
  57.304 +	/* Set to NULL so fire_watches sends events, tdb_context works. */
  57.305  	trans = conn->transaction;
  57.306  	conn->transaction = NULL;
  57.307  	/* Attach transaction to arg for auto-cleanup */
  57.308  	talloc_steal(arg, trans);
  57.309  
  57.310  	if (streq(arg, "T")) {
  57.311 -		if (trans->destined_to_fail) {
  57.312 -			send_error(conn, ETIMEDOUT);
   57.313 +		/* FIXME: Merge, rather than failing on any change. */
  57.314 +		if (trans->generation != generation) {
  57.315 +			send_error(conn, EAGAIN);
  57.316  			return;
  57.317  		}
  57.318 -		if (!commit_transaction(trans)) {
  57.319 +		if (!replace_tdb(trans->tdb_name, trans->tdb)) {
  57.320  			send_error(conn, errno);
  57.321  			return;
  57.322  		}
  57.323 +		/* Don't close this: we won! */
  57.324 +		trans->tdb = NULL;
  57.325  
  57.326  		/* Fire off the watches for everything that changed. */
  57.327  		list_for_each_entry(i, &trans->changes, list)
  57.328  			fire_watches(conn, i->node, i->recurse);
  57.329 +		generation++;
  57.330  	}
  57.331  	send_ack(conn, XS_TRANSACTION_END);
  57.332  }
    58.1 --- a/tools/xenstore/xenstored_transaction.h	Thu Sep 29 13:35:13 2005 -0600
    58.2 +++ b/tools/xenstore/xenstored_transaction.h	Thu Sep 29 16:22:02 2005 -0600
    58.3 @@ -22,29 +22,14 @@
    58.4  
    58.5  struct transaction;
    58.6  
    58.7 -void do_transaction_start(struct connection *conn, const char *node);
    58.8 +void do_transaction_start(struct connection *conn, struct buffered_data *node);
    58.9  void do_transaction_end(struct connection *conn, const char *arg);
   58.10  
   58.11 -/* Is node covered by this transaction? */
   58.12 -bool within_transaction(struct transaction *trans, const char *node);
   58.13 -
   58.14 -/* If a write op on this node blocked by another connections' transaction,
   58.15 - * mark conn, setup transaction timeout and return true.
   58.16 - */
   58.17 -bool transaction_block(struct connection *conn, const char *node);
   58.18 -
   58.19 -/* Return transaction which covers this node. */
   58.20 -struct transaction *transaction_covering_node(const char *node);
   58.21 -
   58.22 -/* Return directory of node within transaction t. */
   58.23 -char *node_dir_inside_transaction(struct transaction *t, const char *node);
   58.24 +bool transaction_block(struct connection *conn);
   58.25  
   58.26  /* This node was changed: can fail and longjmp. */
   58.27  void add_change_node(struct transaction *trans, const char *node, bool recurse);
   58.28  
   58.29 -/* Get shortest timeout: leave tv unset if none. */
   58.30 -void shortest_transaction_timeout(struct timeval *tv);
   58.31 -
   58.32 -/* Have any transactions timed out yet? */
   58.33 -void check_transaction_timeout(void);
    58.34 +/* Return the tdb context this transaction works on. */
   58.35 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans);
   58.36  #endif /* _XENSTORED_TRANSACTION_H */
    59.1 --- a/tools/xenstore/xenstored_watch.c	Thu Sep 29 13:35:13 2005 -0600
    59.2 +++ b/tools/xenstore/xenstored_watch.c	Thu Sep 29 16:22:02 2005 -0600
    59.3 @@ -96,36 +96,38 @@ static int destroy_watch_event(void *_ev
    59.4  }
    59.5  
    59.6  static void add_event(struct connection *conn,
    59.7 -		      struct watch *watch, const char *node)
    59.8 +		      struct watch *watch,
    59.9 +		      const char *name)
   59.10  {
   59.11  	struct watch_event *event;
   59.12  
   59.13 -	/* Check read permission: no permission, no watch event.
   59.14 -	 * If it doesn't exist, we need permission to read parent.
   59.15 -	 */
   59.16 -	if (!check_node_perms(conn, node, XS_PERM_READ|XS_PERM_ENOENT_OK) &&
   59.17 -	    !check_event_node(node)) {
   59.18 -		return;
   59.19 +	if (!check_event_node(name)) {
   59.20 +		/* Can this conn load node, or see that it doesn't exist? */
   59.21 +		struct node *node;
   59.22 +
   59.23 +		node = get_node(conn, name, XS_PERM_READ);
   59.24 +		if (!node && errno != ENOENT)
   59.25 +			return;
   59.26  	}
   59.27  
   59.28  	if (watch->relative_path) {
   59.29 -		node += strlen(watch->relative_path);
   59.30 -		if (*node == '/') /* Could be "" */
   59.31 -			node++;
   59.32 +		name += strlen(watch->relative_path);
   59.33 +		if (*name == '/') /* Could be "" */
   59.34 +			name++;
   59.35  	}
   59.36  
   59.37  	event = talloc(watch, struct watch_event);
   59.38 -	event->len = strlen(node) + 1 + strlen(watch->token) + 1;
   59.39 +	event->len = strlen(name) + 1 + strlen(watch->token) + 1;
   59.40  	event->data = talloc_array(event, char, event->len);
   59.41 -	strcpy(event->data, node);
   59.42 -	strcpy(event->data + strlen(node) + 1, watch->token);
   59.43 +	strcpy(event->data, name);
   59.44 +	strcpy(event->data + strlen(name) + 1, watch->token);
   59.45  	talloc_set_destructor(event, destroy_watch_event);
   59.46  	list_add_tail(&event->list, &watch->events);
   59.47  	trace_create(event, "watch_event");
   59.48  }
   59.49  
   59.50  /* FIXME: we fail to fire on out of memory.  Should drop connections. */
   59.51 -void fire_watches(struct connection *conn, const char *node, bool recurse)
   59.52 +void fire_watches(struct connection *conn, const char *name, bool recurse)
   59.53  {
   59.54  	struct connection *i;
   59.55  	struct watch *watch;
   59.56 @@ -137,9 +139,9 @@ void fire_watches(struct connection *con
   59.57  	/* Create an event for each watch. */
   59.58  	list_for_each_entry(i, &connections, list) {
   59.59  		list_for_each_entry(watch, &i->watches, list) {
   59.60 -			if (is_child(node, watch->node))
   59.61 -				add_event(i, watch, node);
   59.62 -			else if (recurse && is_child(watch->node, node))
   59.63 +			if (is_child(name, watch->node))
   59.64 +				add_event(i, watch, name);
   59.65 +			else if (recurse && is_child(watch->node, name))
   59.66  				add_event(i, watch, watch->node);
   59.67  			else
   59.68  				continue;
   59.69 @@ -156,49 +158,6 @@ static int destroy_watch(void *_watch)
   59.70  	return 0;
   59.71  }
   59.72  
   59.73 -void shortest_watch_ack_timeout(struct timeval *tv)
   59.74 -{
   59.75 -	(void)tv;
   59.76 -#if 0 /* FIXME */
   59.77 -	struct watch *watch;
   59.78 -
   59.79 -	list_for_each_entry(watch, &watches, list) {
   59.80 -		struct watch_event *i;
   59.81 -		list_for_each_entry(i, &watch->events, list) {
   59.82 -			if (!timerisset(&i->timeout))
   59.83 -				continue;
   59.84 -			if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
   59.85 -				*tv = i->timeout;
   59.86 -		}
   59.87 -	}
   59.88 -#endif
   59.89 -}	
   59.90 -
   59.91 -void check_watch_ack_timeout(void)
   59.92 -{
   59.93 -#if 0
   59.94 -	struct watch *watch;
   59.95 -	struct timeval now;
   59.96 -
   59.97 -	gettimeofday(&now, NULL);
   59.98 -	list_for_each_entry(watch, &watches, list) {
   59.99 -		struct watch_event *i, *tmp;
  59.100 -		list_for_each_entry_safe(i, tmp, &watch->events, list) {
  59.101 -			if (!timerisset(&i->timeout))
  59.102 -				continue;
  59.103 -			if (timercmp(&i->timeout, &now, <)) {
  59.104 -				xprintf("Warning: timeout on watch event %s"
  59.105 -					" token %s\n",
  59.106 -					i->node, watch->token);
  59.107 -				trace_watch_timeout(watch->conn, i->node,
  59.108 -						    watch->token);
  59.109 -				timerclear(&i->timeout);
  59.110 -			}
  59.111 -		}
  59.112 -	}
  59.113 -#endif
  59.114 -}
  59.115 -
  59.116  void do_watch(struct connection *conn, struct buffered_data *in)
  59.117  {
  59.118  	struct watch *watch;
    60.1 --- a/tools/xenstore/xenstored_watch.h	Thu Sep 29 13:35:13 2005 -0600
    60.2 +++ b/tools/xenstore/xenstored_watch.h	Thu Sep 29 16:22:02 2005 -0600
    60.3 @@ -32,15 +32,9 @@ bool is_watch_event(struct connection *c
    60.4  /* Look through our watches: if any of them have an event, queue it. */
    60.5  void queue_next_event(struct connection *conn);
    60.6  
    60.7 -/* Fire all watches: recurse means all the children are effected (ie. rm).
    60.8 +/* Fire all watches: recurse means all the children are affected (ie. rm).
    60.9   */
   60.10 -void fire_watches(struct connection *conn, const char *node, bool recurse);
   60.11 -
   60.12 -/* Find shortest timeout: if any, reduce tv (may already be set). */
   60.13 -void shortest_watch_ack_timeout(struct timeval *tv);
   60.14 -
   60.15 -/* Check for watches which may have timed out. */
   60.16 -void check_watch_ack_timeout(void);
   60.17 +void fire_watches(struct connection *conn, const char *name, bool recurse);
   60.18  
   60.19  void dump_watches(struct connection *conn);
   60.20  
    61.1 --- a/tools/xenstore/xs.c	Thu Sep 29 13:35:13 2005 -0600
    61.2 +++ b/tools/xenstore/xs.c	Thu Sep 29 16:22:02 2005 -0600
    61.3 @@ -497,13 +497,12 @@ bool xs_unwatch(struct xs_handle *h, con
    61.4  
    61.5  /* Start a transaction: changes by others will not be seen during this
    61.6   * transaction, and changes will not be visible to others until end.
    61.7 - * Transaction only applies to the given subtree.
    61.8   * You can only have one transaction at any time.
    61.9   * Returns false on failure.
   61.10   */
   61.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree)
   61.12 +bool xs_transaction_start(struct xs_handle *h)
   61.13  {
   61.14 -	return xs_bool(xs_single(h, XS_TRANSACTION_START, subtree, NULL));
   61.15 +	return xs_bool(xs_single(h, XS_TRANSACTION_START, "", NULL));
   61.16  }
   61.17  
   61.18  /* End a transaction.
    62.1 --- a/tools/xenstore/xs.h	Thu Sep 29 13:35:13 2005 -0600
    62.2 +++ b/tools/xenstore/xs.h	Thu Sep 29 16:22:02 2005 -0600
    62.3 @@ -109,16 +109,15 @@ bool xs_unwatch(struct xs_handle *h, con
    62.4  
    62.5  /* Start a transaction: changes by others will not be seen during this
    62.6   * transaction, and changes will not be visible to others until end.
    62.7 - * Transaction only applies to the given subtree.
    62.8   * You can only have one transaction at any time.
    62.9   * Returns false on failure.
   62.10   */
   62.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree);
   62.12 +bool xs_transaction_start(struct xs_handle *h);
   62.13  
   62.14  /* End a transaction.
   62.15   * If abandon is true, transaction is discarded instead of committed.
   62.16 - * Returns false on failure, which indicates an error: transactions will
   62.17 - * not fail spuriously.
   62.18 + * Returns false on failure: if errno == EAGAIN, you have to restart
    62.19 + * the transaction.
   62.20   */
   62.21  bool xs_transaction_end(struct xs_handle *h, bool abort);
   62.22  
    63.1 --- a/tools/xenstore/xs_lib.c	Thu Sep 29 13:35:13 2005 -0600
    63.2 +++ b/tools/xenstore/xs_lib.c	Thu Sep 29 16:22:02 2005 -0600
    63.3 @@ -50,6 +50,13 @@ static const char *xs_daemon_path(void)
    63.4  	return buf;
    63.5  }
    63.6  
    63.7 +const char *xs_daemon_tdb(void)
    63.8 +{
    63.9 +	static char buf[PATH_MAX];
   63.10 +	sprintf(buf, "%s/tdb", xs_daemon_rootdir());
   63.11 +	return buf;
   63.12 +}
   63.13 +
   63.14  const char *xs_daemon_socket(void)
   63.15  {
   63.16  	return xs_daemon_path();
   63.17 @@ -66,24 +73,6 @@ const char *xs_daemon_socket_ro(void)
   63.18  	return buf;
   63.19  }
   63.20  
   63.21 -const char *xs_daemon_store(void)
   63.22 -{
   63.23 -	static char buf[PATH_MAX];
   63.24 -	if (snprintf(buf, PATH_MAX, "%s/store",
   63.25 -		     xs_daemon_rootdir()) >= PATH_MAX)
   63.26 -		return NULL;
   63.27 -	return buf;
   63.28 -}
   63.29 -
   63.30 -const char *xs_daemon_transactions(void)
   63.31 -{
   63.32 -	static char buf[PATH_MAX];
   63.33 -	if (snprintf(buf, PATH_MAX, "%s/transactions",
   63.34 -		     xs_daemon_rootdir()) >= PATH_MAX)
   63.35 -		return NULL;
   63.36 -	return buf;
   63.37 -}
   63.38 -
   63.39  const char *xs_domain_dev(void)
   63.40  {
   63.41  	char *s = getenv("XENSTORED_PATH");
    64.1 --- a/tools/xenstore/xs_lib.h	Thu Sep 29 13:35:13 2005 -0600
    64.2 +++ b/tools/xenstore/xs_lib.h	Thu Sep 29 16:22:02 2005 -0600
    64.3 @@ -36,7 +36,7 @@ enum xs_perm_type {
    64.4  
    64.5  struct xs_permissions
    64.6  {
    64.7 -	domid_t id;
    64.8 +	unsigned int id;
    64.9  	enum xs_perm_type perms;
   64.10  };
   64.11  
   64.12 @@ -46,9 +46,8 @@ struct xs_permissions
   64.13  /* Path for various daemon things: env vars can override. */
   64.14  const char *xs_daemon_socket(void);
   64.15  const char *xs_daemon_socket_ro(void);
   64.16 -const char *xs_daemon_store(void);
   64.17 -const char *xs_daemon_transactions(void);
   64.18  const char *xs_domain_dev(void);
   64.19 +const char *xs_daemon_tdb(void);
   64.20  
   64.21  /* Simple write function: loops for you. */
   64.22  bool xs_write_all(int fd, const void *data, unsigned int len);
    65.1 --- a/tools/xenstore/xs_random.c	Thu Sep 29 13:35:13 2005 -0600
    65.2 +++ b/tools/xenstore/xs_random.c	Thu Sep 29 16:22:02 2005 -0600
    65.3 @@ -41,7 +41,7 @@ struct ops
    65.4  			  struct xs_permissions *perms,
    65.5  			  unsigned int num);
    65.6  
    65.7 -	bool (*transaction_start)(void *h, const char *subtree);
    65.8 +	bool (*transaction_start)(void *h);
    65.9  	bool (*transaction_end)(void *h, bool abort);
   65.10  
   65.11  	/* Create and destroy a new handle. */
   65.12 @@ -53,7 +53,6 @@ struct file_ops_info
   65.13  {
   65.14  	const char *base;
   65.15  	char *transact_base;
   65.16 -	char *transact;
   65.17  };
   65.18  
   65.19  static void convert_to_dir(const char *dirname)
   65.20 @@ -96,31 +95,6 @@ static char *path_to_name(struct file_op
   65.21  	return filename;
   65.22  }
   65.23  
   65.24 -/* Is child a subnode of parent, or equal? */
   65.25 -static bool is_child(const char *child, const char *parent)
   65.26 -{
   65.27 -	unsigned int len = strlen(parent);
   65.28 -
   65.29 -	/* / should really be "" for this algorithm to work, but that's a
   65.30 -	 * usability nightmare. */
   65.31 -	if (streq(parent, "/"))
   65.32 -		return true;
   65.33 -
   65.34 -	if (strncmp(child, parent, len) != 0)
   65.35 -		return false;
   65.36 -
   65.37 -	return child[len] == '/' || child[len] == '\0';
   65.38 -}
   65.39 -
   65.40 -static bool write_ok(struct file_ops_info *info, const char *path)
   65.41 -{
   65.42 -	if (info->transact && !is_child(path, info->transact)) {
   65.43 -		errno = EROFS;
   65.44 -		return false;
   65.45 -	}
   65.46 -	return true;
   65.47 -}	
   65.48 -
   65.49  static char **file_directory(struct file_ops_info *info,
   65.50  			     const char *path, unsigned int *num)
   65.51  {
   65.52 @@ -184,8 +158,10 @@ static void *file_read(struct file_ops_i
   65.53  
   65.54  	ret = grab_file(filename, &size);
   65.55  	/* Directory exists, .DATA doesn't. */
   65.56 -	if (!ret && errno == ENOENT && strends(filename, ".DATA"))
   65.57 -		errno = EISDIR;
   65.58 +	if (!ret && errno == ENOENT && strends(filename, ".DATA")) {
   65.59 +		ret = strdup("");
   65.60 +		size = 0;
   65.61 +	}
   65.62  	*len = size;
   65.63  	return ret;
   65.64  }
   65.65 @@ -270,9 +246,6 @@ static bool file_set_perms(struct file_o
   65.66  		return false;
   65.67  	}
   65.68  
   65.69 -	if (!write_ok(info, path))
   65.70 -		return false;
   65.71 -
   65.72  	/* Check non-perm file exists/ */
   65.73  	if (lstat(filename, &st) != 0)
   65.74  		return false;
   65.75 @@ -338,9 +311,6 @@ static bool file_write(struct file_ops_i
   65.76  	char *filename = filename_to_data(path_to_name(info, path));
   65.77  	int fd;
   65.78  
   65.79 -	if (!write_ok(info, path))
   65.80 -		return false;
   65.81 -
   65.82  	make_dirs(parent_filename(filename));
   65.83  	fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0600);
   65.84  	if (fd < 0)
   65.85 @@ -358,9 +328,6 @@ static bool file_mkdir(struct file_ops_i
   65.86  {
   65.87  	char *dirname = path_to_name(info, path);
   65.88  
   65.89 -	if (!write_ok(info, path))
   65.90 -		return false;
   65.91 -
   65.92  	make_dirs(parent_filename(dirname));
   65.93  	if (mkdir(dirname, 0700) != 0)
   65.94  		return (errno == EEXIST);
   65.95 @@ -374,20 +341,12 @@ static bool file_rm(struct file_ops_info
   65.96  	char *filename = path_to_name(info, path);
   65.97  	struct stat st;
   65.98  
   65.99 -	if (info->transact && streq(info->transact, path)) {
  65.100 -		errno = EINVAL;
  65.101 -		return false;
  65.102 -	}
  65.103 -
  65.104  	if (lstat(filename, &st) != 0) {
  65.105  		if (lstat(parent_filename(filename), &st) != 0)
  65.106  			return false;
  65.107  		return true;
  65.108  	}
  65.109  
  65.110 -	if (!write_ok(info, path))
  65.111 -		return false;
  65.112 -
  65.113  	if (streq(path, "/")) {
  65.114  		errno = EINVAL;
  65.115  		return false;
  65.116 @@ -398,28 +357,20 @@ static bool file_rm(struct file_ops_info
  65.117  	return true;
  65.118  }
  65.119  
  65.120 -static bool file_transaction_start(struct file_ops_info *info,
  65.121 -				   const char *subtree)
  65.122 +static bool file_transaction_start(struct file_ops_info *info)
  65.123  {
  65.124  	char *cmd;
  65.125 -	char *filename = path_to_name(info, subtree);
  65.126 -	struct stat st;
  65.127  
  65.128 -	if (info->transact) {
  65.129 +	if (info->transact_base) {
  65.130  		errno = EBUSY;
  65.131  		return false;
  65.132  	}
  65.133  
  65.134 -	if (lstat(filename, &st) != 0)
  65.135 -		return false;
  65.136 -
  65.137 -	cmd = talloc_asprintf(NULL, "cp -r %s %s.transact",
  65.138 -			      info->base, info->base);
  65.139 +	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  65.140 +	cmd = talloc_asprintf(NULL, "cp -r %s %s",
  65.141 +			      info->base, info->transact_base);
  65.142  	do_command(cmd);
  65.143  	talloc_free(cmd);
  65.144 -
  65.145 -	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  65.146 -	info->transact = talloc_strdup(NULL, subtree);
  65.147  	return true;
  65.148  }
  65.149  
  65.150 @@ -427,7 +378,7 @@ static bool file_transaction_end(struct 
  65.151  {
  65.152  	char *old, *cmd;
  65.153  
  65.154 -	if (!info->transact) {
  65.155 +	if (!info->transact_base) {
  65.156  		errno = ENOENT;
  65.157  		return false;
  65.158  	}
  65.159 @@ -448,9 +399,7 @@ static bool file_transaction_end(struct 
  65.160  
  65.161  success:
  65.162  	talloc_free(cmd);
  65.163 -	talloc_free(info->transact);
  65.164  	talloc_free(info->transact_base);
  65.165 -	info->transact = NULL;
  65.166  	info->transact_base = NULL;
  65.167  	return true;
  65.168  }
  65.169 @@ -461,7 +410,6 @@ static struct file_ops_info *file_handle
  65.170  
  65.171  	info->base = dir;
  65.172  	info->transact_base = NULL;
  65.173 -	info->transact = NULL;
  65.174  	return info;
  65.175  }
  65.176  
  65.177 @@ -898,11 +846,10 @@ static char *do_next_op(struct ops *ops,
  65.178  	case 7: {
  65.179  		if (verbose)
  65.180  			printf("START %s\n", name);
  65.181 -		ret = bool_to_errstring(ops->transaction_start(h, name));
  65.182 +		ret = bool_to_errstring(ops->transaction_start(h));
  65.183  		if (streq(ret, "OK")) {
  65.184  			talloc_free(ret);
  65.185 -			ret = talloc_asprintf(NULL, "OK:START-TRANSACT:%s",
  65.186 -					      name);
  65.187 +			ret = talloc_asprintf(NULL, "OK:START-TRANSACT");
  65.188  		}
  65.189  
  65.190  		break;
  65.191 @@ -978,6 +925,8 @@ static void setup_file_ops(const char *d
  65.192  		barf_perror("Creating directory %s/tool", dir);
  65.193  	if (!file_set_perms(h, talloc_strdup(h, "/"), &perm, 1))
  65.194  		barf_perror("Setting root perms in %s", dir);
  65.195 +	if (!file_set_perms(h, talloc_strdup(h, "/tool"), &perm, 1))
  65.196 +		barf_perror("Setting root perms in %s/tool", dir);
  65.197  	file_close(h);
  65.198  }
  65.199  
  65.200 @@ -1071,7 +1020,7 @@ static unsigned int try_simple(const boo
  65.201  			goto out;
  65.202  
  65.203  		if (!data->fast) {
  65.204 -			if (strstarts(ret, "OK:START-TRANSACT:")) {
  65.205 +			if (streq(ret, "OK:START-TRANSACT")) {
  65.206  				void *pre = data->ops->handle(data->dir);
  65.207  
  65.208  				snapshot = dump(data->ops, pre);
  65.209 @@ -1303,7 +1252,7 @@ static unsigned int try_diff(const bool 
  65.210  			     void *_data)
  65.211  {
  65.212  	void *fileh, *xsh;
  65.213 -	char *transact = NULL;
  65.214 +	bool transact = false;
  65.215  	struct ops *fail;
  65.216  	struct diff_data *data = _data;
  65.217  	unsigned int i, print;
  65.218 @@ -1348,13 +1297,9 @@ static unsigned int try_diff(const bool 
  65.219  			goto out;
  65.220  
  65.221  		if (strstarts(file, "OK:START-TRANSACT:"))
  65.222 -			transact = talloc_strdup(NULL,
  65.223 -						 file +
  65.224 -						 strlen("OK:START-TRANSACT:"));
  65.225 -		else if (streq(file, "OK:STOP-TRANSACT")) {
  65.226 -			talloc_free(transact);
  65.227 -			transact = NULL;
  65.228 -		}
  65.229 +			transact = true;
  65.230 +		else if (streq(file, "OK:STOP-TRANSACT"))
  65.231 +			transact = false;
  65.232  
  65.233  		talloc_free(file);
  65.234  		talloc_free(xs);
  65.235 @@ -1379,7 +1324,7 @@ static unsigned int try_diff(const bool 
  65.236  
  65.237  			fail = NULL;
  65.238  			if (!ops_equal(&xs_ops, xsh_pre, &file_ops, fileh_pre,
  65.239 -				       transact, &fail)) {
  65.240 +				       "/", &fail)) {
  65.241  				if (fail)
  65.242  					barf("%s failed during transact\n",
  65.243  					     fail->name);
  65.244 @@ -1456,9 +1401,6 @@ static unsigned int try_fail(const bool 
  65.245  	fileh = file_handle(data->dir);
  65.246  	xsh = xs_handle(data->dir);
  65.247  
  65.248 -	sprintf(seed, "%i", data->seed);
  65.249 -	free(xs_debug_command(xsh, "failtest", seed, strlen(seed)+1));
  65.250 -
  65.251  	print = number / 76;
  65.252  	if (!print)
  65.253  		print = 1;
  65.254 @@ -1491,8 +1433,12 @@ static unsigned int try_fail(const bool 
  65.255  		if (trymap && !trymap[i])
  65.256  			continue;
  65.257  
  65.258 +		/* Turn on failure. */
  65.259 +		sprintf(seed, "%i", data->seed + i);
  65.260 +		free(xs_debug_command(xsh, "failtest",seed,strlen(seed)+1));
  65.261 +
  65.262  		if (verbose)
  65.263 -			printf("(%i) ", i);
  65.264 +			printf("(%i) seed %s ", i, seed);
  65.265  		ret = do_next_op(&xs_ops, xsh, i + data->seed, verbose);
  65.266  		if (streq(ret, "FAILED:Connection reset by peer")
  65.267  		    || streq(ret, "FAILED:Bad file descriptor")
  65.268 @@ -1549,8 +1495,6 @@ static unsigned int try_fail(const bool 
  65.269  		fail = NULL;
  65.270  		if (!ops_equal(&xs_ops, tmpxsh, &file_ops, tmpfileh, "/",
  65.271  			       &fail)) {
  65.272 -			xs_close(tmpxsh);
  65.273 -			file_close(tmpfileh);
  65.274  			if (fail) {
  65.275  				if (verbose)
  65.276  					printf("%s failed\n", fail->name);
  65.277 @@ -1561,10 +1505,16 @@ static unsigned int try_fail(const bool 
  65.278  				failed = 0;
  65.279  				if (verbose)
  65.280  					printf("(Looks like it succeeded)\n");
  65.281 +				xs_close(tmpxsh);
  65.282 +				file_close(tmpfileh);
  65.283  				goto try_applying;
  65.284  			}
  65.285  			if (verbose)
  65.286 -				printf("Two backends not equal\n");
  65.287 +				printf("Trees differ:\nXS:%s\nFILE:%s\n",
  65.288 +				       dump(&xs_ops, tmpxsh),
  65.289 +				       dump(&file_ops, tmpfileh));
  65.290 +			xs_close(tmpxsh);
  65.291 +			file_close(tmpfileh);
  65.292  			goto out;
  65.293  		}
  65.294  
  65.295 @@ -1572,8 +1522,6 @@ static unsigned int try_fail(const bool 
  65.296  		if (!xsh)
  65.297  			file_transaction_end(fileh, true);
  65.298  
  65.299 -		/* Turn failures back on. */
  65.300 -		free(xs_debug_command(tmpxsh, "failtest",  NULL, 0));
  65.301  		xs_close(tmpxsh);
  65.302  		file_close(tmpfileh);
  65.303  	}
    66.1 --- a/tools/xenstore/xs_stress.c	Thu Sep 29 13:35:13 2005 -0600
    66.2 +++ b/tools/xenstore/xs_stress.c	Thu Sep 29 16:22:02 2005 -0600
    66.3 @@ -8,6 +8,7 @@
    66.4  #include <sys/stat.h>
    66.5  #include <fcntl.h>
    66.6  #include <unistd.h>
    66.7 +#include <errno.h>
    66.8  
    66.9  #define NUM_HANDLES 2
   66.10  #define DIR_FANOUT 3
   66.11 @@ -36,24 +37,18 @@ static void work(unsigned int cycles, un
   66.12  
   66.13  	srandom(childnum);
   66.14  	for (i = 0; i < cycles; i++) {
   66.15 -		unsigned int lockdepth, j, len;
   66.16 -		char file[100] = "", lockdir[100];
   66.17 +		unsigned int j, len;
   66.18 +		char file[100] = "";
   66.19  		char *contents, tmp[100];
   66.20  		struct xs_handle *h = handles[random() % NUM_HANDLES];
   66.21  
   66.22 -		lockdepth = random() % DIR_DEPTH;
   66.23 -		for (j = 0; j < DIR_DEPTH; j++) {
   66.24 -			if (j == lockdepth)
   66.25 -				strcpy(lockdir, file);
   66.26 +		for (j = 0; j < DIR_DEPTH; j++)
   66.27  			sprintf(file + strlen(file), "/%li",
   66.28  				random()%DIR_FANOUT);
   66.29 -		}
   66.30 -		if (streq(lockdir, ""))
   66.31 -			strcpy(lockdir, "/");
   66.32  
   66.33 -		if (!xs_transaction_start(h, lockdir))
   66.34 -			barf_perror("%i: starting transaction %i on %s",
   66.35 -				    childnum, i, lockdir);
   66.36 +		if (!xs_transaction_start(h))
   66.37 +			barf_perror("%i: starting transaction %i",
   66.38 +				    childnum, i);
   66.39  
   66.40  		sprintf(file + strlen(file), "/count");
   66.41  		contents = xs_read(h, file, &len);
   66.42 @@ -68,18 +63,23 @@ static void work(unsigned int cycles, un
   66.43  		/* Abandon 1 in 10 */
   66.44  		if (random() % 10 == 0) {
   66.45  			if (!xs_transaction_end(h, true))
   66.46 -				barf_perror("%i: can't abort transact %s",
   66.47 -					    childnum, lockdir);
   66.48 +				barf_perror("%i: can't abort transact",
   66.49 +					    childnum);
   66.50  			i--;
   66.51  		} else {
   66.52 -			if (!xs_transaction_end(h, false))
   66.53 -				barf_perror("%i: can't commit transact %s",
   66.54 -					    childnum, lockdir);
   66.55 -
   66.56 -			/* Offset when we print . so kids don't all
   66.57 -			 * print at once. */
   66.58 -			if ((i + print/(childnum+1)) % print == 0)
   66.59 -				write(STDOUT_FILENO, &id, 1);
   66.60 +			if (!xs_transaction_end(h, false)) {
   66.61 +				if (errno == EAGAIN) {
   66.62 +					write(STDOUT_FILENO, "!", 1);
   66.63 +					i--;
   66.64 +				} else
   66.65 +					barf_perror("%i: can't commit trans",
   66.66 +						    childnum);
   66.67 +			} else {
   66.68 +				/* Offset when we print . so kids don't all
   66.69 +				 * print at once. */
   66.70 +				if ((i + print/(childnum+1)) % print == 0)
   66.71 +					write(STDOUT_FILENO, &id, 1);
   66.72 +			}
   66.73  		}
   66.74  	}
   66.75  }
   66.76 @@ -201,7 +201,7 @@ int main(int argc, char *argv[])
   66.77  	printf("\nCounting results...\n");
   66.78  	i = tally_counts();
   66.79  	if (i != (unsigned)atoi(argv[1]))
   66.80 -		barf("Total counts %i not %s", i, atoi(argv[1]));
   66.81 +		barf("Total counts %i not %s", i, argv[1]);
   66.82  	printf("Success!\n");
   66.83  	exit(0);
   66.84  }
    67.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    67.2 +++ b/tools/xenstore/xs_tdb_dump.c	Thu Sep 29 16:22:02 2005 -0600
    67.3 @@ -0,0 +1,82 @@
    67.4 +/* Simple program to dump out all records of TDB */
    67.5 +#include <stdint.h>
    67.6 +#include <stdlib.h>
    67.7 +#include <fcntl.h>
    67.8 +#include <stdio.h>
    67.9 +#include <stdarg.h>
   67.10 +
   67.11 +#include "xs_lib.h"
   67.12 +#include "tdb.h"
   67.13 +#include "talloc.h"
   67.14 +#include "utils.h"
   67.15 +
   67.16 +struct record_hdr {
   67.17 +	u32 num_perms;
   67.18 +	u32 datalen;
   67.19 +	u32 childlen;
   67.20 +	struct xs_permissions perms[0];
   67.21 +};
   67.22 +
   67.23 +static u32 total_size(struct record_hdr *hdr)
   67.24 +{
   67.25 +	return sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) 
   67.26 +		+ hdr->datalen + hdr->childlen;
   67.27 +}
   67.28 +
   67.29 +static char perm_to_char(enum xs_perm_type perm)
   67.30 +{
   67.31 +	return perm == XS_PERM_READ ? 'r' :
   67.32 +		perm == XS_PERM_WRITE ? 'w' :
   67.33 +		perm == XS_PERM_NONE ? '-' :
   67.34 +		perm == (XS_PERM_READ|XS_PERM_WRITE) ? 'b' :
   67.35 +		'?';
   67.36 +}
   67.37 +
   67.38 +int main(int argc, char *argv[])
   67.39 +{
   67.40 +	TDB_DATA key;
   67.41 +	TDB_CONTEXT *tdb;
   67.42 +
   67.43 +	if (argc != 2)
   67.44 +		barf("Usage: xs_tdb_dump <tdbfile>");
   67.45 +
   67.46 +	tdb = tdb_open(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0);
   67.47 +	if (!tdb)
   67.48 +		barf_perror("Could not open %s", argv[1]);
   67.49 +
   67.50 +	key = tdb_firstkey(tdb);
   67.51 +	while (key.dptr) {
   67.52 +		TDB_DATA data;
   67.53 +		struct record_hdr *hdr;
   67.54 +
   67.55 +		data = tdb_fetch(tdb, key);
   67.56 +		hdr = (void *)data.dptr;
   67.57 +		if (data.dsize < sizeof(*hdr))
   67.58 +			fprintf(stderr, "%.*s: BAD truncated\n",
   67.59 +				key.dsize, key.dptr);
   67.60 +		else if (data.dsize != total_size(hdr))
   67.61 +			fprintf(stderr, "%.*s: BAD length %i for %i/%i/%i (%i)\n",
   67.62 +				key.dsize, key.dptr, data.dsize,
   67.63 +				hdr->num_perms, hdr->datalen,
   67.64 +				hdr->childlen, total_size(hdr));
   67.65 +		else {
   67.66 +			unsigned int i;
   67.67 +			char *p;
   67.68 +
   67.69 +			printf("%.*s: ", key.dsize, key.dptr);
   67.70 +			for (i = 0; i < hdr->num_perms; i++)
   67.71 +				printf("%s%c%i",
   67.72 +				       i == 0 ? "" : ",",
   67.73 +				       perm_to_char(hdr->perms[i].perms),
   67.74 +				       hdr->perms[i].id);
   67.75 +			p = (void *)&hdr->perms[hdr->num_perms];
   67.76 +			printf(" %.*s\n", hdr->datalen, p);
   67.77 +			p += hdr->datalen;
   67.78 +			for (i = 0; i < hdr->childlen; i += strlen(p+i)+1)
   67.79 +				printf("\t-> %s\n", p+i);
   67.80 +		}
   67.81 +		key = tdb_nextkey(tdb, key);
   67.82 +	}
   67.83 +	return 0;
   67.84 +}
   67.85 +
    68.1 --- a/tools/xenstore/xs_test.c	Thu Sep 29 13:35:13 2005 -0600
    68.2 +++ b/tools/xenstore/xs_test.c	Thu Sep 29 16:22:02 2005 -0600
    68.3 @@ -562,9 +562,9 @@ static void do_unwatch(unsigned int hand
    68.4  		failed(handle);
    68.5  }
    68.6  
    68.7 -static void do_start(unsigned int handle, const char *node)
    68.8 +static void do_start(unsigned int handle)
    68.9  {
   68.10 -	if (!xs_transaction_start(handles[handle], node))
   68.11 +	if (!xs_transaction_start(handles[handle]))
   68.12  		failed(handle);
   68.13  }
   68.14  
   68.15 @@ -791,7 +791,7 @@ static void do_command(unsigned int defa
   68.16  		xs_daemon_close(handles[handle]);
   68.17  		handles[handle] = NULL;
   68.18  	} else if (streq(command, "start"))
   68.19 -		do_start(handle, arg(line, 1));
   68.20 +		do_start(handle);
   68.21  	else if (streq(command, "commit"))
   68.22  		do_end(handle, false);
   68.23  	else if (streq(command, "abort"))
    69.1 --- a/xen/arch/x86/mm.c	Thu Sep 29 13:35:13 2005 -0600
    69.2 +++ b/xen/arch/x86/mm.c	Thu Sep 29 16:22:02 2005 -0600
    69.3 @@ -2273,8 +2273,7 @@ int do_mmu_update(
    69.4  
    69.5  
    69.6  int update_grant_pte_mapping(
    69.7 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
    69.8 -    struct domain *d, struct vcpu *v)
    69.9 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
   69.10  {
   69.11      int rc = GNTST_okay;
   69.12      void *va;
   69.13 @@ -2282,6 +2281,7 @@ int update_grant_pte_mapping(
   69.14      struct pfn_info *page;
   69.15      u32 type_info;
   69.16      l1_pgentry_t ol1e;
   69.17 +    struct domain *d = v->domain;
   69.18  
   69.19      ASSERT(spin_is_locked(&d->big_lock));
   69.20      ASSERT(!shadow_mode_refcounts(d));
   69.21 @@ -2319,8 +2319,6 @@ int update_grant_pte_mapping(
   69.22  
   69.23      put_page_from_l1e(ol1e, d);
   69.24  
   69.25 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_all : GNTST_okay;
   69.26 -
   69.27      if ( unlikely(shadow_mode_enabled(d)) )
   69.28      {
   69.29          struct domain_mmap_cache sh_mapcache;
   69.30 @@ -2415,10 +2413,10 @@ int clear_grant_pte_mapping(
   69.31  
   69.32  
   69.33  int update_grant_va_mapping(
   69.34 -    unsigned long va, l1_pgentry_t _nl1e, struct domain *d, struct vcpu *v)
   69.35 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
   69.36  {
   69.37 -    int rc = GNTST_okay;
   69.38      l1_pgentry_t *pl1e, ol1e;
   69.39 +    struct domain *d = v->domain;
   69.40      
   69.41      ASSERT(spin_is_locked(&d->big_lock));
   69.42      ASSERT(!shadow_mode_refcounts(d));
   69.43 @@ -2439,12 +2437,10 @@ int update_grant_va_mapping(
   69.44  
   69.45      put_page_from_l1e(ol1e, d);
   69.46  
   69.47 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_one : GNTST_okay;
   69.48 -
   69.49      if ( unlikely(shadow_mode_enabled(d)) )
   69.50          shadow_do_update_va_mapping(va, _nl1e, v);
   69.51  
   69.52 -    return rc;
   69.53 +    return GNTST_okay;
   69.54  }
   69.55  
   69.56  int clear_grant_va_mapping(unsigned long addr, unsigned long frame)
    70.1 --- a/xen/arch/x86/vmx_vmcs.c	Thu Sep 29 13:35:13 2005 -0600
    70.2 +++ b/xen/arch/x86/vmx_vmcs.c	Thu Sep 29 16:22:02 2005 -0600
    70.3 @@ -37,19 +37,19 @@
    70.4  #endif
    70.5  #ifdef CONFIG_VMX
    70.6  
    70.7 -struct vmcs_struct *alloc_vmcs(void) 
    70.8 +struct vmcs_struct *alloc_vmcs(void)
    70.9  {
   70.10      struct vmcs_struct *vmcs;
   70.11      u32 vmx_msr_low, vmx_msr_high;
   70.12  
   70.13      rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
   70.14      vmcs_size = vmx_msr_high & 0x1fff;
   70.15 -    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size)); 
   70.16 +    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size));
   70.17      memset((char *)vmcs, 0, vmcs_size); /* don't remove this */
   70.18  
   70.19      vmcs->vmcs_revision_id = vmx_msr_low;
   70.20      return vmcs;
   70.21 -} 
   70.22 +}
   70.23  
   70.24  void free_vmcs(struct vmcs_struct *vmcs)
   70.25  {
   70.26 @@ -65,7 +65,7 @@ static inline int construct_vmcs_control
   70.27      void *io_bitmap_a;
   70.28      void *io_bitmap_b;
   70.29  
   70.30 -    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, 
   70.31 +    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
   70.32                         MONITOR_PIN_BASED_EXEC_CONTROLS);
   70.33  
   70.34      error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
   70.35 @@ -73,8 +73,8 @@ static inline int construct_vmcs_control
   70.36      error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
   70.37  
   70.38      /* need to use 0x1000 instead of PAGE_SIZE */
   70.39 -    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   70.40 -    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   70.41 +    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   70.42 +    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   70.43      memset(io_bitmap_a, 0xff, 0x1000);
   70.44      /* don't bother debug port access */
   70.45      clear_bit(PC_DEBUG_PORT, io_bitmap_a);
   70.46 @@ -89,8 +89,10 @@ static inline int construct_vmcs_control
   70.47      return error;
   70.48  }
   70.49  
   70.50 -#define GUEST_SEGMENT_LIMIT     0xffffffff      
   70.51 -#define HOST_SEGMENT_LIMIT      0xffffffff      
   70.52 +#define GUEST_LAUNCH_DS         0x08
   70.53 +#define GUEST_LAUNCH_CS         0x10
   70.54 +#define GUEST_SEGMENT_LIMIT     0xffffffff
   70.55 +#define HOST_SEGMENT_LIMIT      0xffffffff
   70.56  
   70.57  struct host_execution_env {
   70.58      /* selectors */
   70.59 @@ -110,72 +112,76 @@ struct host_execution_env {
   70.60      unsigned long tr_base;
   70.61      unsigned long ds_base;
   70.62      unsigned long cs_base;
   70.63 -#ifdef __x86_64__ 
   70.64 -    unsigned long fs_base; 
   70.65 -    unsigned long gs_base; 
   70.66 -#endif 
   70.67 +#ifdef __x86_64__
   70.68 +    unsigned long fs_base;
   70.69 +    unsigned long gs_base;
   70.70 +#endif
   70.71  };
   70.72  
   70.73 -#define round_pgdown(_p) ((_p)&PAGE_MASK) /* coped from domain.c */
   70.74 -
   70.75 -int vmx_setup_platform(struct vcpu *d, struct cpu_user_regs *regs)
   70.76 +static void vmx_setup_platform(struct vcpu *v, struct cpu_user_regs *regs)
   70.77  {
   70.78      int i;
   70.79 -    unsigned int n;
   70.80 -    unsigned long *p, mpfn, offset, addr;
   70.81 -    struct e820entry *e820p;
   70.82 +    unsigned char e820_map_nr;
   70.83 +    struct e820entry *e820entry;
   70.84 +    unsigned char *p;
   70.85 +    unsigned long mpfn;
   70.86      unsigned long gpfn = 0;
   70.87  
   70.88      local_flush_tlb_pge();
   70.89 -    regs->ebx = 0;   /* Linux expects ebx to be 0 for boot proc */
   70.90  
   70.91 -    n = regs->ecx;
   70.92 -    if (n > 32) {
   70.93 -        VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d", n);
   70.94 -        return -1;
   70.95 +    mpfn = get_mfn_from_pfn(E820_MAP_PAGE >> PAGE_SHIFT);
   70.96 +    if (mpfn == INVALID_MFN) {
   70.97 +        printk("Can not find E820 memory map page for VMX domain.\n");
   70.98 +        domain_crash();
   70.99      }
  70.100  
  70.101 -    addr = regs->edi;
  70.102 -    offset = (addr & ~PAGE_MASK);
  70.103 -    addr = round_pgdown(addr);
  70.104 -
  70.105 -    mpfn = get_mfn_from_pfn(addr >> PAGE_SHIFT);
  70.106      p = map_domain_page(mpfn);
  70.107 -
  70.108 -    e820p = (struct e820entry *) ((unsigned long) p + offset); 
  70.109 +    if (p == NULL) {
  70.110 +        printk("Can not map E820 memory map page for VMX domain.\n");
  70.111 +        domain_crash();
  70.112 +    }
  70.113  
  70.114 -#ifndef NDEBUG
  70.115 -    print_e820_memory_map(e820p, n);
  70.116 -#endif
  70.117 +    e820_map_nr = *(p + E820_MAP_NR_OFFSET);
  70.118 +    e820entry = (struct e820entry *)(p + E820_MAP_OFFSET);
  70.119  
  70.120 -    for ( i = 0; i < n; i++ )
  70.121 +    for ( i = 0; i < e820_map_nr; i++ )
  70.122      {
  70.123 -        if ( e820p[i].type == E820_SHARED_PAGE )
  70.124 +        if (e820entry[i].type == E820_SHARED_PAGE)
  70.125          {
  70.126 -            gpfn = (e820p[i].addr >> PAGE_SHIFT);
  70.127 +            gpfn = (e820entry[i].addr >> PAGE_SHIFT);
  70.128              break;
  70.129          }
  70.130      }
  70.131  
  70.132 -    if ( gpfn == 0 )
  70.133 -    {
  70.134 -        unmap_domain_page(p);        
  70.135 -        return -1;
  70.136 -    }   
  70.137 +    if ( gpfn == 0 ) {
  70.138 +        printk("Can not get io request shared page"
  70.139 +               " from E820 memory map for VMX domain.\n");
  70.140 +        unmap_domain_page(p);
  70.141 +        domain_crash();
  70.142 +    }
  70.143 +    unmap_domain_page(p);
  70.144  
  70.145 -    unmap_domain_page(p);        
  70.146 +    if (v->vcpu_id)
  70.147 +        return;
  70.148  
  70.149      /* Initialise shared page */
  70.150      mpfn = get_mfn_from_pfn(gpfn);
  70.151 -    p = map_domain_page(mpfn);
  70.152 -    d->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  70.153 +    if (mpfn == INVALID_MFN) {
  70.154 +        printk("Can not find io request shared page for VMX domain.\n");
  70.155 +        domain_crash();
  70.156 +    }
  70.157  
  70.158 -    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(d->domain));
  70.159 +    p = map_domain_page(mpfn);
  70.160 +    if (p == NULL) {
  70.161 +        printk("Can not map io request shared page for VMX domain.\n");
  70.162 +        domain_crash();
  70.163 +    }
  70.164 +    v->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  70.165  
  70.166 -    clear_bit(iopacket_port(d->domain), 
  70.167 -              &d->domain->shared_info->evtchn_mask[0]);
  70.168 +    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(v->domain));
  70.169  
  70.170 -    return 0;
  70.171 +    clear_bit(iopacket_port(v->domain),
  70.172 +              &v->domain->shared_info->evtchn_mask[0]);
  70.173  }
  70.174  
  70.175  void vmx_set_host_env(struct vcpu *v)
  70.176 @@ -203,7 +209,7 @@ void vmx_set_host_env(struct vcpu *v)
  70.177      error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
  70.178  }
  70.179  
  70.180 -void vmx_do_launch(struct vcpu *v) 
  70.181 +void vmx_do_launch(struct vcpu *v)
  70.182  {
  70.183  /* Update CR3, GDT, LDT, TR */
  70.184      unsigned int  error = 0;
  70.185 @@ -217,7 +223,7 @@ void vmx_do_launch(struct vcpu *v)
  70.186      error |= __vmwrite(GUEST_CR0, cr0);
  70.187      cr0 &= ~X86_CR0_PG;
  70.188      error |= __vmwrite(CR0_READ_SHADOW, cr0);
  70.189 -    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
  70.190 +    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
  70.191                         MONITOR_CPU_BASED_EXEC_CONTROLS);
  70.192  
  70.193      __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (cr4) : );
  70.194 @@ -247,7 +253,7 @@ void vmx_do_launch(struct vcpu *v)
  70.195      error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
  70.196      error |= __vmwrite(GUEST_LDTR_BASE, 0);
  70.197      error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
  70.198 -        
  70.199 +
  70.200      error |= __vmwrite(GUEST_TR_BASE, 0);
  70.201      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
  70.202  
  70.203 @@ -261,10 +267,8 @@ void vmx_do_launch(struct vcpu *v)
  70.204  /*
  70.205   * Initially set the same environment as host.
  70.206   */
  70.207 -static inline int 
  70.208 -construct_init_vmcs_guest(struct cpu_user_regs *regs, 
  70.209 -                          struct vcpu_guest_context *ctxt,
  70.210 -                          struct host_execution_env *host_env)
  70.211 +static inline int
  70.212 +construct_init_vmcs_guest(struct cpu_user_regs *regs)
  70.213  {
  70.214      int error = 0;
  70.215      union vmcs_arbytes arbytes;
  70.216 @@ -292,31 +296,37 @@ construct_init_vmcs_guest(struct cpu_use
  70.217      error |= __vmwrite(CR3_TARGET_COUNT, 0);
  70.218  
  70.219      /* Guest Selectors */
  70.220 -    error |= __vmwrite(GUEST_CS_SELECTOR, regs->cs);
  70.221 -    error |= __vmwrite(GUEST_ES_SELECTOR, regs->es);
  70.222 -    error |= __vmwrite(GUEST_SS_SELECTOR, regs->ss);
  70.223 -    error |= __vmwrite(GUEST_DS_SELECTOR, regs->ds);
  70.224 -    error |= __vmwrite(GUEST_FS_SELECTOR, regs->fs);
  70.225 -    error |= __vmwrite(GUEST_GS_SELECTOR, regs->gs);
  70.226 +    error |= __vmwrite(GUEST_ES_SELECTOR, GUEST_LAUNCH_DS);
  70.227 +    error |= __vmwrite(GUEST_SS_SELECTOR, GUEST_LAUNCH_DS);
  70.228 +    error |= __vmwrite(GUEST_DS_SELECTOR, GUEST_LAUNCH_DS);
  70.229 +    error |= __vmwrite(GUEST_FS_SELECTOR, GUEST_LAUNCH_DS);
  70.230 +    error |= __vmwrite(GUEST_GS_SELECTOR, GUEST_LAUNCH_DS);
  70.231 +    error |= __vmwrite(GUEST_CS_SELECTOR, GUEST_LAUNCH_CS);
  70.232 +
  70.233 +    /* Guest segment bases */
  70.234 +    error |= __vmwrite(GUEST_ES_BASE, 0);
  70.235 +    error |= __vmwrite(GUEST_SS_BASE, 0);
  70.236 +    error |= __vmwrite(GUEST_DS_BASE, 0);
  70.237 +    error |= __vmwrite(GUEST_FS_BASE, 0);
  70.238 +    error |= __vmwrite(GUEST_GS_BASE, 0);
  70.239 +    error |= __vmwrite(GUEST_CS_BASE, 0);
  70.240  
  70.241      /* Guest segment Limits */
  70.242 -    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.243      error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT);
  70.244      error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.245      error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.246      error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.247      error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.248 +    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  70.249  
  70.250 -    error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit);
  70.251 -
  70.252 -    /* AR bytes */
  70.253 +    /* Guest segment AR bytes */
  70.254      arbytes.bytes = 0;
  70.255      arbytes.fields.seg_type = 0x3;          /* type = 3 */
  70.256      arbytes.fields.s = 1;                   /* code or data, i.e. not system */
  70.257      arbytes.fields.dpl = 0;                 /* DPL = 0 */
  70.258      arbytes.fields.p = 1;                   /* segment present */
  70.259      arbytes.fields.default_ops_size = 1;    /* 32-bit */
  70.260 -    arbytes.fields.g = 1;   
  70.261 +    arbytes.fields.g = 1;
  70.262      arbytes.fields.null_bit = 0;            /* not null */
  70.263  
  70.264      error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
  70.265 @@ -328,35 +338,31 @@ construct_init_vmcs_guest(struct cpu_use
  70.266      arbytes.fields.seg_type = 0xb;          /* type = 0xb */
  70.267      error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
  70.268  
  70.269 -    error |= __vmwrite(GUEST_GDTR_BASE, regs->edx);
  70.270 -    regs->edx = 0;
  70.271 -    error |= __vmwrite(GUEST_GDTR_LIMIT, regs->eax);
  70.272 -    regs->eax = 0;
  70.273 +    /* Guest GDT */
  70.274 +    error |= __vmwrite(GUEST_GDTR_BASE, 0);
  70.275 +    error |= __vmwrite(GUEST_GDTR_LIMIT, 0);
  70.276  
  70.277 +    /* Guest IDT */
  70.278 +    error |= __vmwrite(GUEST_IDTR_BASE, 0);
  70.279 +    error |= __vmwrite(GUEST_IDTR_LIMIT, 0);
  70.280 +
  70.281 +    /* Guest LDT & TSS */
  70.282      arbytes.fields.s = 0;                   /* not code or data segment */
  70.283      arbytes.fields.seg_type = 0x2;          /* LDT */
  70.284      arbytes.fields.default_ops_size = 0;    /* 16-bit */
  70.285 -    arbytes.fields.g = 0;   
  70.286 +    arbytes.fields.g = 0;
  70.287      error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
  70.288  
  70.289      arbytes.fields.seg_type = 0xb;          /* 32-bit TSS (busy) */
  70.290      error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
  70.291      /* CR3 is set in vmx_final_setup_guest */
  70.292  
  70.293 -    error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
  70.294 -    error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
  70.295 -    error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
  70.296 -    error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
  70.297 -    error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
  70.298 -    error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
  70.299 -    error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
  70.300 -
  70.301 -    error |= __vmwrite(GUEST_RSP, regs->esp);
  70.302 +    error |= __vmwrite(GUEST_RSP, 0);
  70.303      error |= __vmwrite(GUEST_RIP, regs->eip);
  70.304  
  70.305 +    /* Guest EFLAGS */
  70.306      eflags = regs->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */
  70.307      eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */
  70.308 -
  70.309      error |= __vmwrite(GUEST_RFLAGS, eflags);
  70.310  
  70.311      error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
  70.312 @@ -381,14 +387,14 @@ static inline int construct_vmcs_host(st
  70.313  #if defined (__i386__)
  70.314      error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector);
  70.315      error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector);
  70.316 -    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base); 
  70.317 -    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base); 
  70.318 +    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base);
  70.319 +    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base);
  70.320  
  70.321  #else
  70.322 -    rdmsrl(MSR_FS_BASE, host_env->fs_base); 
  70.323 -    rdmsrl(MSR_GS_BASE, host_env->gs_base); 
  70.324 -    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base); 
  70.325 -    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base); 
  70.326 +    rdmsrl(MSR_FS_BASE, host_env->fs_base);
  70.327 +    rdmsrl(MSR_GS_BASE, host_env->gs_base);
  70.328 +    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base);
  70.329 +    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base);
  70.330  
  70.331  #endif
  70.332      host_env->cs_selector = __HYPERVISOR_CS;
  70.333 @@ -401,16 +407,16 @@ static inline int construct_vmcs_host(st
  70.334      error |= __vmwrite(HOST_CR0, crn); /* same CR0 */
  70.335  
  70.336      /* CR3 is set in vmx_final_setup_hostos */
  70.337 -    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : ); 
  70.338 +    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : );
  70.339      error |= __vmwrite(HOST_CR4, crn);
  70.340  
  70.341      error |= __vmwrite(HOST_RIP, (unsigned long) vmx_asm_vmexit_handler);
  70.342 -#ifdef __x86_64__ 
  70.343 -    /* TBD: support cr8 for 64-bit guest */ 
  70.344 -    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0); 
  70.345 -    __vmwrite(TPR_THRESHOLD, 0); 
  70.346 -    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0); 
  70.347 -#endif 
  70.348 +#ifdef __x86_64__
  70.349 +    /* TBD: support cr8 for 64-bit guest */
  70.350 +    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
  70.351 +    __vmwrite(TPR_THRESHOLD, 0);
  70.352 +    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
  70.353 +#endif
  70.354  
  70.355      return error;
  70.356  }
  70.357 @@ -440,37 +446,37 @@ int construct_vmcs(struct arch_vmx_struc
  70.358  
  70.359      if ((error = __vmpclear (vmcs_phys_ptr))) {
  70.360          printk("construct_vmcs: VMCLEAR failed\n");
  70.361 -        return -EINVAL;         
  70.362 +        return -EINVAL;
  70.363      }
  70.364      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  70.365          printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n",
  70.366                 (unsigned long) vmcs_phys_ptr);
  70.367 -        return -EINVAL; 
  70.368 +        return -EINVAL;
  70.369      }
  70.370      if ((error = construct_vmcs_controls(arch_vmx))) {
  70.371          printk("construct_vmcs: construct_vmcs_controls failed\n");
  70.372 -        return -EINVAL;         
  70.373 +        return -EINVAL;
  70.374      }
  70.375      /* host selectors */
  70.376      if ((error = construct_vmcs_host(&host_env))) {
  70.377          printk("construct_vmcs: construct_vmcs_host failed\n");
  70.378 -        return -EINVAL;         
  70.379 +        return -EINVAL;
  70.380      }
  70.381      /* guest selectors */
  70.382 -    if ((error = construct_init_vmcs_guest(regs, ctxt, &host_env))) {
  70.383 +    if ((error = construct_init_vmcs_guest(regs))) {
  70.384          printk("construct_vmcs: construct_vmcs_guest failed\n");
  70.385 -        return -EINVAL;         
  70.386 -    }       
  70.387 +        return -EINVAL;
  70.388 +    }
  70.389  
  70.390 -    if ((error |= __vmwrite(EXCEPTION_BITMAP, 
  70.391 +    if ((error |= __vmwrite(EXCEPTION_BITMAP,
  70.392                              MONITOR_DEFAULT_EXCEPTION_BITMAP))) {
  70.393          printk("construct_vmcs: setting Exception bitmap failed\n");
  70.394 -        return -EINVAL;         
  70.395 +        return -EINVAL;
  70.396      }
  70.397  
  70.398      if (regs->eflags & EF_TF)
  70.399          __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  70.400 -    else 
  70.401 +    else
  70.402          __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  70.403  
  70.404      return 0;
  70.405 @@ -491,7 +497,7 @@ int modify_vmcs(struct arch_vmx_struct *
  70.406      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  70.407          printk("modify_vmcs: load_vmcs failed: VMCS = %lx\n",
  70.408                 (unsigned long) vmcs_phys_ptr);
  70.409 -        return -EINVAL; 
  70.410 +        return -EINVAL;
  70.411      }
  70.412      load_cpu_user_regs(regs);
  70.413  
  70.414 @@ -500,23 +506,23 @@ int modify_vmcs(struct arch_vmx_struct *
  70.415      return 0;
  70.416  }
  70.417  
  70.418 -int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  70.419 +int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  70.420  {
  70.421      int error;
  70.422  
  70.423      if ((error = __vmptrld(phys_ptr))) {
  70.424 -        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  70.425 +        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  70.426          return error;
  70.427      }
  70.428 -    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  70.429 +    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  70.430      return 0;
  70.431  }
  70.432  
  70.433 -int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  70.434 +int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  70.435  {
  70.436      /* take the current VMCS */
  70.437      __vmptrst(phys_ptr);
  70.438 -    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  70.439 +    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  70.440      return 0;
  70.441  }
  70.442  
  70.443 @@ -536,7 +542,7 @@ void vm_resume_fail(unsigned long eflags
  70.444      __vmx_bug(guest_cpu_user_regs());
  70.445  }
  70.446  
  70.447 -void arch_vmx_do_resume(struct vcpu *v) 
  70.448 +void arch_vmx_do_resume(struct vcpu *v)
  70.449  {
  70.450      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  70.451  
  70.452 @@ -545,7 +551,7 @@ void arch_vmx_do_resume(struct vcpu *v)
  70.453      reset_stack_and_jump(vmx_asm_do_resume);
  70.454  }
  70.455  
  70.456 -void arch_vmx_do_launch(struct vcpu *v) 
  70.457 +void arch_vmx_do_launch(struct vcpu *v)
  70.458  {
  70.459      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  70.460  
    71.1 --- a/xen/common/grant_table.c	Thu Sep 29 13:35:13 2005 -0600
    71.2 +++ b/xen/common/grant_table.c	Thu Sep 29 16:22:02 2005 -0600
    71.3 @@ -24,10 +24,6 @@
    71.4   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    71.5   */
    71.6  
    71.7 -#define GRANT_DEBUG 0
    71.8 -#define GRANT_DEBUG_VERBOSE 0
    71.9 -
   71.10 -#include <xen/config.h>
   71.11  #include <xen/lib.h>
   71.12  #include <xen/sched.h>
   71.13  #include <xen/shadow.h>
   71.14 @@ -68,39 +64,32 @@ put_maptrack_handle(
   71.15      t->map_count--;
   71.16  }
   71.17  
   71.18 +/*
   71.19 + * Maps a single grant reference as described by @uop. The caller ignores
   71.20 + * the return value; results are reported to the guest through @uop.
   71.21 + *
   71.22 + * addr is _either_ a host virtual address, or the address of the pte to
   71.23 + * update, as indicated by the GNTMAP_contains_pte flag.
   71.24 + */
   71.25  static int
   71.26 -__gnttab_activate_grant_ref(
   71.27 -    struct domain   *mapping_d,          /* IN */
   71.28 -    struct vcpu     *mapping_ed,
   71.29 -    struct domain   *granting_d,
   71.30 -    grant_ref_t      ref,
   71.31 -    u16              dev_hst_ro_flags,
   71.32 -    u64              addr,
   71.33 -    unsigned long   *pframe )            /* OUT */
   71.34 +__gnttab_map_grant_ref(
   71.35 +    gnttab_map_grant_ref_t *uop)
   71.36  {
   71.37 -    domid_t               sdom;
   71.38 -    u16                   sflags;
   71.39 +    domid_t        dom;
   71.40 +    grant_ref_t    ref;
   71.41 +    struct domain *ld, *rd;
   71.42 +    struct vcpu   *led;
   71.43 +    u16            dev_hst_ro_flags;
   71.44 +    int            handle;
   71.45 +    u64            addr;
   71.46 +    unsigned long  frame = 0;
   71.47 +    int            rc = GNTST_okay;
   71.48      active_grant_entry_t *act;
   71.49 -    grant_entry_t        *sha;
   71.50 -    s16                   rc = 1;
   71.51 -    unsigned long         frame = 0;
   71.52 -    int                   retries = 0;
   71.53  
   71.54 -    /*
   71.55 -     * Objectives of this function:
   71.56 -     * . Make the record ( granting_d, ref ) active, if not already.
   71.57 -     * . Update shared grant entry of owner, indicating frame is mapped.
   71.58 -     * . Increment the owner act->pin reference counts.
   71.59 -     * . get_page on shared frame if new mapping.
   71.60 -     * . get_page_type if this is first RW mapping of frame.
   71.61 -     * . Add PTE to virtual address space of mapping_d, if necessary.
   71.62 -     * Returns:
   71.63 -     * .  -ve: error
   71.64 -     * .    1: ok
   71.65 -     * .    0: ok and TLB invalidate of host_addr needed.
   71.66 -     *
   71.67 -     * On success, *pframe contains mfn.
   71.68 -     */
   71.69 +    /* Entry details from @rd's shared grant table. */
   71.70 +    grant_entry_t *sha;
   71.71 +    domid_t        sdom;
   71.72 +    u16            sflags;
   71.73  
   71.74      /*
   71.75       * We bound the number of times we retry CMPXCHG on memory locations that
   71.76 @@ -110,11 +99,88 @@ static int
   71.77       * the guest to race our updates (e.g., to change the GTF_readonly flag),
   71.78       * so we allow a few retries before failing.
   71.79       */
   71.80 +    int retries = 0;
   71.81  
   71.82 -    act = &granting_d->grant_table->active[ref];
   71.83 -    sha = &granting_d->grant_table->shared[ref];
   71.84 +    led = current;
   71.85 +    ld = led->domain;
   71.86  
   71.87 -    spin_lock(&granting_d->grant_table->lock);
   71.88 +    /* Bitwise-OR avoids short-circuiting which screws control flow. */
   71.89 +    if ( unlikely(__get_user(dom, &uop->dom) |
   71.90 +                  __get_user(ref, &uop->ref) |
   71.91 +                  __get_user(addr, &uop->host_addr) |
   71.92 +                  __get_user(dev_hst_ro_flags, &uop->flags)) )
   71.93 +    {
   71.94 +        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
   71.95 +        return -EFAULT; /* don't set status */
   71.96 +    }
   71.97 +
   71.98 +    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
   71.99 +         unlikely((dev_hst_ro_flags &
  71.100 +                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  71.101 +    {
  71.102 +        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  71.103 +        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  71.104 +        return GNTST_bad_gntref;
  71.105 +    }
  71.106 +
  71.107 +    if ( acm_pre_grant_map_ref(dom) )
  71.108 +    {
  71.109 +        (void)__put_user(GNTST_permission_denied, &uop->handle);
  71.110 +        return GNTST_permission_denied;
  71.111 +    }
  71.112 +
  71.113 +    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  71.114 +         unlikely(ld == rd) )
  71.115 +    {
  71.116 +        if ( rd != NULL )
  71.117 +            put_domain(rd);
  71.118 +        DPRINTK("Could not find domain %d\n", dom);
  71.119 +        (void)__put_user(GNTST_bad_domain, &uop->handle);
  71.120 +        return GNTST_bad_domain;
  71.121 +    }
  71.122 +
  71.123 +    /* Get a maptrack handle. */
  71.124 +    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  71.125 +    {
  71.126 +        int              i;
  71.127 +        grant_mapping_t *new_mt;
  71.128 +        grant_table_t   *lgt = ld->grant_table;
  71.129 +
  71.130 +        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  71.131 +        {
  71.132 +            put_domain(rd);
  71.133 +            DPRINTK("Maptrack table is at maximum size.\n");
  71.134 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  71.135 +            return GNTST_no_device_space;
  71.136 +        }
  71.137 +
  71.138 +        /* Grow the maptrack table. */
  71.139 +        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  71.140 +        if ( new_mt == NULL )
  71.141 +        {
  71.142 +            put_domain(rd);
  71.143 +            DPRINTK("No more map handles available.\n");
  71.144 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  71.145 +            return GNTST_no_device_space;
  71.146 +        }
  71.147 +
  71.148 +        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  71.149 +        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  71.150 +            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  71.151 +
  71.152 +        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  71.153 +        lgt->maptrack          = new_mt;
  71.154 +        lgt->maptrack_order   += 1;
  71.155 +        lgt->maptrack_limit  <<= 1;
  71.156 +
  71.157 +        DPRINTK("Doubled maptrack size\n");
  71.158 +        handle = get_maptrack_handle(ld->grant_table);
  71.159 +    }
  71.160 +
  71.161 +    act = &rd->grant_table->active[ref];
  71.162 +    sha = &rd->grant_table->shared[ref];
  71.163 +
  71.164 +    spin_lock(&rd->grant_table->lock);
  71.165  
  71.166      if ( act->pin == 0 )
  71.167      {
  71.168 @@ -132,10 +198,10 @@ static int
  71.169              u32 scombo, prev_scombo, new_scombo;
  71.170  
  71.171              if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) ||
  71.172 -                 unlikely(sdom != mapping_d->domain_id) )
  71.173 +                 unlikely(sdom != led->domain->domain_id) )
  71.174                  PIN_FAIL(unlock_out, GNTST_general_error,
  71.175                           "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
  71.176 -                        sflags, sdom, mapping_d->domain_id);
  71.177 +                        sflags, sdom, led->domain->domain_id);
  71.178  
  71.179              /* Merge two 16-bit values into a 32-bit combined update. */
  71.180              /* NB. Endianness! */
  71.181 @@ -173,12 +239,12 @@ static int
  71.182  
  71.183          /* rmb(); */ /* not on x86 */
  71.184  
  71.185 -        frame = __gpfn_to_mfn_foreign(granting_d, sha->frame);
  71.186 +        frame = __gpfn_to_mfn_foreign(rd, sha->frame);
  71.187  
  71.188          if ( unlikely(!pfn_valid(frame)) ||
  71.189               unlikely(!((dev_hst_ro_flags & GNTMAP_readonly) ?
  71.190 -                        get_page(&frame_table[frame], granting_d) :
  71.191 -                        get_page_and_type(&frame_table[frame], granting_d,
  71.192 +                        get_page(&frame_table[frame], rd) :
  71.193 +                        get_page_and_type(&frame_table[frame], rd,
  71.194                                            PGT_writable_page))) )
  71.195          {
  71.196              clear_bit(_GTF_writing, &sha->flags);
  71.197 @@ -208,10 +274,11 @@ static int
  71.198              PIN_FAIL(unlock_out, ENOSPC,
  71.199                       "Risk of counter overflow %08x\n", act->pin);
  71.200  
  71.201 -        frame = act->frame;
  71.202 +        sflags = sha->flags;
  71.203 +        frame  = act->frame;
  71.204  
  71.205 -        if ( !(dev_hst_ro_flags & GNTMAP_readonly) && 
  71.206 -             !((sflags = sha->flags) & GTF_writing) )
  71.207 +        if ( !(dev_hst_ro_flags & GNTMAP_readonly) &&
  71.208 +             !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) )
  71.209          {
  71.210              for ( ; ; )
  71.211              {
  71.212 @@ -264,9 +331,9 @@ static int
  71.213       * frame contains the mfn.
  71.214       */
  71.215  
  71.216 -    spin_unlock(&granting_d->grant_table->lock);
  71.217 +    spin_unlock(&rd->grant_table->lock);
  71.218  
  71.219 -    if ( (addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) )
  71.220 +    if ( dev_hst_ro_flags & GNTMAP_host_map )
  71.221      {
  71.222          /* Write update into the pagetable. */
  71.223          l1_pgentry_t pte;
  71.224 @@ -278,18 +345,15 @@ static int
  71.225              l1e_add_flags(pte,_PAGE_RW);
  71.226  
  71.227          if ( dev_hst_ro_flags & GNTMAP_contains_pte )
  71.228 -            rc = update_grant_pte_mapping(addr, pte, mapping_d, mapping_ed);
  71.229 +            rc = update_grant_pte_mapping(addr, pte, led);
  71.230          else
  71.231 -            rc = update_grant_va_mapping(addr, pte, mapping_d, mapping_ed);
  71.232 +            rc = update_grant_va_mapping(addr, pte, led);
  71.233  
  71.234 -        /* IMPORTANT: rc indicates the degree of TLB flush that is required.
  71.235 -         * GNTST_flush_one (1) or GNTST_flush_all (2). This is done in the 
  71.236 -         * outer gnttab_map_grant_ref. */
  71.237          if ( rc < 0 )
  71.238          {
  71.239              /* Failure: undo and abort. */
  71.240  
  71.241 -            spin_lock(&granting_d->grant_table->lock);
  71.242 +            spin_lock(&rd->grant_table->lock);
  71.243  
  71.244              if ( dev_hst_ro_flags & GNTMAP_readonly )
  71.245              {
  71.246 @@ -311,186 +375,44 @@ static int
  71.247                  put_page(&frame_table[frame]);
  71.248              }
  71.249  
  71.250 -            spin_unlock(&granting_d->grant_table->lock);
  71.251 +            spin_unlock(&rd->grant_table->lock);
  71.252          }
  71.253 -
  71.254 -    }
  71.255 -
  71.256 -    *pframe = frame;
  71.257 -    return rc;
  71.258 -
  71.259 - unlock_out:
  71.260 -    spin_unlock(&granting_d->grant_table->lock);
  71.261 -    return rc;
  71.262 -}
  71.263 -
  71.264 -/*
  71.265 - * Returns 0 if TLB flush / invalidate required by caller.
  71.266 - * va will indicate the address to be invalidated.
  71.267 - * 
  71.268 - * addr is _either_ a host virtual address, or the address of the pte to
  71.269 - * update, as indicated by the GNTMAP_contains_pte flag.
  71.270 - */
  71.271 -static int
  71.272 -__gnttab_map_grant_ref(
  71.273 -    gnttab_map_grant_ref_t *uop,
  71.274 -    unsigned long *va)
  71.275 -{
  71.276 -    domid_t        dom;
  71.277 -    grant_ref_t    ref;
  71.278 -    struct domain *ld, *rd;
  71.279 -    struct vcpu   *led;
  71.280 -    u16            dev_hst_ro_flags;
  71.281 -    int            handle;
  71.282 -    u64            addr;
  71.283 -    unsigned long  frame = 0;
  71.284 -    int            rc;
  71.285 -
  71.286 -    led = current;
  71.287 -    ld = led->domain;
  71.288 -
  71.289 -    /* Bitwise-OR avoids short-circuiting which screws control flow. */
  71.290 -    if ( unlikely(__get_user(dom, &uop->dom) |
  71.291 -                  __get_user(ref, &uop->ref) |
  71.292 -                  __get_user(addr, &uop->host_addr) |
  71.293 -                  __get_user(dev_hst_ro_flags, &uop->flags)) )
  71.294 -    {
  71.295 -        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
  71.296 -        return -EFAULT; /* don't set status */
  71.297 -    }
  71.298 -
  71.299 -    if ( (dev_hst_ro_flags & GNTMAP_host_map) &&
  71.300 -         ( (addr == 0) ||
  71.301 -           (!(dev_hst_ro_flags & GNTMAP_contains_pte) && 
  71.302 -            unlikely(!__addr_ok(addr))) ) )
  71.303 -    {
  71.304 -        DPRINTK("Bad virtual address (%"PRIx64") or flags (%"PRIx16").\n",
  71.305 -                addr, dev_hst_ro_flags);
  71.306 -        (void)__put_user(GNTST_bad_virt_addr, &uop->handle);
  71.307 -        return GNTST_bad_gntref;
  71.308 -    }
  71.309 -
  71.310 -    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
  71.311 -         unlikely((dev_hst_ro_flags &
  71.312 -                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  71.313 -    {
  71.314 -        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  71.315 -        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  71.316 -        return GNTST_bad_gntref;
  71.317 -    }
  71.318 -
  71.319 -    if (acm_pre_grant_map_ref(dom)) {
  71.320 -        (void)__put_user(GNTST_permission_denied, &uop->handle);
  71.321 -        return GNTST_permission_denied;
  71.322      }
  71.323  
  71.324 -    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  71.325 -         unlikely(ld == rd) )
  71.326 -    {
  71.327 -        if ( rd != NULL )
  71.328 -            put_domain(rd);
  71.329 -        DPRINTK("Could not find domain %d\n", dom);
  71.330 -        (void)__put_user(GNTST_bad_domain, &uop->handle);
  71.331 -        return GNTST_bad_domain;
  71.332 -    }
  71.333 -
  71.334 -    /* Get a maptrack handle. */
  71.335 -    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  71.336 -    {
  71.337 -        int              i;
  71.338 -        grant_mapping_t *new_mt;
  71.339 -        grant_table_t   *lgt = ld->grant_table;
  71.340 -
  71.341 -        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  71.342 -        {
  71.343 -            put_domain(rd);
  71.344 -            DPRINTK("Maptrack table is at maximum size.\n");
  71.345 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  71.346 -            return GNTST_no_device_space;
  71.347 -        }
  71.348 -
  71.349 -        /* Grow the maptrack table. */
  71.350 -        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  71.351 -        if ( new_mt == NULL )
  71.352 -        {
  71.353 -            put_domain(rd);
  71.354 -            DPRINTK("No more map handles available.\n");
  71.355 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  71.356 -            return GNTST_no_device_space;
  71.357 -        }
  71.358 -
  71.359 -        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  71.360 -        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  71.361 -            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  71.362 +    ld->grant_table->maptrack[handle].domid         = dom;
  71.363 +    ld->grant_table->maptrack[handle].ref_and_flags =
  71.364 +        (ref << MAPTRACK_REF_SHIFT) |
  71.365 +        (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  71.366  
  71.367 -        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  71.368 -        lgt->maptrack          = new_mt;
  71.369 -        lgt->maptrack_order   += 1;
  71.370 -        lgt->maptrack_limit  <<= 1;
  71.371 -
  71.372 -        DPRINTK("Doubled maptrack size\n");
  71.373 -        handle = get_maptrack_handle(ld->grant_table);
  71.374 -    }
  71.375 -
  71.376 -#if GRANT_DEBUG_VERBOSE
  71.377 -    DPRINTK("Mapping grant ref (%hu) for domain (%hu) with flags (%x)\n",
  71.378 -            ref, dom, dev_hst_ro_flags);
  71.379 -#endif
  71.380 -
  71.381 -    if ( (rc = __gnttab_activate_grant_ref(ld, led, rd, ref, dev_hst_ro_flags,
  71.382 -                                           addr, &frame)) >= 0 )
  71.383 -    {
  71.384 -        /*
  71.385 -         * Only make the maptrack live _after_ writing the pte, in case we 
  71.386 -         * overwrite the same frame number, causing a maptrack walk to find it
  71.387 -         */
  71.388 -        ld->grant_table->maptrack[handle].domid = dom;
  71.389 -
  71.390 -        ld->grant_table->maptrack[handle].ref_and_flags
  71.391 -            = (ref << MAPTRACK_REF_SHIFT) |
  71.392 -              (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  71.393 -
  71.394 -        (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  71.395 -
  71.396 -        if ( ( dev_hst_ro_flags & GNTMAP_host_map ) &&
  71.397 -             !( dev_hst_ro_flags & GNTMAP_contains_pte) )
  71.398 -            *va = addr;
  71.399 -
  71.400 -        (void)__put_user(handle, &uop->handle);
  71.401 -    }
  71.402 -    else
  71.403 -    {
  71.404 -        (void)__put_user(rc, &uop->handle);
  71.405 -        put_maptrack_handle(ld->grant_table, handle);
  71.406 -    }
  71.407 +    (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  71.408 +    (void)__put_user(handle, &uop->handle);
  71.409  
  71.410      put_domain(rd);
  71.411      return rc;
  71.412 +
  71.413 +
  71.414 + unlock_out:
  71.415 +    spin_unlock(&rd->grant_table->lock);
  71.416 +    (void)__put_user(rc, &uop->handle);
  71.417 +    put_maptrack_handle(ld->grant_table, handle);
  71.418 +    return rc;
  71.419  }
  71.420  
  71.421  static long
  71.422  gnttab_map_grant_ref(
  71.423      gnttab_map_grant_ref_t *uop, unsigned int count)
  71.424  {
  71.425 -    int i, rc, flush = 0;
  71.426 -    unsigned long va = 0;
  71.427 +    int i;
  71.428  
  71.429      for ( i = 0; i < count; i++ )
  71.430 -        if ( (rc =__gnttab_map_grant_ref(&uop[i], &va)) >= 0 )
  71.431 -            flush += rc;
  71.432 -
  71.433 -    if ( flush == 1 )
  71.434 -        flush_tlb_one_mask(current->domain->cpumask, va);
  71.435 -    else if ( flush != 0 ) 
  71.436 -        flush_tlb_mask(current->domain->cpumask);
  71.437 +        (void)__gnttab_map_grant_ref(&uop[i]);
  71.438  
  71.439      return 0;
  71.440  }
  71.441  
  71.442  static int
  71.443  __gnttab_unmap_grant_ref(
  71.444 -    gnttab_unmap_grant_ref_t *uop,
  71.445 -    unsigned long *va)
  71.446 +    gnttab_unmap_grant_ref_t *uop)
  71.447  {
  71.448      domid_t          dom;
  71.449      grant_ref_t      ref;
  71.450 @@ -500,7 +422,7 @@ static int
  71.451      grant_entry_t   *sha;
  71.452      grant_mapping_t *map;
  71.453      u16              flags;
  71.454 -    s16              rc = 1;
  71.455 +    s16              rc = 0;
  71.456      u64              addr, dev_bus_addr;
  71.457      unsigned long    frame;
  71.458  
  71.459 @@ -541,11 +463,6 @@ static int
  71.460          return GNTST_bad_domain;
  71.461      }
  71.462  
  71.463 -#if GRANT_DEBUG_VERBOSE
  71.464 -    DPRINTK("Unmapping grant ref (%hu) for domain (%hu) with handle (%hu)\n",
  71.465 -            ref, dom, handle);
  71.466 -#endif
  71.467 -
  71.468      act = &rd->grant_table->active[ref];
  71.469      sha = &rd->grant_table->shared[ref];
  71.470  
  71.471 @@ -566,8 +483,6 @@ static int
  71.472  
  71.473          map->ref_and_flags &= ~GNTMAP_device_map;
  71.474          (void)__put_user(0, &uop->dev_bus_addr);
  71.475 -
  71.476 -        /* Frame is now unmapped for device access. */
  71.477      }
  71.478  
  71.479      if ( (addr != 0) &&
  71.480 @@ -589,10 +504,6 @@ static int
  71.481  
  71.482          act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc
  71.483                                                : GNTPIN_hstw_inc;
  71.484 -
  71.485 -        rc = 0;
  71.486 -        if ( !( flags & GNTMAP_contains_pte) )
  71.487 -            *va = addr;
  71.488      }
  71.489  
  71.490      if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0)
  71.491 @@ -632,17 +543,12 @@ static long
  71.492  gnttab_unmap_grant_ref(
  71.493      gnttab_unmap_grant_ref_t *uop, unsigned int count)
  71.494  {
  71.495 -    int i, flush = 0;
  71.496 -    unsigned long va = 0;
  71.497 +    int i;
  71.498  
  71.499      for ( i = 0; i < count; i++ )
  71.500 -        if ( __gnttab_unmap_grant_ref(&uop[i], &va) == 0 )
  71.501 -            flush++;
  71.502 +        (void)__gnttab_unmap_grant_ref(&uop[i]);
  71.503  
  71.504 -    if ( flush == 1 )
  71.505 -        flush_tlb_one_mask(current->domain->cpumask, va);
  71.506 -    else if ( flush != 0 ) 
  71.507 -        flush_tlb_mask(current->domain->cpumask);
  71.508 +    flush_tlb_mask(current->domain->cpumask);
  71.509  
  71.510      return 0;
  71.511  }
  71.512 @@ -703,9 +609,9 @@ gnttab_setup_table(
  71.513      return 0;
  71.514  }
  71.515  
  71.516 -#if GRANT_DEBUG
  71.517  static int
  71.518 -gnttab_dump_table(gnttab_dump_table_t *uop)
  71.519 +gnttab_dump_table(
  71.520 +    gnttab_dump_table_t *uop)
  71.521  {
  71.522      grant_table_t        *gt;
  71.523      gnttab_dump_table_t   op;
  71.524 @@ -716,6 +622,8 @@ gnttab_dump_table(gnttab_dump_table_t *u
  71.525      grant_mapping_t      *maptrack;
  71.526      int                   i;
  71.527  
  71.528 +    if ( !IS_PRIV(current->domain) )
  71.529 +        return -EPERM;
  71.530  
  71.531      if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
  71.532      {
  71.533 @@ -724,9 +632,7 @@ gnttab_dump_table(gnttab_dump_table_t *u
  71.534      }
  71.535  
  71.536      if ( op.dom == DOMID_SELF )
  71.537 -    {
  71.538          op.dom = current->domain->domain_id;
  71.539 -    }
  71.540  
  71.541      if ( unlikely((d = find_domain_by_id(op.dom)) == NULL) )
  71.542      {
  71.543 @@ -750,14 +656,11 @@ gnttab_dump_table(gnttab_dump_table_t *u
  71.544  
  71.545      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  71.546      {
  71.547 -        sha_copy =  gt->shared[i];
  71.548 -
  71.549 +        sha_copy = gt->shared[i];
  71.550          if ( sha_copy.flags )
  71.551 -        {
  71.552              DPRINTK("Grant: dom (%hu) SHARED (%d) flags:(%hx) "
  71.553                      "dom:(%hu) frame:(%x)\n",
  71.554                      op.dom, i, sha_copy.flags, sha_copy.domid, sha_copy.frame);
  71.555 -        }
  71.556      }
  71.557  
  71.558      spin_lock(&gt->lock);
  71.559 @@ -765,28 +668,22 @@ gnttab_dump_table(gnttab_dump_table_t *u
  71.560      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  71.561      {
  71.562          act = &gt->active[i];
  71.563 -
  71.564          if ( act->pin )
  71.565 -        {
  71.566              DPRINTK("Grant: dom (%hu) ACTIVE (%d) pin:(%x) "
  71.567                      "dom:(%hu) frame:(%lx)\n",
  71.568                      op.dom, i, act->pin, act->domid, act->frame);
  71.569 -        }
  71.570      }
  71.571  
  71.572      for ( i = 0; i < gt->maptrack_limit; i++ )
  71.573      {
  71.574          maptrack = &gt->maptrack[i];
  71.575 -
  71.576          if ( maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK )
  71.577 -        {
  71.578              DPRINTK("Grant: dom (%hu) MAP (%d) ref:(%hu) flags:(%x) "
  71.579                      "dom:(%hu)\n",
  71.580                      op.dom, i,
  71.581                      maptrack->ref_and_flags >> MAPTRACK_REF_SHIFT,
  71.582                      maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK,
  71.583                      maptrack->domid);
  71.584 -        }
  71.585      }
  71.586  
  71.587      spin_unlock(&gt->lock);
  71.588 @@ -794,10 +691,10 @@ gnttab_dump_table(gnttab_dump_table_t *u
  71.589      put_domain(d);
  71.590      return 0;
  71.591  }
  71.592 -#endif
  71.593  
  71.594  static long
  71.595 -gnttab_transfer(gnttab_transfer_t *uop, unsigned int count)
  71.596 +gnttab_transfer(
  71.597 +    gnttab_transfer_t *uop, unsigned int count)
  71.598  {
  71.599      struct domain *d = current->domain;
  71.600      struct domain *e;
  71.601 @@ -810,10 +707,7 @@ gnttab_transfer(gnttab_transfer_t *uop, 
  71.602      for ( i = 0; i < count; i++ )
  71.603      {
  71.604          gnttab_transfer_t *gop = &uop[i];
  71.605 -#if GRANT_DEBUG
  71.606 -        printk("gnttab_transfer: i=%d mfn=%lx domid=%d gref=%08x\n",
  71.607 -               i, gop->mfn, gop->domid, gop->handle);
  71.608 -#endif
  71.609 +
  71.610          page = &frame_table[gop->mfn];
  71.611          
  71.612          if ( unlikely(IS_XEN_HEAP_FRAME(page)))
  71.613 @@ -956,11 +850,9 @@ do_grant_table_op(
  71.614      case GNTTABOP_setup_table:
  71.615          rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count);
  71.616          break;
  71.617 -#if GRANT_DEBUG
  71.618      case GNTTABOP_dump_table:
  71.619          rc = gnttab_dump_table((gnttab_dump_table_t *)uop);
  71.620          break;
  71.621 -#endif
  71.622      case GNTTABOP_transfer:
  71.623          if (unlikely(!array_access_ok(
  71.624              uop, count, sizeof(gnttab_transfer_t))))
  71.625 @@ -1002,12 +894,6 @@ gnttab_check_unmap(
  71.626      
  71.627      lgt = ld->grant_table;
  71.628      
  71.629 -#if GRANT_DEBUG_VERBOSE
  71.630 -    if ( ld->domain_id != 0 )
  71.631 -        DPRINTK("Foreign unref rd(%d) ld(%d) frm(%lx) flgs(%x).\n",
  71.632 -                rd->domain_id, ld->domain_id, frame, readonly);
  71.633 -#endif
  71.634 -    
  71.635      /* Fast exit if we're not mapping anything using grant tables */
  71.636      if ( lgt->map_count == 0 )
  71.637          return 0;
  71.638 @@ -1098,11 +984,6 @@ gnttab_prepare_for_transfer(
  71.639      int            retries = 0;
  71.640      unsigned long  target_pfn;
  71.641  
  71.642 -#if GRANT_DEBUG_VERBOSE
  71.643 -    DPRINTK("gnttab_prepare_for_transfer rd(%hu) ld(%hu) ref(%hu).\n",
  71.644 -            rd->domain_id, ld->domain_id, ref);
  71.645 -#endif
  71.646 -
  71.647      if ( unlikely((rgt = rd->grant_table) == NULL) ||
  71.648           unlikely(ref >= NR_GRANT_ENTRIES) )
  71.649      {
    72.1 --- a/xen/include/asm-x86/e820.h	Thu Sep 29 13:35:13 2005 -0600
    72.2 +++ b/xen/include/asm-x86/e820.h	Thu Sep 29 16:22:02 2005 -0600
    72.3 @@ -11,6 +11,11 @@
    72.4  #define E820_NVS          4
    72.5  #define E820_IO          16
    72.6  #define E820_SHARED_PAGE 17
    72.7 +#define E820_XENSTORE    18
    72.8 +
    72.9 +#define E820_MAP_PAGE        0x00090000
   72.10 +#define E820_MAP_NR_OFFSET   0x000001E8
   72.11 +#define E820_MAP_OFFSET      0x000002D0
   72.12  
   72.13  #ifndef __ASSEMBLY__
   72.14  struct e820entry {
    73.1 --- a/xen/include/asm-x86/mm.h	Thu Sep 29 13:35:13 2005 -0600
    73.2 +++ b/xen/include/asm-x86/mm.h	Thu Sep 29 16:22:02 2005 -0600
    73.3 @@ -380,11 +380,9 @@ extern int __sync_lazy_execstate(void);
    73.4   * hold a reference to the page.
    73.5   */
    73.6  int update_grant_va_mapping(
    73.7 -    unsigned long va, l1_pgentry_t _nl1e, 
    73.8 -    struct domain *d, struct vcpu *v);
    73.9 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v);
   73.10  int update_grant_pte_mapping(
   73.11 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
   73.12 -    struct domain *d, struct vcpu *v);
   73.13 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v);
   73.14  int clear_grant_va_mapping(unsigned long addr, unsigned long frame);
   73.15  int clear_grant_pte_mapping(
   73.16      unsigned long addr, unsigned long frame, struct domain *d);
    74.1 --- a/xen/include/asm-x86/vmx_platform.h	Thu Sep 29 13:35:13 2005 -0600
    74.2 +++ b/xen/include/asm-x86/vmx_platform.h	Thu Sep 29 16:22:02 2005 -0600
    74.3 @@ -93,7 +93,6 @@ struct virtual_platform_def {
    74.4  
    74.5  extern void handle_mmio(unsigned long, unsigned long);
    74.6  extern void vmx_wait_io(void);