direct-io.hg

changeset 7316:93e27f7ca8a8

Merge
author djm@kirby.fc.hp.com
date Thu Sep 29 16:22:02 2005 -0600 (2005-09-29)
parents c0ac925e8f1d 4e1031ce3bc2
children 61b3b357d827
files Makefile buildconfigs/Rules.mk docs/src/user/installation.tex linux-2.6-xen-sparse/arch/ia64/Kconfig linux-2.6-xen-sparse/arch/ia64/xen-mkbuildtree-pre linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_ia64 linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/xen/kernel/reboot.c linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypervisor.h linux-2.6-xen-sparse/include/asm-xen/xenbus.h tools/check/check_hotplug tools/examples/Makefile tools/examples/xmexample.vmx tools/firmware/vmxassist/Makefile tools/firmware/vmxassist/vmxloader.c tools/ioemu/hw/cirrus_vga.c tools/ioemu/hw/pc.c tools/ioemu/hw/vga.c tools/ioemu/target-i386-dm/helper2.c tools/ioemu/vl.c tools/libxc/xc_vmx_build.c tools/libxc/xenguest.h tools/libxc/xg_private.h tools/python/xen/lowlevel/xc/xc.c tools/python/xen/lowlevel/xs/xs.c tools/python/xen/xend/PrettyPrint.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/image.py tools/python/xen/xend/server/DevController.py tools/python/xen/xend/xenstore/xsnode.py tools/python/xen/xend/xenstore/xstransact.py tools/python/xen/xm/main.py tools/xenstore/Makefile tools/xenstore/speedtest.c tools/xenstore/tdb.c tools/xenstore/tdb.h tools/xenstore/testsuite/04rm.test tools/xenstore/testsuite/08transaction.slowtest tools/xenstore/testsuite/08transaction.test tools/xenstore/testsuite/12readonly.test tools/xenstore/testsuite/14complexperms.test tools/xenstore/testsuite/16block-watch-crash.test tools/xenstore/xenstore_client.c tools/xenstore/xenstored.h tools/xenstore/xenstored_core.c tools/xenstore/xenstored_core.h tools/xenstore/xenstored_domain.c 
tools/xenstore/xenstored_transaction.c tools/xenstore/xenstored_transaction.h tools/xenstore/xenstored_watch.c tools/xenstore/xenstored_watch.h tools/xenstore/xs.c tools/xenstore/xs.h tools/xenstore/xs_lib.c tools/xenstore/xs_lib.h tools/xenstore/xs_random.c tools/xenstore/xs_stress.c tools/xenstore/xs_tdb_dump.c tools/xenstore/xs_test.c xen/arch/ia64/asm-offsets.c xen/arch/ia64/vmx/vmx_process.c xen/arch/ia64/xen/process.c xen/arch/ia64/xen/vcpu.c xen/arch/x86/mm.c xen/arch/x86/vmx_vmcs.c xen/common/grant_table.c xen/include/asm-ia64/vcpu.h xen/include/asm-x86/e820.h xen/include/asm-x86/mm.h xen/include/asm-x86/vmx_platform.h xen/include/xen/grant_table.h
line diff
     1.1 --- a/Makefile	Thu Sep 29 13:35:13 2005 -0600
     1.2 +++ b/Makefile	Thu Sep 29 16:22:02 2005 -0600
     1.3 @@ -164,7 +164,7 @@ help:
     1.4  uninstall: DESTDIR=
     1.5  uninstall: D=$(DESTDIR)
     1.6  uninstall:
     1.7 -	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s`
     1.8 +	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s` || true
     1.9  	rm -rf $(D)/etc/init.d/xend*
    1.10  	rm -rf $(D)/etc/hotplug/xen-backend.agent
    1.11  	rm -rf $(D)/var/run/xen* $(D)/var/lib/xen*
     2.1 --- a/buildconfigs/Rules.mk	Thu Sep 29 13:35:13 2005 -0600
     2.2 +++ b/buildconfigs/Rules.mk	Thu Sep 29 16:22:02 2005 -0600
     2.3 @@ -16,7 +16,7 @@ PRISTINE_SRC_PATH	?= .:..
     2.4  vpath pristine-% $(PRISTINE_SRC_PATH)
     2.5  
     2.6  # By default, build Linux with ARCH=xen (overridden by some non arch's)
     2.7 -ifneq ($(ARCH),ia64)
     2.8 +ifneq ($(XEN_TARGET_ARCH),ia64)
     2.9  LINUX_ARCH	?= xen
    2.10  else
    2.11  LINUX_ARCH	?= ia64
     3.1 --- a/docs/src/user/installation.tex	Thu Sep 29 13:35:13 2005 -0600
     3.2 +++ b/docs/src/user/installation.tex	Thu Sep 29 16:22:02 2005 -0600
     3.3 @@ -21,6 +21,9 @@ required if you wish to build from sourc
     3.4  \item [$\dag$] The \path{iproute2} package.
     3.5  \item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt
     3.6        http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl})
     3.7 +\item [$\dag$] The Linux hotplug system\footnote{Available from {\tt
     3.8 +      http://linux-hotplug.sourceforge.net/}} (e.g., \path{/sbin/hotplug}
     3.9 +      and related scripts)
    3.10  \item [$\dag$] An installation of Twisted~v1.3 or
    3.11    above\footnote{Available from {\tt http://www.twistedmatrix.com}}.
    3.12    There may be a binary package available for your distribution;
     7.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Thu Sep 29 13:35:13 2005 -0600
     7.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Thu Sep 29 16:22:02 2005 -0600
     7.3 @@ -1394,9 +1394,7 @@ static void handle_vcpu_hotplug_event(st
     7.4  			return;
     7.5  
     7.6  		/* get the state value */
     7.7 -		xenbus_transaction_start("cpu");
     7.8  		err = xenbus_scanf(dir, "availability", "%s", state);
     7.9 -		xenbus_transaction_end(0);
    7.10  
    7.11  		if (err != 1) {
    7.12  			printk(KERN_ERR
     8.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Thu Sep 29 13:35:13 2005 -0600
     8.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Thu Sep 29 16:22:02 2005 -0600
     8.3 @@ -324,7 +324,7 @@ static void shutdown_handler(struct xenb
     8.4      int err;
     8.5  
     8.6   again:
     8.7 -    err = xenbus_transaction_start("control");
     8.8 +    err = xenbus_transaction_start();
     8.9      if (err)
    8.10  	return;
    8.11      str = (char *)xenbus_read("control", "shutdown", NULL);
    8.12 @@ -337,7 +337,7 @@ static void shutdown_handler(struct xenb
    8.13      xenbus_write("control", "shutdown", "");
    8.14  
    8.15      err = xenbus_transaction_end(0);
    8.16 -    if (err == -ETIMEDOUT) {
    8.17 +    if (err == -EAGAIN) {
    8.18  	kfree(str);
    8.19  	goto again;
    8.20      }
    8.21 @@ -366,7 +366,7 @@ static void sysrq_handler(struct xenbus_
    8.22      int err;
    8.23  
    8.24   again:
    8.25 -    err = xenbus_transaction_start("control");
    8.26 +    err = xenbus_transaction_start();
    8.27      if (err)
    8.28  	return;
    8.29      if (!xenbus_scanf("control", "sysrq", "%c", &sysrq_key)) {
    8.30 @@ -379,7 +379,7 @@ static void sysrq_handler(struct xenbus_
    8.31  	xenbus_printf("control", "sysrq", "%c", '\0');
    8.32  
    8.33      err = xenbus_transaction_end(0);
    8.34 -    if (err == -ETIMEDOUT)
    8.35 +    if (err == -EAGAIN)
    8.36  	goto again;
    8.37  
    8.38      if (sysrq_key != '\0') {
     9.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Sep 29 13:35:13 2005 -0600
     9.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Sep 29 16:22:02 2005 -0600
     9.3 @@ -80,8 +80,9 @@ static void frontend_changed(struct xenb
     9.4  		return;
     9.5  	}
     9.6  
     9.7 +again:
     9.8  	/* Supply the information about the device the frontend needs */
     9.9 -	err = xenbus_transaction_start(be->dev->nodename);
    9.10 +	err = xenbus_transaction_start();
    9.11  	if (err) {
    9.12  		xenbus_dev_error(be->dev, err, "starting transaction");
    9.13  		return;
    9.14 @@ -119,7 +120,15 @@ static void frontend_changed(struct xenb
    9.15  		goto abort;
    9.16  	}
    9.17  
    9.18 -	xenbus_transaction_end(0);
    9.19 +	err = xenbus_transaction_end(0);
    9.20 +	if (err == -EAGAIN)
    9.21 +		goto again;
    9.22 +	if (err) {
    9.23 +		xenbus_dev_error(be->dev, err, "ending transaction",
    9.24 +				 ring_ref, evtchn);
    9.25 +		goto abort;
    9.26 +	}
    9.27 +
    9.28  	xenbus_dev_ok(be->dev);
    9.29  
    9.30  	return;
    10.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Sep 29 13:35:13 2005 -0600
    10.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Sep 29 16:22:02 2005 -0600
    10.3 @@ -572,7 +572,8 @@ static int talk_to_backend(struct xenbus
    10.4  		goto out;
    10.5  	}
    10.6  
    10.7 -	err = xenbus_transaction_start(dev->nodename);
    10.8 +again:
    10.9 +	err = xenbus_transaction_start();
   10.10  	if (err) {
   10.11  		xenbus_dev_error(dev, err, "starting transaction");
   10.12  		goto destroy_blkring;
   10.13 @@ -603,6 +604,8 @@ static int talk_to_backend(struct xenbus
   10.14  
   10.15  	err = xenbus_transaction_end(0);
   10.16  	if (err) {
   10.17 +		if (err == -EAGAIN)
   10.18 +			goto again;
   10.19  		xenbus_dev_error(dev, err, "completing transaction");
   10.20  		goto destroy_blkring;
   10.21  	}
    11.1 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Thu Sep 29 13:35:13 2005 -0600
    11.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Thu Sep 29 16:22:02 2005 -0600
    11.3 @@ -1122,7 +1122,8 @@ static int talk_to_backend(struct xenbus
    11.4  		goto out;
    11.5  	}
    11.6  
    11.7 -	err = xenbus_transaction_start(dev->nodename);
    11.8 +again:
    11.9 +	err = xenbus_transaction_start();
   11.10  	if (err) {
   11.11  		xenbus_dev_error(dev, err, "starting transaction");
   11.12  		goto destroy_ring;
   11.13 @@ -1160,6 +1161,8 @@ static int talk_to_backend(struct xenbus
   11.14  
   11.15  	err = xenbus_transaction_end(0);
   11.16  	if (err) {
   11.17 +		if (err == -EAGAIN)
   11.18 +			goto again;
   11.19  		xenbus_dev_error(dev, err, "completing transaction");
   11.20  		goto destroy_ring;
   11.21  	}
    12.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Thu Sep 29 13:35:13 2005 -0600
    12.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Thu Sep 29 16:22:02 2005 -0600
    12.3 @@ -93,7 +93,8 @@ static void frontend_changed(struct xenb
    12.4  	 * Tell the front-end that we are ready to go -
    12.5  	 * unless something bad happens
    12.6  	 */
    12.7 -	err = xenbus_transaction_start(be->dev->nodename);
    12.8 +again:
    12.9 +	err = xenbus_transaction_start();
   12.10  	if (err) {
   12.11  		xenbus_dev_error(be->dev, err, "starting transaction");
   12.12  		return;
   12.13 @@ -127,7 +128,14 @@ static void frontend_changed(struct xenb
   12.14  		goto abort;
   12.15  	}
   12.16  
   12.17 -	xenbus_transaction_end(0);
   12.18 +	err = xenbus_transaction_end(0);
   12.19 +	if (err == -EAGAIN)
   12.20 +		goto again;
   12.21 +	if (err) {
   12.22 +		xenbus_dev_error(be->dev, err, "end of transaction");
   12.23 +		goto abort;
   12.24 +	}
   12.25 +
   12.26  	xenbus_dev_ok(be->dev);
   12.27  	return;
   12.28  abort:
    13.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Thu Sep 29 13:35:13 2005 -0600
    13.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Thu Sep 29 16:22:02 2005 -0600
    13.3 @@ -331,7 +331,8 @@ static int talk_to_backend(struct xenbus
    13.4  		goto out;
    13.5  	}
    13.6  
    13.7 -	err = xenbus_transaction_start(dev->nodename);
    13.8 +again:
    13.9 +	err = xenbus_transaction_start();
   13.10  	if (err) {
   13.11  		xenbus_dev_error(dev, err, "starting transaction");
   13.12  		goto destroy_tpmring;
   13.13 @@ -363,6 +364,8 @@ static int talk_to_backend(struct xenbus
   13.14  	}
   13.15  
   13.16  	err = xenbus_transaction_end(0);
   13.17 +	if (err == -EAGAIN)
   13.18 +		goto again;
   13.19  	if (err) {
   13.20  		xenbus_dev_error(dev, err, "completing transaction");
   13.21  		goto destroy_tpmring;
    14.1 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Thu Sep 29 13:35:13 2005 -0600
    14.2 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Thu Sep 29 16:22:02 2005 -0600
    14.3 @@ -287,12 +287,11 @@ EXPORT_SYMBOL(xenbus_rm);
    14.4  
    14.5  /* Start a transaction: changes by others will not be seen during this
    14.6   * transaction, and changes will not be visible to others until end.
    14.7 - * Transaction only applies to the given subtree.
    14.8   * You can only have one transaction at any time.
    14.9   */
   14.10 -int xenbus_transaction_start(const char *subtree)
   14.11 +int xenbus_transaction_start(void)
   14.12  {
   14.13 -	return xs_error(xs_single(XS_TRANSACTION_START, subtree, NULL));
   14.14 +	return xs_error(xs_single(XS_TRANSACTION_START, "", NULL));
   14.15  }
   14.16  EXPORT_SYMBOL(xenbus_transaction_start);
   14.17  
    16.1 --- a/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Thu Sep 29 13:35:13 2005 -0600
    16.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Thu Sep 29 16:22:02 2005 -0600
    16.3 @@ -87,7 +87,7 @@ int xenbus_write(const char *dir, const 
    16.4  int xenbus_mkdir(const char *dir, const char *node);
    16.5  int xenbus_exists(const char *dir, const char *node);
    16.6  int xenbus_rm(const char *dir, const char *node);
    16.7 -int xenbus_transaction_start(const char *subtree);
    16.8 +int xenbus_transaction_start(void);
    16.9  int xenbus_transaction_end(int abort);
   16.10  
   16.11  /* Single read and scanf: returns -errno or num scanned if > 0. */
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/tools/check/check_hotplug	Thu Sep 29 16:22:02 2005 -0600
    17.3 @@ -0,0 +1,10 @@
    17.4 +#!/bin/bash
    17.5 +# CHECK-INSTALL
    17.6 +
    17.7 +function error {
    17.8 +   echo
    17.9 +   echo '  *** Check for the hotplug scripts (hotplug) FAILED'
   17.10 +   exit 1
   17.11 +}
   17.12 +
   17.13 +which hotplug 1>/dev/null 2>&1 || error
    18.1 --- a/tools/examples/Makefile	Thu Sep 29 13:35:13 2005 -0600
    18.2 +++ b/tools/examples/Makefile	Thu Sep 29 16:22:02 2005 -0600
    18.3 @@ -25,19 +25,13 @@ XEN_SCRIPTS += block-phy
    18.4  XEN_SCRIPTS += block-file
    18.5  XEN_SCRIPTS += block-enbd
    18.6  
    18.7 -# no 64-bit specifics in mem-map.sxp
    18.8 -# so place in /usr/lib, not /usr/lib64
    18.9 -XEN_BOOT_DIR = /usr/lib/xen/boot
   18.10 -XEN_BOOT = mem-map.sxp
   18.11 -
   18.12  XEN_HOTPLUG_DIR = /etc/hotplug
   18.13  XEN_HOTPLUG_SCRIPTS = xen-backend.agent
   18.14  
   18.15  all:
   18.16  build:
   18.17  
   18.18 -install: all install-initd install-configs install-scripts install-boot \
   18.19 -	 install-hotplug
   18.20 +install: all install-initd install-configs install-scripts install-hotplug
   18.21  
   18.22  install-initd:
   18.23  	[ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
   18.24 @@ -62,14 +56,6 @@ install-scripts:
   18.25  	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
   18.26  	done
   18.27  
   18.28 -install-boot:
   18.29 -	[ -d $(DESTDIR)$(XEN_BOOT_DIR) ] || \
   18.30 -		$(INSTALL_DIR) $(DESTDIR)$(XEN_BOOT_DIR)
   18.31 -	for i in $(XEN_BOOT); \
   18.32 -	    do [ -a $(DESTDIR)$(XEN_BOOT_DIR)/$$i ] || \
   18.33 -	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_BOOT_DIR); \
   18.34 -	done
   18.35 -
   18.36  install-hotplug:
   18.37  	[ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
   18.38  		$(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR)
    19.1 --- a/tools/examples/mem-map.sxp	Thu Sep 29 13:35:13 2005 -0600
    19.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.3 @@ -1,10 +0,0 @@
    19.4 -(memmap
    19.5 - (0000000000000000  000000000009f800 "AddressRangeMemory"   WB)
    19.6 - (000000000009f800  00000000000a0000 "AddressRangeReserved" UC)
    19.7 - (00000000000a0000  00000000000bffff "AddressRangeIO"       UC)
    19.8 - (00000000000f0000  0000000000100000 "AddressRangeReserved" UC)
    19.9 - (0000000000100000  0000000008000000 "AddressRangeMemory"   WB)
   19.10 - (0000000007fff000  0000000008000000 "AddressRangeShared"   WB)
   19.11 - (0000000008000000  0000000008003000 "AddressRangeNVS"      UC)
   19.12 - (0000000008003000  000000000800d000 "AddressRangeACPI"     WB)
   19.13 - (00000000fec00000  0000000100000000 "AddressRangeIO"       UC))
    20.1 --- a/tools/examples/xmexample.vmx	Thu Sep 29 13:35:13 2005 -0600
    20.2 +++ b/tools/examples/xmexample.vmx	Thu Sep 29 16:22:02 2005 -0600
    20.3 @@ -60,9 +60,6 @@ disk = [ 'file:/var/images/min-el3-i386.
    20.4  # New stuff
    20.5  device_model = '/usr/' + arch_libdir + '/xen/bin/qemu-dm'
    20.6  
    20.7 -# Advanced users only. Don't touch if you don't know what you're doing
    20.8 -memmap = '/usr/lib/xen/boot/mem-map.sxp'
    20.9 -
   20.10  #-----------------------------------------------------------------------------
   20.11  # Disk image for 
   20.12  #cdrom=
    21.1 --- a/tools/firmware/vmxassist/Makefile	Thu Sep 29 13:35:13 2005 -0600
    21.2 +++ b/tools/firmware/vmxassist/Makefile	Thu Sep 29 16:22:02 2005 -0600
    21.3 @@ -44,7 +44,7 @@ all: vmxloader
    21.4  vmxloader: roms.h vmxloader.c acpi.h acpi_madt.c
    21.5  	${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c -c acpi_madt.c
    21.6  	$(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o acpi_madt.o
    21.7 -	objcopy --change-addresses=0xC0000000 vmxloader.tmp vmxloader
    21.8 +	objcopy vmxloader.tmp vmxloader
    21.9  	rm -f vmxloader.tmp
   21.10  
   21.11  vmxassist.bin: vmxassist.ld ${OBJECTS}
    22.1 --- a/tools/firmware/vmxassist/vmxloader.c	Thu Sep 29 13:35:13 2005 -0600
    22.2 +++ b/tools/firmware/vmxassist/vmxloader.c	Thu Sep 29 16:22:02 2005 -0600
    22.3 @@ -34,28 +34,39 @@ int acpi_madt_update(unsigned char* acpi
    22.4  /*
    22.5   * C runtime start off
    22.6   */
    22.7 -asm("					\n\
    22.8 -	.text				\n\
    22.9 -	.globl	_start			\n\
   22.10 -_start:					\n\
   22.11 -	cli				\n\
   22.12 -	movl	$stack_top, %esp	\n\
   22.13 -	movl	%esp, %ebp		\n\
   22.14 -	call    main			\n\
   22.15 -	jmp	halt			\n\
   22.16 -					\n\
   22.17 -	.globl	halt			\n\
   22.18 -halt:					\n\
   22.19 -	sti				\n\
   22.20 -	jmp	.			\n\
   22.21 -					\n\
   22.22 -	.bss				\n\
   22.23 -	.align	8			\n\
   22.24 -	.globl	stack, stack_top	\n\
   22.25 -stack:					\n\
   22.26 -	.skip	0x4000			\n\
   22.27 -stack_top:				\n\
   22.28 -");
   22.29 +asm(
   22.30 +"	.text				\n"
   22.31 +"	.globl	_start			\n"
   22.32 +"_start:				\n"
   22.33 +"	cld				\n"
   22.34 +"	cli				\n"
   22.35 +"	lgdt	gdt_desr		\n"
   22.36 +"	movl	$stack_top, %esp	\n"
   22.37 +"	movl	%esp, %ebp		\n"
   22.38 +"	call	main			\n"
   22.39 +"	jmp	halt			\n"
   22.40 +"					\n"
   22.41 +"gdt_desr:				\n"
   22.42 +"	.word	gdt_end - gdt - 1	\n"
   22.43 +"	.long	gdt			\n"
   22.44 +"					\n"
   22.45 +"	.align	8			\n"
   22.46 +"gdt:					\n"
   22.47 +"	.quad	0x0000000000000000	\n"
   22.48 +"	.quad	0x00CF92000000FFFF	\n"
   22.49 +"	.quad	0x00CF9A000000FFFF	\n"
   22.50 +"gdt_end:				\n"
   22.51 +"					\n"
   22.52 +"halt:					\n"
   22.53 +"	sti				\n"
   22.54 +"	jmp	.			\n"
   22.55 +"					\n"
   22.56 +"	.bss				\n"
   22.57 +"	.align	8			\n"
   22.58 +"stack:					\n"
   22.59 +"	.skip	0x4000			\n"
   22.60 +"stack_top:				\n"
   22.61 +);
   22.62  
   22.63  void *
   22.64  memcpy(void *dest, const void *src, unsigned n)
   22.65 @@ -95,7 +106,7 @@ cirrus_check(void)
   22.66  }
   22.67  
   22.68  int
   22.69 -main()
   22.70 +main(void)
   22.71  {
   22.72  	puts("VMXAssist Loader\n");
   22.73  	puts("Loading ROMBIOS ...\n");
    23.1 --- a/tools/ioemu/hw/cirrus_vga.c	Thu Sep 29 13:35:13 2005 -0600
    23.2 +++ b/tools/ioemu/hw/cirrus_vga.c	Thu Sep 29 16:22:02 2005 -0600
    23.3 @@ -231,6 +231,8 @@ typedef struct CirrusVGAState {
    23.4      int cirrus_linear_io_addr;
    23.5      int cirrus_linear_bitblt_io_addr;
    23.6      int cirrus_mmio_io_addr;
    23.7 +    unsigned long cirrus_lfb_addr;
    23.8 +    unsigned long cirrus_lfb_end;
    23.9      uint32_t cirrus_addr_mask;
   23.10      uint32_t linear_mmio_mask;
   23.11      uint8_t cirrus_shadow_gr0;
   23.12 @@ -2447,6 +2449,10 @@ static void cirrus_update_memory_access(
   23.13  {
   23.14      unsigned mode;
   23.15  
   23.16 +    extern void unset_vram_mapping(unsigned long addr, unsigned long end);
   23.17 +    extern void set_vram_mapping(unsigned long addr, unsigned long end);
   23.18 +    extern int vga_accelerate;
   23.19 +
   23.20      if ((s->sr[0x17] & 0x44) == 0x44) {
   23.21          goto generic_io;
   23.22      } else if (s->cirrus_srcptr != s->cirrus_srcptr_end) {
   23.23 @@ -2454,17 +2460,21 @@ static void cirrus_update_memory_access(
   23.24      } else {
   23.25  	if ((s->gr[0x0B] & 0x14) == 0x14) {
   23.26              goto generic_io;
   23.27 -	} else if (s->gr[0x0B] & 0x02) {
   23.28 -            goto generic_io;
   23.29 -        }
   23.30 -        
   23.31 -	mode = s->gr[0x05] & 0x7;
   23.32 -	if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   23.33 +    } else if (s->gr[0x0B] & 0x02) {
   23.34 +        goto generic_io;
   23.35 +    }
   23.36 +
   23.37 +    mode = s->gr[0x05] & 0x7;
   23.38 +    if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   23.39 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   23.40 +                set_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   23.41              s->cirrus_linear_write[0] = cirrus_linear_mem_writeb;
   23.42              s->cirrus_linear_write[1] = cirrus_linear_mem_writew;
   23.43              s->cirrus_linear_write[2] = cirrus_linear_mem_writel;
   23.44          } else {
   23.45          generic_io:
   23.46 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   23.47 +                 unset_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   23.48              s->cirrus_linear_write[0] = cirrus_linear_writeb;
   23.49              s->cirrus_linear_write[1] = cirrus_linear_writew;
   23.50              s->cirrus_linear_write[2] = cirrus_linear_writel;
   23.51 @@ -3058,6 +3068,8 @@ static void cirrus_pci_lfb_map(PCIDevice
   23.52      /* XXX: add byte swapping apertures */
   23.53      cpu_register_physical_memory(addr, s->vram_size,
   23.54  				 s->cirrus_linear_io_addr);
   23.55 +    s->cirrus_lfb_addr = addr;
   23.56 +    s->cirrus_lfb_end = addr + VGA_RAM_SIZE;
   23.57      cpu_register_physical_memory(addr + 0x1000000, 0x400000,
   23.58  				 s->cirrus_linear_bitblt_io_addr);
   23.59  }
    24.1 --- a/tools/ioemu/hw/pc.c	Thu Sep 29 13:35:13 2005 -0600
    24.2 +++ b/tools/ioemu/hw/pc.c	Thu Sep 29 16:22:02 2005 -0600
    24.3 @@ -385,6 +385,7 @@ void pc_init(int ram_size, int vga_ram_s
    24.4      unsigned long bios_offset, vga_bios_offset;
    24.5      int bios_size, isa_bios_size;
    24.6      PCIBus *pci_bus;
    24.7 +    extern void * shared_vram;
    24.8      
    24.9      linux_boot = (kernel_filename != NULL);
   24.10  
   24.11 @@ -511,14 +512,14 @@ void pc_init(int ram_size, int vga_ram_s
   24.12      if (cirrus_vga_enabled) {
   24.13          if (pci_enabled) {
   24.14              pci_cirrus_vga_init(pci_bus, 
   24.15 -                                ds, phys_ram_base + ram_size, ram_size, 
   24.16 +                                ds, shared_vram, ram_size, 
   24.17                                  vga_ram_size);
   24.18          } else {
   24.19 -            isa_cirrus_vga_init(ds, phys_ram_base + ram_size, ram_size, 
   24.20 +            isa_cirrus_vga_init(ds, shared_vram, ram_size, 
   24.21                                  vga_ram_size);
   24.22          }
   24.23      } else {
   24.24 -        vga_initialize(pci_bus, ds, phys_ram_base + ram_size, ram_size, 
   24.25 +        vga_initialize(pci_bus, ds, shared_vram, ram_size, 
   24.26                         vga_ram_size);
   24.27      }
   24.28  
    25.1 --- a/tools/ioemu/hw/vga.c	Thu Sep 29 13:35:13 2005 -0600
    25.2 +++ b/tools/ioemu/hw/vga.c	Thu Sep 29 16:22:02 2005 -0600
    25.3 @@ -1568,6 +1568,8 @@ void vga_update_display(void)
    25.4              s->graphic_mode = graphic_mode;
    25.5              full_update = 1;
    25.6          }
    25.7 +
    25.8 +        full_update = 1;
    25.9          switch(graphic_mode) {
   25.10          case GMODE_TEXT:
   25.11              vga_draw_text(s, full_update);
   25.12 @@ -1848,6 +1850,7 @@ void vga_common_init(VGAState *s, Displa
   25.13                       unsigned long vga_ram_offset, int vga_ram_size)
   25.14  {
   25.15      int i, j, v, b;
   25.16 +    extern void* shared_vram;
   25.17  
   25.18      for(i = 0;i < 256; i++) {
   25.19          v = 0;
   25.20 @@ -1876,7 +1879,7 @@ void vga_common_init(VGAState *s, Displa
   25.21  
   25.22      /* qemu's vga mem is not detached from phys_ram_base and can cause DM abort
   25.23       * when guest write vga mem, so allocate a new one */
   25.24 -    s->vram_ptr = qemu_mallocz(vga_ram_size);
   25.25 +    s->vram_ptr = shared_vram;
   25.26  
   25.27      s->vram_offset = vga_ram_offset;
   25.28      s->vram_size = vga_ram_size;
    26.1 --- a/tools/ioemu/target-i386-dm/helper2.c	Thu Sep 29 13:35:13 2005 -0600
    26.2 +++ b/tools/ioemu/target-i386-dm/helper2.c	Thu Sep 29 16:22:02 2005 -0600
    26.3 @@ -54,6 +54,8 @@
    26.4  #include "exec-all.h"
    26.5  #include "vl.h"
    26.6  
    26.7 +void *shared_vram;
    26.8 +
    26.9  shared_iopage_t *shared_page = NULL;
   26.10  extern int reset_requested;
   26.11  
    27.1 --- a/tools/ioemu/vl.c	Thu Sep 29 13:35:13 2005 -0600
    27.2 +++ b/tools/ioemu/vl.c	Thu Sep 29 16:22:02 2005 -0600
    27.3 @@ -134,6 +134,7 @@ int pci_enabled = 1;
    27.4  int prep_enabled = 0;
    27.5  int rtc_utc = 1;
    27.6  int cirrus_vga_enabled = 1;
    27.7 +int vga_accelerate = 1;
    27.8  int graphic_width = 800;
    27.9  int graphic_height = 600;
   27.10  int graphic_depth = 15;
   27.11 @@ -141,6 +142,12 @@ int full_screen = 0;
   27.12  TextConsole *vga_console;
   27.13  CharDriverState *serial_hds[MAX_SERIAL_PORTS];
   27.14  int xc_handle;
   27.15 +unsigned long *vgapage_array;
   27.16 +unsigned long *freepage_array;
   27.17 +unsigned long free_pages;
   27.18 +void *vtop_table;
   27.19 +unsigned long toptab;
   27.20 +unsigned long vgaram_pages;
   27.21  
   27.22  /***********************************************************/
   27.23  /* x86 ISA bus support */
   27.24 @@ -2162,6 +2169,7 @@ void help(void)
   27.25             "-isa            simulate an ISA-only system (default is PCI system)\n"
   27.26             "-std-vga        simulate a standard VGA card with VESA Bochs Extensions\n"
   27.27             "                (default is CL-GD5446 PCI VGA)\n"
   27.28 +           "-vgaacc [0|1]   1 to accelerate CL-GD5446 speed, default is 1\n"
   27.29  #endif
   27.30             "-loadvm file    start right away with a saved state (loadvm in monitor)\n"
   27.31             "\n"
   27.32 @@ -2251,6 +2259,7 @@ enum {
   27.33      QEMU_OPTION_serial,
   27.34      QEMU_OPTION_loadvm,
   27.35      QEMU_OPTION_full_screen,
   27.36 +    QEMU_OPTION_vgaacc,
   27.37  };
   27.38  
   27.39  typedef struct QEMUOption {
   27.40 @@ -2327,6 +2336,7 @@ const QEMUOption qemu_options[] = {
   27.41      { "pci", 0, QEMU_OPTION_pci },
   27.42      { "nic-pcnet", 0, QEMU_OPTION_nic_pcnet },
   27.43      { "cirrusvga", 0, QEMU_OPTION_cirrusvga },
   27.44 +    { "vgaacc", HAS_ARG, QEMU_OPTION_vgaacc },
   27.45      { NULL },
   27.46  };
   27.47  
   27.48 @@ -2343,6 +2353,177 @@ static uint8_t *signal_stack;
   27.49  #define NET_IF_USER  1
   27.50  #define NET_IF_DUMMY 2
   27.51  
   27.52 +#include <xg_private.h>
   27.53 +
   27.54 +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   27.55 +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   27.56 +
   27.57 +#ifdef __i386__
   27.58 +#define _LEVEL_3_ 0
   27.59 +#else
   27.60 +#define _LEVEL_3_ 1
   27.61 +#endif
   27.62 +
   27.63 +#if _LEVEL_3_
   27.64 +#define L3_PROT (_PAGE_PRESENT)
   27.65 +#define L1_PAGETABLE_ENTRIES    512
   27.66 +#else
   27.67 +#define L1_PAGETABLE_ENTRIES    1024
   27.68 +#endif
   27.69 +
   27.70 +inline int
   27.71 +get_vl2_table(unsigned long count, unsigned long start)
   27.72 +{
   27.73 +#if _LEVEL_3_
   27.74 +    return ((start + (count << PAGE_SHIFT)) >> L3_PAGETABLE_SHIFT) & 0x3;
   27.75 +#else
   27.76 +    return 0;
   27.77 +#endif
   27.78 +}
   27.79 +
   27.80 +int
   27.81 +setup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long  *mem_page_array, unsigned long *page_table_array, unsigned long v_start, unsigned long v_end)
   27.82 +{
   27.83 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
   27.84 +    l2_pgentry_t *vl2tab[4], *vl2e=NULL, *vl2_table = NULL;
   27.85 +    unsigned long l1tab;
   27.86 +    unsigned long ppt_alloc = 0;
   27.87 +    unsigned long count;
   27.88 +    int i = 0;
   27.89 +#if _LEVEL_3_
   27.90 +    l3_pgentry_t *vl3tab = NULL;
   27.91 +    unsigned long l2tab;
   27.92 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
   27.93 +                                        PROT_READ|PROT_WRITE, 
   27.94 +                                        toptab >> PAGE_SHIFT)) == NULL )
   27.95 +        goto error_out;
   27.96 +    for (i = 0; i < 4 ; i++) {
   27.97 +        l2tab = vl3tab[i] & PAGE_MASK;
   27.98 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   27.99 +          PROT_READ|PROT_WRITE,
  27.100 +          l2tab >> PAGE_SHIFT);
  27.101 +        if(vl2tab[i] == NULL)
  27.102 +            goto error_out;
  27.103 +    }
  27.104 +    munmap(vl3tab, PAGE_SIZE);
  27.105 +    vl3tab = NULL;
  27.106 +#else
  27.107 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  27.108 +                                           PROT_READ|PROT_WRITE, 
  27.109 +                                           toptab >> PAGE_SHIFT)) == NULL )
  27.110 +        goto error_out;
  27.111 +#endif
  27.112 +
  27.113 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
  27.114 +    {
  27.115 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  27.116 +        {
  27.117 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  27.118 +            vl2e = &vl2_table[l2_table_offset(
  27.119 +                v_start + (count << PAGE_SHIFT))];
  27.120 +
  27.121 +            l1tab = page_table_array[ppt_alloc++] << PAGE_SHIFT;
  27.122 +            if ( vl1tab != NULL )
  27.123 +                munmap(vl1tab, PAGE_SIZE);
  27.124 +
  27.125 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  27.126 +                                                PROT_READ|PROT_WRITE,
  27.127 +                                                l1tab >> PAGE_SHIFT)) == NULL )
  27.128 +            {
  27.129 +                goto error_out;
  27.130 +            }
  27.131 +            memset(vl1tab, 0, PAGE_SIZE);
  27.132 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  27.133 +            *vl2e = l1tab | L2_PROT;
  27.134 +        }
  27.135 +
  27.136 +        *vl1e = (mem_page_array[count] << PAGE_SHIFT) | L1_PROT;
  27.137 +        vl1e++;
  27.138 +    }
  27.139 +error_out:
  27.140 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
  27.141 +    for(i = 0; i < 4; i++)
  27.142 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  27.143 +    return ppt_alloc;
  27.144 +}
  27.145 +/*
         + * Undo a mapping of the guest-virtual range [v_start, v_end) in the page
         + * tables rooted at toptab (a byte address; toptab >> PAGE_SHIFT is the
         + * frame that gets mapped).
         + *
         + * For every page in the range the L1 entry is zeroed.  The L2 entry that
         + * points at an L1 table is cleared at the moment that L1 table is first
         + * mapped.  Pages whose L2 entry is already empty are skipped.
         + *
         + * Nothing is returned: if any xc_map_foreign_range() call fails the walk
         + * stops where it is and only the temporary mappings are released.
         + */
  27.146 +void
  27.147 +unsetup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long v_start, unsigned long v_end)
  27.148 +{
  27.149 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
         +    /* Every slot starts out NULL: the cleanup loop at error_out unmaps all
         +     * non-NULL slots, the non-_LEVEL_3_ path fills in slot 0 only, and an
         +     * early failure may leave any number of slots unfilled. */
  27.150 +    l2_pgentry_t *vl2tab[4] = { NULL, NULL, NULL, NULL };
         +    l2_pgentry_t *vl2e=NULL, *vl2_table = NULL;
  27.151 +    unsigned long l1tab;
  27.152 +    unsigned long count;
  27.153 +    int i = 0;
  27.154 +#if _LEVEL_3_
  27.155 +    l3_pgentry_t *vl3tab = NULL;
  27.156 +    unsigned long l2tab;
         +    /* Map the top-level table, then each of the four L2 tables it names. */
  27.157 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  27.158 +                                        PROT_READ|PROT_WRITE,
  27.159 +                                        toptab >> PAGE_SHIFT)) == NULL )
  27.160 +        goto error_out;
  27.161 +    for (i = 0; i < 4 ; i ++){
  27.162 +        l2tab = vl3tab[i] & PAGE_MASK;
  27.163 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  27.164 +          PROT_READ|PROT_WRITE,
  27.165 +          l2tab >> PAGE_SHIFT);
  27.166 +        if(vl2tab[i] == NULL)
  27.167 +            goto error_out;
  27.168 +    }
  27.169 +    munmap(vl3tab, PAGE_SIZE);
  27.170 +    vl3tab = NULL;
  27.171 +#else
         +    /* Here toptab itself is the single L2 table. */
  27.172 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  27.173 +                                        PROT_READ|PROT_WRITE,
  27.174 +                                        toptab >> PAGE_SHIFT)) == NULL )
  27.175 +        goto error_out;
  27.176 +#endif
  27.177 +
  27.178 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ){
         +        /* vl1e is NULL at the start and page-aligned again once it steps
         +         * past the last slot of an L1 table, so this branch is taken
         +         * whenever a new L1 table has to be looked up. */
  27.179 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  27.180 +        {
  27.181 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  27.182 +            vl2e = &vl2_table[l2_table_offset(v_start + (count << PAGE_SHIFT))];
  27.183 +            l1tab = *vl2e & PAGE_MASK;
  27.184 +
         +            /* No L1 table behind this L2 entry: nothing to clear for
         +             * this page. */
  27.185 +            if(l1tab == 0)
  27.186 +                continue;
  27.187 +            if ( vl1tab != NULL )
  27.188 +                munmap(vl1tab, PAGE_SIZE);
  27.189 +
  27.190 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  27.191 +                      PROT_READ|PROT_WRITE,
  27.192 +                      l1tab >> PAGE_SHIFT)) == NULL )
  27.193 +            {
  27.194 +                goto error_out;
  27.195 +            }
  27.196 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  27.197 +            *vl2e = 0;
  27.198 +        }
  27.199 +
  27.200 +        *vl1e = 0;
  27.201 +        vl1e++;
  27.202 +    }
         +    /* Reached on success as well as on failure: release every temporary
         +     * mapping that is still held. */
  27.203 +error_out:
  27.204 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
         +#if _LEVEL_3_
         +    /* Non-NULL only when mapping one of the L2 tables failed above. */
         +    if(vl3tab)  munmap(vl3tab, PAGE_SIZE);
         +#endif
  27.205 +    for(i = 0; i < 4; i++)
  27.206 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  27.207 +}
  27.208 +/*
         + * Set up the VGA RAM mapping that starts at addr by calling
         + * setup_mapping() with the file-level xc_handle, domid, toptab,
         + * vgapage_array and freepage_array.
         + *
         + * The value passed in `end` is not used: it is overwritten with
         + * addr + VGA_RAM_SIZE before the call, so the mapped range always has
         + * that fixed length.
         + */
  27.209 +void set_vram_mapping(unsigned long addr, unsigned long end)
  27.210 +{
  27.211 +    end = addr + VGA_RAM_SIZE;
  27.212 +    setup_mapping(xc_handle, domid, toptab,
  27.213 +      vgapage_array, freepage_array, addr, end);
  27.214 +}
  27.215 +/*
         + * Remove the VGA RAM mapping that starts at addr by calling
         + * unsetup_mapping() with the file-level xc_handle, domid and toptab.
         + *
         + * The value passed in `end` is not used: it is overwritten with
         + * addr + VGA_RAM_SIZE before the call.
         + */
  27.216 +void unset_vram_mapping(unsigned long addr, unsigned long end)
  27.217 +{
  27.218 +    end = addr + VGA_RAM_SIZE;
  27.219 +    /* FIXME Flush the shadow page */
  27.220 +    unsetup_mapping(xc_handle, domid, toptab, addr, end);
  27.221 +}
  27.222 +
  27.223  int main(int argc, char **argv)
  27.224  {
  27.225  #ifdef CONFIG_GDBSTUB
  27.226 @@ -2366,8 +2547,9 @@ int main(int argc, char **argv)
  27.227      char serial_devices[MAX_SERIAL_PORTS][128];
  27.228      int serial_device_index;
  27.229      const char *loadvm = NULL;
  27.230 -    unsigned long nr_pages, *page_array;
  27.231 +    unsigned long nr_pages, extra_pages, ram_pages, *page_array;
  27.232      extern void *shared_page;
  27.233 +    extern void *shared_vram;
  27.234      /* change the qemu-dm to daemon, just like bochs dm */
  27.235  //    daemon(0, 0);
  27.236      
  27.237 @@ -2674,6 +2856,17 @@ int main(int argc, char **argv)
  27.238              case QEMU_OPTION_cirrusvga:
  27.239                  cirrus_vga_enabled = 1;
  27.240                  break;
  27.241 +            case QEMU_OPTION_vgaacc:
  27.242 +                {
  27.243 +                    const char *p;
  27.244 +                    p = optarg;
  27.245 +                    vga_accelerate = strtol(p, (char **)&p, 0);
  27.246 +                    if (*p != '\0') {
  27.247 +                        fprintf(stderr, "qemu: invalid vgaacc option\n");
  27.248 +                        exit(1);
  27.249 +                    }
  27.250 +                    break;
  27.251 +                }
  27.252              case QEMU_OPTION_std_vga:
  27.253                  cirrus_vga_enabled = 0;
  27.254                  break;
  27.255 @@ -2803,12 +2996,25 @@ int main(int argc, char **argv)
  27.256      /* init the memory */
  27.257      phys_ram_size = ram_size + vga_ram_size + bios_size;
  27.258  
  27.259 -    #define PAGE_SHIFT 12
  27.260 -    #define PAGE_SIZE  (1 << PAGE_SHIFT)
  27.261 -
  27.262 -    nr_pages = ram_size/PAGE_SIZE;
  27.263 +    ram_pages = ram_size/PAGE_SIZE;
  27.264 +    vgaram_pages =  (vga_ram_size -1)/PAGE_SIZE + 1;
  27.265 +    free_pages = vgaram_pages / L1_PAGETABLE_ENTRIES;
  27.266 +    extra_pages = vgaram_pages + free_pages;
  27.267 +
  27.268      xc_handle = xc_interface_open();
  27.269 -    
  27.270 +
  27.271 +    xc_dominfo_t info;
  27.272 +    xc_domain_getinfo(xc_handle, domid, 1, &info);
  27.273 +
  27.274 +    nr_pages = info.nr_pages + extra_pages;
  27.275 +
  27.276 +    if ( xc_domain_setmaxmem(xc_handle, domid,
  27.277 +            (nr_pages) * PAGE_SIZE/1024 ) != 0)
  27.278 +    {
  27.279 +        perror("set maxmem");
  27.280 +        exit(-1);
  27.281 +    }
  27.282 +   
  27.283      if ( (page_array = (unsigned long *)
  27.284  	  malloc(nr_pages * sizeof(unsigned long))) == NULL)
  27.285      {
  27.286 @@ -2816,6 +3022,12 @@ int main(int argc, char **argv)
  27.287  	    exit(-1);
  27.288      }
  27.289  
  27.290 +    if (xc_domain_memory_increase_reservation(xc_handle, domid, 
  27.291 +          extra_pages , 0, 0, NULL) != 0) {
  27.292 +        perror("increase reservation");
  27.293 +        exit(-1);
  27.294 +    }
  27.295 +
  27.296      if ( xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages )
  27.297      {
  27.298  	    perror("xc_get_pfn_list");
  27.299 @@ -2825,15 +3037,36 @@ int main(int argc, char **argv)
  27.300      if ((phys_ram_base =  xc_map_foreign_batch(xc_handle, domid,
  27.301  						 PROT_READ|PROT_WRITE,
  27.302  						 page_array,
  27.303 -						 nr_pages - 1)) == 0) {
  27.304 +						 ram_pages - 1)) == 0) {
  27.305  	    perror("xc_map_foreign_batch");
  27.306  	    exit(-1);
  27.307      }
  27.308  
  27.309      shared_page = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  27.310  				       PROT_READ|PROT_WRITE,
  27.311 -				       page_array[nr_pages - 1]);
  27.312 -
  27.313 + 				       page_array[ram_pages - 1]);
  27.314 +
  27.315 +    vgapage_array = &page_array[nr_pages - vgaram_pages];
  27.316 +
  27.317 +    if ((shared_vram =  xc_map_foreign_batch(xc_handle, domid,
  27.318 + 						 PROT_READ|PROT_WRITE,
  27.319 + 						 vgapage_array,
  27.320 + 						 vgaram_pages)) == 0) {
  27.321 + 	    perror("xc_map_foreign_batch vgaram ");
  27.322 + 	    exit(-1);
  27.323 +     }
  27.324 +
  27.325 +
  27.326 +
  27.327 +    memset(shared_vram, 0, vgaram_pages * PAGE_SIZE);
  27.328 +    toptab = page_array[ram_pages] << PAGE_SHIFT;
  27.329 +
  27.330 +    vtop_table = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  27.331 +				       PROT_READ|PROT_WRITE,
  27.332 + 				       page_array[ram_pages]);
  27.333 +
  27.334 +    freepage_array = &page_array[nr_pages - extra_pages];
  27.335 + 
  27.336  
  27.337      fprintf(logfile, "shared page at pfn:%lx, mfn: %lx\n", (nr_pages-1), 
  27.338             (page_array[nr_pages - 1]));
    28.1 --- a/tools/libxc/linux_boot_params.h	Thu Sep 29 13:35:13 2005 -0600
    28.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.3 @@ -1,166 +0,0 @@
    28.4 -#ifndef __LINUX_BOOT_PARAMS_H__
    28.5 -#define __LINUX_BOOT_PARAMS_H__
    28.6 -
    28.7 -#include <asm/types.h>
    28.8 -
    28.9 -#define E820MAX	32
   28.10 -
   28.11 -struct mem_map {
   28.12 -    int nr_map;
   28.13 -    struct entry {
   28.14 -        u64 addr;	/* start of memory segment */
   28.15 -        u64 size;	/* size of memory segment */
   28.16 -        u32 type;		/* type of memory segment */
   28.17 -#define E820_RAM        1
   28.18 -#define E820_RESERVED   2
   28.19 -#define E820_ACPI       3 /* usable as RAM once ACPI tables have been read */
   28.20 -#define E820_NVS        4
   28.21 -#define E820_IO         16
   28.22 -#define E820_SHARED     17
   28.23 -#define E820_XENSTORE   18
   28.24 -
   28.25 -        u32 caching_attr;    /* used by hypervisor */
   28.26 -#define MEMMAP_UC	0
   28.27 -#define MEMMAP_WC	1
   28.28 -#define MEMMAP_WT	4
   28.29 -#define MEMMAP_WP	5
   28.30 -#define MEMMAP_WB	6
   28.31 -
   28.32 -    }map[E820MAX];
   28.33 -};
   28.34 -
   28.35 -struct e820entry {
   28.36 -	u64 addr;	/* start of memory segment */
   28.37 -	u64 size;	/* size of memory segment */
   28.38 -	u32 type;	/* type of memory segment */
   28.39 -}__attribute__((packed));
   28.40 -
   28.41 -struct e820map {
   28.42 -    u32 nr_map;
   28.43 -    struct e820entry map[E820MAX];
   28.44 -}__attribute__((packed));
   28.45 -
   28.46 -struct drive_info_struct { __u8 dummy[32]; }; 
   28.47 -
   28.48 -struct sys_desc_table { 
   28.49 -    __u16 length; 
   28.50 -    __u8 table[318]; 
   28.51 -}; 
   28.52 -
   28.53 -struct screen_info {
   28.54 -    unsigned char  orig_x;		/* 0x00 */
   28.55 -    unsigned char  orig_y;		/* 0x01 */
   28.56 -    unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
   28.57 -    unsigned short orig_video_page;	/* 0x04 */
   28.58 -    unsigned char  orig_video_mode;	/* 0x06 */
   28.59 -    unsigned char  orig_video_cols;	/* 0x07 */
   28.60 -    unsigned short unused2;		/* 0x08 */
   28.61 -    unsigned short orig_video_ega_bx;	/* 0x0a */
   28.62 -    unsigned short unused3;		/* 0x0c */
   28.63 -    unsigned char  orig_video_lines;	/* 0x0e */
   28.64 -    unsigned char  orig_video_isVGA;	/* 0x0f */
   28.65 -    unsigned short orig_video_points;	/* 0x10 */
   28.66 -    
   28.67 -    /* VESA graphic mode -- linear frame buffer */
   28.68 -    unsigned short lfb_width;		/* 0x12 */
   28.69 -    unsigned short lfb_height;		/* 0x14 */
   28.70 -    unsigned short lfb_depth;		/* 0x16 */
   28.71 -    unsigned int   lfb_base;		/* 0x18 */
   28.72 -    unsigned int   lfb_size;		/* 0x1c */
   28.73 -    unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
   28.74 -    unsigned short lfb_linelength;	/* 0x24 */
   28.75 -    unsigned char  red_size;		/* 0x26 */
   28.76 -    unsigned char  red_pos;		/* 0x27 */
   28.77 -    unsigned char  green_size;		/* 0x28 */
   28.78 -    unsigned char  green_pos;		/* 0x29 */
   28.79 -    unsigned char  blue_size;		/* 0x2a */
   28.80 -    unsigned char  blue_pos;		/* 0x2b */
   28.81 -    unsigned char  rsvd_size;		/* 0x2c */
   28.82 -    unsigned char  rsvd_pos;		/* 0x2d */
   28.83 -    unsigned short vesapm_seg;		/* 0x2e */
   28.84 -    unsigned short vesapm_off;		/* 0x30 */
   28.85 -    unsigned short pages;		/* 0x32 */
   28.86 -					/* 0x34 -- 0x3f reserved for future expansion */
   28.87 -};
   28.88 -
   28.89 -struct screen_info_overlap { 
   28.90 -    __u8 reserved1[2]; /* 0x00 */ 
   28.91 -    __u16 ext_mem_k; /* 0x02 */ 
   28.92 -    __u8 reserved2[0x20 - 0x04]; /* 0x04 */ 
   28.93 -    __u16 cl_magic; /* 0x20 */ 
   28.94 -#define CL_MAGIC_VALUE 0xA33F 
   28.95 -    __u16 cl_offset; /* 0x22 */ 
   28.96 -    __u8 reserved3[0x40 - 0x24]; /* 0x24 */ 
   28.97 -}; 
   28.98 -
   28.99 -
  28.100 -struct apm_bios_info {
  28.101 -    __u16 version;
  28.102 -    __u16  cseg;
  28.103 -    __u32   offset;
  28.104 -    __u16  cseg_16;
  28.105 -    __u16  dseg;
  28.106 -    __u16  flags;
  28.107 -    __u16  cseg_len;
  28.108 -    __u16  cseg_16_len;
  28.109 -    __u16  dseg_len;
  28.110 -};
  28.111 - 
  28.112 -struct linux_boot_params { 
  28.113 -    union { /* 0x00 */ 
  28.114 -       struct screen_info info; 
  28.115 -       struct screen_info_overlap overlap; 
  28.116 -    } screen; 
  28.117 - 
  28.118 -    struct apm_bios_info apm_bios_info; /* 0x40 */ 
  28.119 -    __u8 reserved4[0x80 - 0x54]; /* 0x54 */ 
  28.120 -    struct drive_info_struct drive_info; /* 0x80 */ 
  28.121 -    struct sys_desc_table sys_desc_table; /* 0xa0 */ 
  28.122 -    __u32 alt_mem_k; /* 0x1e0 */ 
  28.123 -    __u8 reserved5[4]; /* 0x1e4 */ 
  28.124 -    __u8 e820_map_nr; /* 0x1e8 */ 
  28.125 -    __u8 reserved6[8]; /* 0x1e9 */ 
  28.126 -    __u8 setup_sects; /* 0x1f1 */ 
  28.127 -    __u16 mount_root_rdonly; /* 0x1f2 */ 
  28.128 -    __u16 syssize; /* 0x1f4 */ 
  28.129 -    __u16 swapdev; /* 0x1f6 */ 
  28.130 -    __u16 ramdisk_flags; /* 0x1f8 */ 
  28.131 -#define RAMDISK_IMAGE_START_MASK 0x07FF 
  28.132 -#define RAMDISK_PROMPT_FLAG 0x8000 
  28.133 -#define RAMDISK_LOAD_FLAG 0x4000 
  28.134 -    __u16 vid_mode; /* 0x1fa */ 
  28.135 -    __u16 root_dev; /* 0x1fc */ 
  28.136 -    __u8 reserved9[1]; /* 0x1fe */ 
  28.137 -    __u8 aux_device_info; /* 0x1ff */ 
  28.138 -    /* 2.00+ */ 
  28.139 -    __u8 reserved10[2]; /* 0x200 */ 
  28.140 -    __u8 header_magic[4]; /* 0x202 */ 
  28.141 -    __u16 protocol_version; /* 0x206 */ 
  28.142 -    __u8 reserved11[8]; /* 0x208 */ 
  28.143 -    __u8 loader_type; /* 0x210 */ 
  28.144 -#define LOADER_TYPE_LOADLIN 1 
  28.145 -#define LOADER_TYPE_BOOTSECT_LOADER 2 
  28.146 -#define LOADER_TYPE_SYSLINUX 3 
  28.147 -#define LOADER_TYPE_ETHERBOOT 4 
  28.148 -#define LOADER_TYPE_UNKNOWN 0xFF 
  28.149 -    __u8 loader_flags; /* 0x211 */ 
  28.150 -    __u8 reserved12[2]; /* 0x212 */ 
  28.151 -    __u32 code32_start; /* 0x214 */ 
  28.152 -    __u32 initrd_start; /* 0x218 */ 
  28.153 -    __u32 initrd_size; /* 0x21c */ 
  28.154 -    __u8 reserved13[4]; /* 0x220 */ 
  28.155 -    /* 2.01+ */ 
  28.156 -    __u16 heap_end_ptr; /* 0x224 */ 
  28.157 -    __u8 reserved14[2]; /* 0x226 */ 
  28.158 -    /* 2.02+ */ 
  28.159 -    __u32 cmd_line_ptr; /* 0x228 */ 
  28.160 -    /* 2.03+ */ 
  28.161 -    __u32 ramdisk_max; /* 0x22c */ 
  28.162 -    __u8 reserved15[0x2d0 - 0x230]; /* 0x230 */ 
  28.163 -    struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 
  28.164 -    __u64 shared_info; /* 0x550 */
  28.165 -    __u8 padding[0x800 - 0x558]; /* 0x558 */ 
  28.166 -    __u8 cmd_line[0x800]; /* 0x800 */
  28.167 -} __attribute__((packed)); 
  28.168 -
  28.169 -#endif /* __LINUX_BOOT_PARAMS_H__ */
    29.1 --- a/tools/libxc/xc_vmx_build.c	Thu Sep 29 13:35:13 2005 -0600
    29.2 +++ b/tools/libxc/xc_vmx_build.c	Thu Sep 29 16:22:02 2005 -0600
    29.3 @@ -10,7 +10,8 @@
    29.4  #include <unistd.h>
    29.5  #include <zlib.h>
    29.6  #include <xen/io/ioreq.h>
    29.7 -#include "linux_boot_params.h"
    29.8 +
    29.9 +#define VMX_LOADER_ENTR_ADDR  0x00100000
   29.10  
   29.11  #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   29.12  #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   29.13 @@ -18,13 +19,29 @@
   29.14  #define L3_PROT (_PAGE_PRESENT)
   29.15  #endif
   29.16  
    29.17 +#define E820MAX	128
    29.18 +
    29.19 +#define E820_RAM          1  /* range type codes stored in e820entry.type */
    29.20 +#define E820_RESERVED     2
    29.21 +#define E820_ACPI         3
    29.22 +#define E820_NVS          4
    29.23 +#define E820_IO          16
    29.24 +#define E820_SHARED_PAGE 17  /* build_e820map tags the shared ioreq_t page with this */
    29.25 +#define E820_XENSTORE    18  /* build_e820map tags the xenstore page with this */
    29.26 +
    29.27 +#define E820_MAP_PAGE        0x00090000  /* NOTE(review): presumably the guest address of the E820 page (same value as the removed LINUX_BOOT_PARAMS_ADDR) - confirm at the use site */
    29.28 +#define E820_MAP_NR_OFFSET   0x000001E8  /* byte offset within the E820 page of the one-byte entry count */
    29.29 +#define E820_MAP_OFFSET      0x000002D0  /* byte offset within the E820 page of the e820entry array */
    29.30 +/* One memory-map entry; declared packed, so the fields are laid out without padding. */
    29.31 +struct e820entry {
    29.32 +    u64 addr;  /* start of memory segment */
    29.33 +    u64 size;  /* size of memory segment */
    29.34 +    u32 type;  /* type of memory segment (one of the E820_* codes) */
    29.35 +} __attribute__((packed));
   29.36 +
   29.37  #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
   29.38  #define round_pgdown(_p)  ((_p)&PAGE_MASK)
   29.39  
   29.40 -#define LINUX_BOOT_PARAMS_ADDR   0x00090000
   29.41 -#define LINUX_KERNEL_ENTR_ADDR   0x00100000
   29.42 -#define LINUX_PAGE_OFFSET        0xC0000000
   29.43 -
   29.44  static int
   29.45  parseelfimage(
   29.46      char *elfbase, unsigned long elfsize, struct domain_setup_info *dsi);
   29.47 @@ -33,78 +50,70 @@ loadelfimage(
   29.48      char *elfbase, int xch, u32 dom, unsigned long *parray,
   29.49      struct domain_setup_info *dsi);
    29.50  /*
          + * Write the guest memory map into e820_page and return the number of
          + * entries written.
          + *
          + * e820_page: start of a writable buffer; the entries are stored as an
          + *            array of struct e820entry beginning E820_MAP_OFFSET bytes
          + *            into it, and the entry count is stored as a single byte at
          + *            E820_MAP_NR_OFFSET.
          + * mem_size:  used as a byte address/size; the RAM above 1MB and the
          + *            special pages are placed relative to it.
          + *
          + * Entries written, in order: RAM at 0, reserved at 0x9F800, IO at
          + * 0xA0000, reserved at 0xF0000, RAM from 0x100000 up to STATIC_PAGES
          + * pages below mem_size, the shared ioreq_t page (last page below
          + * mem_size), the xenstore page (the page before it), NVS (3 pages at
          + * mem_size), ACPI (0xA pages after that) and IO at 0xFEC00000.
          + */
    29.51 -static void build_e820map(struct mem_map *mem_mapp, unsigned long mem_size)
    29.52 +static unsigned char build_e820map(void *e820_page, unsigned long mem_size)
    29.53  {
    29.54 -    int nr_map = 0;
    29.55 +    struct e820entry *e820entry =
    29.56 +        (struct e820entry *)(((unsigned char *)e820_page) + E820_MAP_OFFSET);
    29.57 +    unsigned char nr_map = 0;
    29.58  
    29.59      /* XXX: Doesn't work for > 4GB yet */
    29.60 -    mem_mapp->map[nr_map].addr = 0x0;
    29.61 -    mem_mapp->map[nr_map].size = 0x9F800;
    29.62 -    mem_mapp->map[nr_map].type = E820_RAM;
    29.63 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
    29.64 +    e820entry[nr_map].addr = 0x0;
    29.65 +    e820entry[nr_map].size = 0x9F800;
    29.66 +    e820entry[nr_map].type = E820_RAM;
    29.67      nr_map++;
    29.68  
    29.69 -    mem_mapp->map[nr_map].addr = 0x9F800;
    29.70 -    mem_mapp->map[nr_map].size = 0x800;
    29.71 -    mem_mapp->map[nr_map].type = E820_RESERVED;
    29.72 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
    29.73 +    e820entry[nr_map].addr = 0x9F800;
    29.74 +    e820entry[nr_map].size = 0x800;
    29.75 +    e820entry[nr_map].type = E820_RESERVED;
    29.76      nr_map++;
    29.77  
    29.78 -    mem_mapp->map[nr_map].addr = 0xA0000;
    29.79 -    mem_mapp->map[nr_map].size = 0x20000;
    29.80 -    mem_mapp->map[nr_map].type = E820_IO;
    29.81 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
    29.82 +    e820entry[nr_map].addr = 0xA0000;
    29.83 +    e820entry[nr_map].size = 0x20000;
    29.84 +    e820entry[nr_map].type = E820_IO;
    29.85      nr_map++;
    29.86  
    29.87 -    mem_mapp->map[nr_map].addr = 0xF0000;
    29.88 -    mem_mapp->map[nr_map].size = 0x10000;
    29.89 -    mem_mapp->map[nr_map].type = E820_RESERVED;
    29.90 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
    29.91 +    e820entry[nr_map].addr = 0xF0000;
    29.92 +    e820entry[nr_map].size = 0x10000;
    29.93 +    e820entry[nr_map].type = E820_RESERVED;
    29.94      nr_map++;
    29.95  
    29.96  #define STATIC_PAGES    2       /* for ioreq_t and store_mfn */
    29.97      /* Most of the ram goes here */
    29.98 -    mem_mapp->map[nr_map].addr = 0x100000;
    29.99 -    mem_mapp->map[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE;
   29.100 -    mem_mapp->map[nr_map].type = E820_RAM;
   29.101 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   29.102 +    e820entry[nr_map].addr = 0x100000;
   29.103 +    e820entry[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE;
   29.104 +    e820entry[nr_map].type = E820_RAM;
   29.105      nr_map++;
   29.106  
   29.107      /* Statically allocated special pages */
   29.108  
   29.109      /* Shared ioreq_t page */
   29.110 -    mem_mapp->map[nr_map].addr = mem_size - PAGE_SIZE;
   29.111 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
   29.112 -    mem_mapp->map[nr_map].type = E820_SHARED;
   29.113 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   29.114 +    e820entry[nr_map].addr = mem_size - PAGE_SIZE;
   29.115 +    e820entry[nr_map].size = PAGE_SIZE;
   29.116 +    e820entry[nr_map].type = E820_SHARED_PAGE;
   29.117      nr_map++;
   29.118  
   29.119      /* For xenstore */
   29.120 -    mem_mapp->map[nr_map].addr = mem_size - 2*PAGE_SIZE;
   29.121 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
   29.122 -    mem_mapp->map[nr_map].type = E820_XENSTORE;
   29.123 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   29.124 +    e820entry[nr_map].addr = mem_size - 2*PAGE_SIZE;
   29.125 +    e820entry[nr_map].size = PAGE_SIZE;
   29.126 +    e820entry[nr_map].type = E820_XENSTORE;
   29.127      nr_map++;
   29.128  
   29.129 -    mem_mapp->map[nr_map].addr = mem_size;
   29.130 -    mem_mapp->map[nr_map].size = 0x3 * PAGE_SIZE;
   29.131 -    mem_mapp->map[nr_map].type = E820_NVS;
   29.132 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   29.133 +    e820entry[nr_map].addr = mem_size;
   29.134 +    e820entry[nr_map].size = 0x3 * PAGE_SIZE;
   29.135 +    e820entry[nr_map].type = E820_NVS;
   29.136      nr_map++;
   29.137  
   29.138 -    mem_mapp->map[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
   29.139 -    mem_mapp->map[nr_map].size = 0xA * PAGE_SIZE;
   29.140 -    mem_mapp->map[nr_map].type = E820_ACPI;
   29.141 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   29.142 +    e820entry[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
   29.143 +    e820entry[nr_map].size = 0xA * PAGE_SIZE;
   29.144 +    e820entry[nr_map].type = E820_ACPI;
   29.145      nr_map++;
   29.146  
   29.147 -    mem_mapp->map[nr_map].addr = 0xFEC00000;
   29.148 -    mem_mapp->map[nr_map].size = 0x1400000;
   29.149 -    mem_mapp->map[nr_map].type = E820_IO;
   29.150 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   29.151 +    e820entry[nr_map].addr = 0xFEC00000;
   29.152 +    e820entry[nr_map].size = 0x1400000;
   29.153 +    e820entry[nr_map].type = E820_IO;
   29.154      nr_map++;
   29.155  
   29.156 -    mem_mapp->nr_map = nr_map;
   29.157 +    return (*(((unsigned char *)e820_page) + E820_MAP_NR_OFFSET) = nr_map); /* store the count in the page and return the same value */
   29.158  }
  29.159  
  29.160  /*
  29.161 @@ -112,19 +121,19 @@ static void build_e820map(struct mem_map
  29.162   * vmxloader will use it to config ACPI MADT table
  29.163   */
  29.164  #define VCPU_MAGIC 0x76637075 /* "vcpu" */
  29.165 -static int 
  29.166 -set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list, 
  29.167 +static int
  29.168 +set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list,
  29.169               struct domain_setup_info *dsi, unsigned long vcpus)
  29.170  {
  29.171      char          *va_map;
  29.172      unsigned long *va_vcpus;
  29.173 -    
  29.174 +
  29.175      va_map = xc_map_foreign_range(
  29.176          xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  29.177 -        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);    
  29.178 +        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);
  29.179      if ( va_map == NULL )
  29.180          return -1;
  29.181 -    
  29.182 +
  29.183      va_vcpus = (unsigned long *)(va_map + 0x800);
  29.184      *va_vcpus++ = VCPU_MAGIC;
  29.185      *va_vcpus++ = vcpus;
  29.186 @@ -164,24 +173,23 @@ static int zap_mmio_range(int xc_handle,
  29.187      return 0;
  29.188  }
  29.189  
  29.190 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  29.191 -                           unsigned long l2tab,
  29.192 -                           struct mem_map *mem_mapp)
  29.193 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l2tab,
  29.194 +                           unsigned char e820_map_nr, unsigned char *e820map)
  29.195  {
  29.196 -    int i;
  29.197 +    unsigned int i;
  29.198 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  29.199 +
  29.200      l2_pgentry_32_t *vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  29.201                                                     PROT_READ|PROT_WRITE,
  29.202                                                     l2tab >> PAGE_SHIFT);
  29.203      if ( vl2tab == 0 )
  29.204          return -1;
  29.205  
  29.206 -    for ( i = 0; i < mem_mapp->nr_map; i++ )
  29.207 +    for ( i = 0; i < e820_map_nr; i++ )
  29.208      {
  29.209 -        if ( (mem_mapp->map[i].type == E820_IO) &&
  29.210 -             (mem_mapp->map[i].caching_attr == MEMMAP_UC) &&
  29.211 +        if ( (e820entry[i].type == E820_IO) &&
  29.212               (zap_mmio_range(xc_handle, dom, vl2tab,
  29.213 -                             mem_mapp->map[i].addr,
  29.214 -                             mem_mapp->map[i].size) == -1) )
  29.215 +                             e820entry[i].addr, e820entry[i].size) == -1))
  29.216              return -1;
  29.217      }
  29.218  
  29.219 @@ -200,7 +208,7 @@ static int zap_mmio_range(int xc_handle,
  29.220      unsigned long vl3e;
  29.221      l1_pgentry_t *vl1tab;
  29.222      l2_pgentry_t *vl2tab;
  29.223 - 
  29.224 +
  29.225      mmio_addr = mmio_range_start & PAGE_MASK;
  29.226      for ( ; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE )
  29.227      {
  29.228 @@ -239,22 +247,22 @@ static int zap_mmio_range(int xc_handle,
  29.229      return 0;
  29.230  }
  29.231  
  29.232 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  29.233 -                           unsigned long l3tab,
  29.234 -                           struct mem_map *mem_mapp)
  29.235 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l3tab,
  29.236 +                           unsigned char e820_map_nr, unsigned char *e820map)
  29.237  {
  29.238 -    int i;
  29.239 +    unsigned int i;
  29.240 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  29.241 +
  29.242      l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  29.243                                                  PROT_READ|PROT_WRITE,
  29.244                                                  l3tab >> PAGE_SHIFT);
  29.245      if (vl3tab == 0)
  29.246          return -1;
  29.247 -    for (i = 0; i < mem_mapp->nr_map; i++) {
  29.248 -        if ((mem_mapp->map[i].type == E820_IO)
  29.249 -            && (mem_mapp->map[i].caching_attr == MEMMAP_UC))
  29.250 -            if (zap_mmio_range(xc_handle, dom, vl3tab,
  29.251 -                               mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1)
  29.252 -                return -1;
  29.253 +    for ( i = 0; i < e820_map_nr; i++ ) {
  29.254 +        if ( (e820entry[i].type == E820_IO) &&
  29.255 +             (zap_mmio_range(xc_handle, dom, vl3tab,
  29.256 +                             e820entry[i].addr, e820entry[i].size) == -1) )
  29.257 +            return -1;
  29.258      }
  29.259      munmap(vl3tab, PAGE_SIZE);
  29.260      return 0;
  29.261 @@ -265,18 +273,14 @@ static int zap_mmio_ranges(int xc_handle
  29.262  static int setup_guest(int xc_handle,
  29.263                         u32 dom, int memsize,
  29.264                         char *image, unsigned long image_size,
  29.265 -                       gzFile initrd_gfd, unsigned long initrd_len,
  29.266                         unsigned long nr_pages,
  29.267                         vcpu_guest_context_t *ctxt,
  29.268 -                       const char *cmdline,
  29.269                         unsigned long shared_info_frame,
  29.270                         unsigned int control_evtchn,
  29.271                         unsigned long flags,
  29.272                         unsigned int vcpus,
  29.273                         unsigned int store_evtchn,
  29.274 -                       unsigned long *store_mfn,
  29.275 -                       struct mem_map *mem_mapp
  29.276 -    )
  29.277 +                       unsigned long *store_mfn)
  29.278  {
  29.279      l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
  29.280      l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
  29.281 @@ -289,8 +293,8 @@ static int setup_guest(int xc_handle,
  29.282      unsigned long l1tab;
  29.283      unsigned long count, i;
  29.284      shared_info_t *shared_info;
  29.285 -    struct linux_boot_params * boot_paramsp;
  29.286 -    __u16 * boot_gdtp;
  29.287 +    void *e820_page;
  29.288 +    unsigned char e820_map_nr;
  29.289      xc_mmu_t *mmu = NULL;
  29.290      int rc;
  29.291  
  29.292 @@ -298,12 +302,6 @@ static int setup_guest(int xc_handle,
  29.293      unsigned long ppt_alloc;
  29.294  
  29.295      struct domain_setup_info dsi;
  29.296 -    unsigned long vinitrd_start;
  29.297 -    unsigned long vinitrd_end;
  29.298 -    unsigned long vboot_params_start;
  29.299 -    unsigned long vboot_params_end;
  29.300 -    unsigned long vboot_gdt_start;
  29.301 -    unsigned long vboot_gdt_end;
  29.302      unsigned long vpt_start;
  29.303      unsigned long vpt_end;
  29.304      unsigned long v_end;
  29.305 @@ -322,27 +320,8 @@ static int setup_guest(int xc_handle,
  29.306          goto error_out;
  29.307      }
  29.308  
  29.309 -    /*
  29.310 -     * Why do we need this? The number of page-table frames depends on the 
  29.311 -     * size of the bootstrap address space. But the size of the address space 
  29.312 -     * depends on the number of page-table frames (since each one is mapped 
  29.313 -     * read-only). We have a pair of simultaneous equations in two unknowns, 
  29.314 -     * which we solve by exhaustive search.
  29.315 -     */
  29.316 -    vboot_params_start = LINUX_BOOT_PARAMS_ADDR;
  29.317 -    vboot_params_end   = vboot_params_start + PAGE_SIZE;
  29.318 -    vboot_gdt_start    = vboot_params_end;
  29.319 -    vboot_gdt_end      = vboot_gdt_start + PAGE_SIZE;
  29.320 -
  29.321      /* memsize is in megabytes */
  29.322      v_end              = memsize << 20;
  29.323 -    /* leaving the top 4k untouched for IO requests page use */
  29.324 -    vinitrd_end        = v_end - PAGE_SIZE;
  29.325 -    vinitrd_start      = vinitrd_end - initrd_len;
  29.326 -    vinitrd_start      = vinitrd_start & (~(PAGE_SIZE - 1));
  29.327 -
  29.328 -    if(initrd_len == 0)
  29.329 -        vinitrd_start = vinitrd_end = 0;
  29.330  
  29.331  #ifdef __i386__
  29.332      nr_pt_pages = 1 + ((memsize + 3) >> 2);
  29.333 @@ -353,24 +332,17 @@ static int setup_guest(int xc_handle,
  29.334      vpt_end     = vpt_start + (nr_pt_pages * PAGE_SIZE);
  29.335  
  29.336      printf("VIRTUAL MEMORY ARRANGEMENT:\n"
  29.337 -           " Boot_params:   %08lx->%08lx\n"
  29.338 -           " boot_gdt:      %08lx->%08lx\n"
  29.339 -           " Loaded kernel: %08lx->%08lx\n"
  29.340 -           " Init. ramdisk: %08lx->%08lx\n"
  29.341 +           " Loaded VMX loader: %08lx->%08lx\n"
  29.342             " Page tables:   %08lx->%08lx\n"
  29.343             " TOTAL:         %08lx->%08lx\n",
  29.344 -           vboot_params_start, vboot_params_end,
  29.345 -           vboot_gdt_start, vboot_gdt_end,
  29.346 -           dsi.v_kernstart, dsi.v_kernend, 
  29.347 -           vinitrd_start, vinitrd_end,
  29.348 +           dsi.v_kernstart, dsi.v_kernend,
  29.349             vpt_start, vpt_end,
  29.350             dsi.v_start, v_end);
  29.351      printf(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
  29.352 -    printf(" INITRD LENGTH: %08lx\n", initrd_len);
  29.353  
  29.354      if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
  29.355      {
  29.356 -        printf("Initial guest OS requires too much space\n"
  29.357 +        ERROR("Initial guest OS requires too much space\n"
  29.358                 "(%luMB is greater than %luMB limit)\n",
  29.359                 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
  29.360          goto error_out;
  29.361 @@ -390,23 +362,6 @@ static int setup_guest(int xc_handle,
  29.362  
  29.363      loadelfimage(image, xc_handle, dom, page_array, &dsi);
  29.364  
  29.365 -    /* Load the initial ramdisk image. */
  29.366 -    if ( initrd_len != 0 )
  29.367 -    {
  29.368 -        for ( i = (vinitrd_start - dsi.v_start); 
  29.369 -              i < (vinitrd_end - dsi.v_start); i += PAGE_SIZE )
  29.370 -        {
  29.371 -            char page[PAGE_SIZE];
  29.372 -            if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
  29.373 -            {
  29.374 -                PERROR("Error reading initrd image, could not");
  29.375 -                goto error_out;
  29.376 -            }
  29.377 -            xc_copy_to_domain_page(xc_handle, dom,
  29.378 -                                   page_array[i>>PAGE_SHIFT], page);
  29.379 -        }
  29.380 -    }
  29.381 -
  29.382      if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL )
  29.383          goto error_out;
  29.384  
  29.385 @@ -428,15 +383,14 @@ static int setup_guest(int xc_handle,
  29.386      l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  29.387      ctxt->ctrlreg[3] = l2tab;
  29.388  
  29.389 -    /* Initialise the page tables. */
  29.390 -    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  29.391 -                                        PROT_READ|PROT_WRITE, 
  29.392 +    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  29.393 +                                        PROT_READ|PROT_WRITE,
  29.394                                          l2tab >> PAGE_SHIFT)) == NULL )
  29.395          goto error_out;
  29.396      memset(vl2tab, 0, PAGE_SIZE);
  29.397      vl2e = &vl2tab[l2_table_offset(dsi.v_start)];
  29.398      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  29.399 -    {    
  29.400 +    {
  29.401          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  29.402          {
  29.403              l1tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  29.404 @@ -460,23 +414,35 @@ static int setup_guest(int xc_handle,
  29.405      munmap(vl1tab, PAGE_SIZE);
  29.406      munmap(vl2tab, PAGE_SIZE);
  29.407  #else
  29.408 -    /* here l3tab means pdpt, only 4 entry is used */
  29.409      l3tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  29.410      ctxt->ctrlreg[3] = l3tab;
  29.411  
  29.412 -    /* Initialise the page tables. */
  29.413 -    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  29.414 -                                        PROT_READ|PROT_WRITE, 
  29.415 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  29.416 +                                        PROT_READ|PROT_WRITE,
  29.417                                          l3tab >> PAGE_SHIFT)) == NULL )
  29.418          goto error_out;
  29.419      memset(vl3tab, 0, PAGE_SIZE);
  29.420  
  29.421 +    /* Fill in every PDPT entry. */
  29.422 +    for ( i = 0; i < L3_PAGETABLE_ENTRIES_PAE; i++ )
  29.423 +    {
  29.424 +        l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  29.425 +        if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  29.426 +                                            PROT_READ|PROT_WRITE,
  29.427 +                                            l2tab >> PAGE_SHIFT)) == NULL )
  29.428 +            goto error_out;
  29.429 +        memset(vl2tab, 0, PAGE_SIZE);
  29.430 +        munmap(vl2tab, PAGE_SIZE);
  29.431 +        vl3tab[i] = l2tab | L3_PROT;
  29.432 +    }
  29.433 +
  29.434      vl3e = &vl3tab[l3_table_offset(dsi.v_start)];
  29.435  
  29.436      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  29.437      {
  29.438 -        if (!(count % (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
  29.439 -            l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  29.440 +        if (!(count & (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
  29.441 +            l2tab = vl3tab[count >> (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]
  29.442 +                & PAGE_MASK;
  29.443  
  29.444              if (vl2tab != NULL)
  29.445                  munmap(vl2tab, PAGE_SIZE);
  29.446 @@ -486,8 +452,6 @@ static int setup_guest(int xc_handle,
  29.447                                                  l2tab >> PAGE_SHIFT)) == NULL )
  29.448                  goto error_out;
  29.449  
  29.450 -            memset(vl2tab, 0, PAGE_SIZE);
  29.451 -            *vl3e++ = l2tab | L3_PROT;
  29.452              vl2e = &vl2tab[l2_table_offset(dsi.v_start + (count << PAGE_SHIFT))];
  29.453          }
  29.454          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  29.455 @@ -519,103 +483,31 @@ static int setup_guest(int xc_handle,
  29.456      for ( count = 0; count < nr_pages; count++ )
  29.457      {
  29.458          if ( xc_add_mmu_update(xc_handle, mmu,
  29.459 -                               (page_array[count] << PAGE_SHIFT) | 
  29.460 +                               (page_array[count] << PAGE_SHIFT) |
  29.461                                 MMU_MACHPHYS_UPDATE, count) )
  29.462              goto error_out;
  29.463      }
  29.464  
  29.465      set_nr_vcpus(xc_handle, dom, page_array, &dsi, vcpus);
  29.466  
  29.467 -    if ((boot_paramsp = xc_map_foreign_range(
  29.468 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  29.469 -        page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  29.470 -        goto error_out;
  29.471 -
  29.472 -    memset(boot_paramsp, 0, sizeof(*boot_paramsp));
  29.473 -
  29.474 -    strncpy((char *)boot_paramsp->cmd_line, cmdline, 0x800);
  29.475 -    boot_paramsp->cmd_line[0x800-1] = '\0';
  29.476 -    boot_paramsp->cmd_line_ptr = ((unsigned long) vboot_params_start) + offsetof(struct linux_boot_params, cmd_line);
  29.477 -
  29.478 -    boot_paramsp->setup_sects = 0;
  29.479 -    boot_paramsp->mount_root_rdonly = 1;
  29.480 -    boot_paramsp->swapdev = 0x0; 
  29.481 -    boot_paramsp->ramdisk_flags = 0x0; 
  29.482 -    boot_paramsp->root_dev = 0x0; /* We must tell kernel root dev by kernel command line. */
  29.483 -
  29.484 -    /* we don't have a ps/2 mouse now.
  29.485 -     * 0xAA means a aux mouse is there.
  29.486 -     * See detect_auxiliary_port() in pc_keyb.c.
  29.487 -     */
  29.488 -    boot_paramsp->aux_device_info = 0x0; 
  29.489 -
  29.490 -    boot_paramsp->header_magic[0] = 0x48; /* "H" */
  29.491 -    boot_paramsp->header_magic[1] = 0x64; /* "d" */
  29.492 -    boot_paramsp->header_magic[2] = 0x72; /* "r" */
  29.493 -    boot_paramsp->header_magic[3] = 0x53; /* "S" */
  29.494 -
  29.495 -    boot_paramsp->protocol_version = 0x0203; /* 2.03 */
  29.496 -    boot_paramsp->loader_type = 0x71; /* GRUB */
  29.497 -    boot_paramsp->loader_flags = 0x1; /* loaded high */
  29.498 -    boot_paramsp->code32_start = LINUX_KERNEL_ENTR_ADDR; /* 1MB */
  29.499 -    boot_paramsp->initrd_start = vinitrd_start;
  29.500 -    boot_paramsp->initrd_size = initrd_len;
  29.501 -
  29.502 -    i = ((memsize - 1) << 10) - 4;
  29.503 -    boot_paramsp->alt_mem_k = i; /* alt_mem_k */
  29.504 -    boot_paramsp->screen.overlap.ext_mem_k = i & 0xFFFF; /* ext_mem_k */
  29.505 +    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  29.506 +    shared_page_frame = (v_end - PAGE_SIZE) >> PAGE_SHIFT;
  29.507  
  29.508 -    /*
  29.509 -     * Stuff SCREAN_INFO
  29.510 -     */
  29.511 -    boot_paramsp->screen.info.orig_x = 0;
  29.512 -    boot_paramsp->screen.info.orig_y = 0;
  29.513 -    boot_paramsp->screen.info.orig_video_page = 8;
  29.514 -    boot_paramsp->screen.info.orig_video_mode = 3;
  29.515 -    boot_paramsp->screen.info.orig_video_cols = 80;
  29.516 -    boot_paramsp->screen.info.orig_video_ega_bx = 0;
  29.517 -    boot_paramsp->screen.info.orig_video_lines = 25;
  29.518 -    boot_paramsp->screen.info.orig_video_isVGA = 1;
  29.519 -    boot_paramsp->screen.info.orig_video_points = 0x0010;
  29.520 -
  29.521 -    /* seems we may NOT stuff boot_paramsp->apm_bios_info */
  29.522 -    /* seems we may NOT stuff boot_paramsp->drive_info */
  29.523 -    /* seems we may NOT stuff boot_paramsp->sys_desc_table */
  29.524 -    *((unsigned short *) &boot_paramsp->drive_info.dummy[0]) = 800;
  29.525 -    boot_paramsp->drive_info.dummy[2] = 4;
  29.526 -    boot_paramsp->drive_info.dummy[14] = 32;
  29.527 -
  29.528 -    /* memsize is in megabytes */
  29.529 -    /* If you need to create a special e820map, comment this line
  29.530 -       and use mem-map.sxp */
  29.531 -    build_e820map(mem_mapp, memsize << 20);
  29.532 -    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  29.533 +    if ((e820_page = xc_map_foreign_range(
  29.534 +        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  29.535 +        page_array[E820_MAP_PAGE >> PAGE_SHIFT])) == 0)
  29.536 +        goto error_out;
  29.537 +    memset(e820_page, 0, PAGE_SIZE);
  29.538 +    e820_map_nr = build_e820map(e820_page, v_end);
  29.539  #if defined (__i386__)
  29.540 -    if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1)
  29.541 +    if (zap_mmio_ranges(xc_handle, dom, l2tab, e820_map_nr,
  29.542 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  29.543  #else
  29.544 -        if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1)
  29.545 +    if (zap_mmio_ranges(xc_handle, dom, l3tab, e820_map_nr,
  29.546 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  29.547  #endif
  29.548 -            goto error_out;
  29.549 -    boot_paramsp->e820_map_nr = mem_mapp->nr_map;
  29.550 -    for (i=0; i<mem_mapp->nr_map; i++) {
  29.551 -        boot_paramsp->e820_map[i].addr = mem_mapp->map[i].addr; 
  29.552 -        boot_paramsp->e820_map[i].size = mem_mapp->map[i].size; 
  29.553 -        boot_paramsp->e820_map[i].type = mem_mapp->map[i].type; 
  29.554 -        if (mem_mapp->map[i].type == E820_SHARED)
  29.555 -            shared_page_frame = (mem_mapp->map[i].addr >> PAGE_SHIFT);
  29.556 -    }
  29.557 -    munmap(boot_paramsp, PAGE_SIZE); 
  29.558 -
  29.559 -    if ((boot_gdtp = xc_map_foreign_range(
  29.560 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  29.561 -        page_array[(vboot_gdt_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  29.562          goto error_out;
  29.563 -    memset(boot_gdtp, 0, PAGE_SIZE);
  29.564 -    boot_gdtp[12*4 + 0] = boot_gdtp[13*4 + 0] = 0xffff; /* limit */
  29.565 -    boot_gdtp[12*4 + 1] = boot_gdtp[13*4 + 1] = 0x0000; /* base */
  29.566 -    boot_gdtp[12*4 + 2] = 0x9a00; boot_gdtp[13*4 + 2] = 0x9200; /* perms */
  29.567 -    boot_gdtp[12*4 + 3] = boot_gdtp[13*4 + 3] = 0x00cf; /* granu + top of limit */
  29.568 -    munmap(boot_gdtp, PAGE_SIZE);
  29.569 +    munmap(e820_page, PAGE_SIZE);
  29.570  
  29.571      /* shared_info page starts its life empty. */
  29.572      if ((shared_info = xc_map_foreign_range(
  29.573 @@ -651,20 +543,21 @@ static int setup_guest(int xc_handle,
  29.574      /*
  29.575       * Initial register values:
  29.576       */
  29.577 -    ctxt->user_regs.ds = 0x68;
  29.578 -    ctxt->user_regs.es = 0x0;
  29.579 -    ctxt->user_regs.fs = 0x0;
  29.580 -    ctxt->user_regs.gs = 0x0;
  29.581 -    ctxt->user_regs.ss = 0x68;
  29.582 -    ctxt->user_regs.cs = 0x60;
  29.583 +    ctxt->user_regs.ds = 0;
  29.584 +    ctxt->user_regs.es = 0;
  29.585 +    ctxt->user_regs.fs = 0;
  29.586 +    ctxt->user_regs.gs = 0;
  29.587 +    ctxt->user_regs.ss = 0;
  29.588 +    ctxt->user_regs.cs = 0;
  29.589      ctxt->user_regs.eip = dsi.v_kernentry;
  29.590 -    ctxt->user_regs.edx = vboot_gdt_start;
  29.591 -    ctxt->user_regs.eax = 0x800;
  29.592 -    ctxt->user_regs.esp = vboot_gdt_end;
  29.593 +    ctxt->user_regs.edx = 0;
  29.594 +    ctxt->user_regs.eax = 0;
  29.595 +    ctxt->user_regs.esp = 0;
  29.596      ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */
  29.597 -    ctxt->user_regs.ecx = mem_mapp->nr_map;
  29.598 -    ctxt->user_regs.esi = vboot_params_start;
  29.599 -    ctxt->user_regs.edi = vboot_params_start + 0x2d0;
  29.600 +    ctxt->user_regs.ecx = 0;
  29.601 +    ctxt->user_regs.esi = 0;
  29.602 +    ctxt->user_regs.edi = 0;
  29.603 +    ctxt->user_regs.ebp = 0;
  29.604  
  29.605      ctxt->user_regs.eflags = 0;
  29.606  
  29.607 @@ -684,9 +577,9 @@ static int vmx_identify(void)
  29.608      int eax, ecx;
  29.609  
  29.610  #ifdef __i386__
  29.611 -    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx" 
  29.612 -                          : "=a" (eax), "=c" (ecx) 
  29.613 -                          : "0" (1) 
  29.614 +    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx"
  29.615 +                          : "=a" (eax), "=c" (ecx)
  29.616 +                          : "0" (1)
  29.617                            : "dx");
  29.618  #elif defined __x86_64__
  29.619      __asm__ __volatile__ ("pushq %%rbx; cpuid; popq %%rbx"
  29.620 @@ -705,9 +598,6 @@ int xc_vmx_build(int xc_handle,
  29.621                   u32 domid,
  29.622                   int memsize,
  29.623                   const char *image_name,
  29.624 -                 struct mem_map *mem_mapp,
  29.625 -                 const char *ramdisk_name,
  29.626 -                 const char *cmdline,
  29.627                   unsigned int control_evtchn,
  29.628                   unsigned long flags,
  29.629                   unsigned int vcpus,
  29.630 @@ -715,20 +605,18 @@ int xc_vmx_build(int xc_handle,
  29.631                   unsigned long *store_mfn)
  29.632  {
  29.633      dom0_op_t launch_op, op;
  29.634 -    int initrd_fd = -1;
  29.635 -    gzFile initrd_gfd = NULL;
  29.636      int rc, i;
  29.637      vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
  29.638      unsigned long nr_pages;
  29.639      char         *image = NULL;
  29.640 -    unsigned long image_size, initrd_size=0;
  29.641 +    unsigned long image_size;
  29.642  
  29.643      if ( vmx_identify() < 0 )
  29.644      {
  29.645          PERROR("CPU doesn't support VMX Extensions");
  29.646          goto error_out;
  29.647      }
  29.648 -    
  29.649 +
  29.650      if ( (nr_pages = xc_get_tot_pages(xc_handle, domid)) < 0 )
  29.651      {
  29.652          PERROR("Could not find total pages for domain");
  29.653 @@ -738,32 +626,15 @@ int xc_vmx_build(int xc_handle,
  29.654      if ( (image = xc_read_kernel_image(image_name, &image_size)) == NULL )
  29.655          goto error_out;
  29.656  
  29.657 -    if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) )
  29.658 +    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  29.659      {
  29.660 -        if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 )
  29.661 -        {
  29.662 -            PERROR("Could not open the initial ramdisk image");
  29.663 -            goto error_out;
  29.664 -        }
  29.665 -
  29.666 -        initrd_size = xc_get_filesz(initrd_fd);
  29.667 -
  29.668 -        if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL )
  29.669 -        {
  29.670 -            PERROR("Could not allocate decompression state for initrd");
  29.671 -            goto error_out;
  29.672 -        }
  29.673 -    }
  29.674 -
  29.675 -    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  29.676 -    {   
  29.677          PERROR("xc_vmx_build: ctxt mlock failed");
  29.678          return 1;
  29.679      }
  29.680  
  29.681      op.cmd = DOM0_GETDOMAININFO;
  29.682      op.u.getdomaininfo.domain = (domid_t)domid;
  29.683 -    if ( (xc_dom0_op(xc_handle, &op) < 0) || 
  29.684 +    if ( (xc_dom0_op(xc_handle, &op) < 0) ||
  29.685           ((u16)op.u.getdomaininfo.domain != domid) )
  29.686      {
  29.687          PERROR("Could not get info on domain");
  29.688 @@ -783,21 +654,14 @@ int xc_vmx_build(int xc_handle,
  29.689          goto error_out;
  29.690      }
  29.691  
  29.692 -    if ( setup_guest(xc_handle, domid, memsize, image, image_size, 
  29.693 -                     initrd_gfd, initrd_size, nr_pages, 
  29.694 -                     ctxt, cmdline,
  29.695 -                     op.u.getdomaininfo.shared_info_frame,
  29.696 -                     control_evtchn, flags, vcpus, store_evtchn, store_mfn,
  29.697 -                     mem_mapp) < 0 )
  29.698 +    if ( setup_guest(xc_handle, domid, memsize, image, image_size, nr_pages,
  29.699 +                     ctxt, op.u.getdomaininfo.shared_info_frame, control_evtchn,
  29.700 +                     flags, vcpus, store_evtchn, store_mfn) < 0)
  29.701      {
  29.702          ERROR("Error constructing guest OS");
  29.703          goto error_out;
  29.704      }
  29.705  
  29.706 -    if ( initrd_fd >= 0 )
  29.707 -        close(initrd_fd);
  29.708 -    if ( initrd_gfd )
  29.709 -        gzclose(initrd_gfd);
  29.710      free(image);
  29.711  
  29.712      ctxt->flags = VGCF_VMX_GUEST;
  29.713 @@ -813,15 +677,10 @@ int xc_vmx_build(int xc_handle,
  29.714  
  29.715      /* No LDT. */
  29.716      ctxt->ldt_ents = 0;
  29.717 -    
  29.718 +
  29.719      /* Use the default Xen-provided GDT. */
  29.720      ctxt->gdt_ents = 0;
  29.721  
  29.722 -    /* Ring 1 stack is the initial stack. */
  29.723 -/*
  29.724 -  ctxt->kernel_ss = FLAT_KERNEL_DS;
  29.725 -  ctxt->kernel_sp = vstartinfo_start;
  29.726 -*/
  29.727      /* No debugging. */
  29.728      memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg));
  29.729  
  29.730 @@ -845,14 +704,10 @@ int xc_vmx_build(int xc_handle,
  29.731  
  29.732      launch_op.cmd = DOM0_SETDOMAININFO;
  29.733      rc = xc_dom0_op(xc_handle, &launch_op);
  29.734 -    
  29.735 +
  29.736      return rc;
  29.737  
  29.738   error_out:
  29.739 -    if ( initrd_gfd != NULL )
  29.740 -        gzclose(initrd_gfd);
  29.741 -    else if ( initrd_fd >= 0 )
  29.742 -        close(initrd_fd);
  29.743      free(image);
  29.744  
  29.745      return -1;
  29.746 @@ -864,7 +719,7 @@ static inline int is_loadable_phdr(Elf32
  29.747              ((phdr->p_flags & (PF_W|PF_X)) != 0));
  29.748  }
  29.749  
  29.750 -static int parseelfimage(char *elfbase, 
  29.751 +static int parseelfimage(char *elfbase,
  29.752                           unsigned long elfsize,
  29.753                           struct domain_setup_info *dsi)
  29.754  {
  29.755 @@ -899,11 +754,11 @@ static int parseelfimage(char *elfbase,
  29.756          ERROR("ELF image has no section-header strings table (shstrtab).");
  29.757          return -EINVAL;
  29.758      }
  29.759 -    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff + 
  29.760 +    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff +
  29.761                            (ehdr->e_shstrndx*ehdr->e_shentsize));
  29.762      shstrtab = elfbase + shdr->sh_offset;
  29.763 -    
  29.764 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  29.765 +
  29.766 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  29.767      {
  29.768          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  29.769          if ( !is_loadable_phdr(phdr) )
  29.770 @@ -914,8 +769,8 @@ static int parseelfimage(char *elfbase,
  29.771              kernend = phdr->p_paddr + phdr->p_memsz;
  29.772      }
  29.773  
  29.774 -    if ( (kernstart > kernend) || 
  29.775 -         (ehdr->e_entry < kernstart) || 
  29.776 +    if ( (kernstart > kernend) ||
  29.777 +         (ehdr->e_entry < kernstart) ||
  29.778           (ehdr->e_entry > kernend) )
  29.779      {
  29.780          ERROR("Malformed ELF image.");
  29.781 @@ -924,9 +779,9 @@ static int parseelfimage(char *elfbase,
  29.782  
  29.783      dsi->v_start = 0x00000000;
  29.784  
  29.785 -    dsi->v_kernstart = kernstart - LINUX_PAGE_OFFSET;
  29.786 -    dsi->v_kernend   = kernend - LINUX_PAGE_OFFSET;
  29.787 -    dsi->v_kernentry = LINUX_KERNEL_ENTR_ADDR;
  29.788 +    dsi->v_kernstart = kernstart;
  29.789 +    dsi->v_kernend   = kernend;
  29.790 +    dsi->v_kernentry = VMX_LOADER_ENTR_ADDR;
  29.791  
  29.792      dsi->v_end       = dsi->v_kernend;
  29.793  
  29.794 @@ -945,18 +800,18 @@ loadelfimage(
  29.795      char         *va;
  29.796      unsigned long pa, done, chunksz;
  29.797  
  29.798 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  29.799 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  29.800      {
  29.801          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  29.802          if ( !is_loadable_phdr(phdr) )
  29.803              continue;
  29.804 -        
  29.805 +
  29.806          for ( done = 0; done < phdr->p_filesz; done += chunksz )
  29.807          {
  29.808 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  29.809 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  29.810              if ((va = xc_map_foreign_range(
  29.811                  xch, dom, PAGE_SIZE, PROT_WRITE,
  29.812 -                parray[pa>>PAGE_SHIFT])) == 0)
  29.813 +                parray[pa >> PAGE_SHIFT])) == 0)
  29.814                  return -1;
  29.815              chunksz = phdr->p_filesz - done;
  29.816              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
  29.817 @@ -968,10 +823,10 @@ loadelfimage(
  29.818  
  29.819          for ( ; done < phdr->p_memsz; done += chunksz )
  29.820          {
  29.821 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  29.822 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  29.823              if ((va = xc_map_foreign_range(
  29.824                  xch, dom, PAGE_SIZE, PROT_WRITE,
  29.825 -                parray[pa>>PAGE_SHIFT])) == 0)
  29.826 +                parray[pa >> PAGE_SHIFT])) == 0)
  29.827                  return -1;
  29.828              chunksz = phdr->p_memsz - done;
  29.829              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
    30.1 --- a/tools/libxc/xenguest.h	Thu Sep 29 13:35:13 2005 -0600
    30.2 +++ b/tools/libxc/xenguest.h	Thu Sep 29 16:22:02 2005 -0600
    30.3 @@ -57,9 +57,6 @@ int xc_vmx_build(int xc_handle,
    30.4                   uint32_t domid,
    30.5                   int memsize,
    30.6                   const char *image_name,
    30.7 -                 struct mem_map *memmap,
    30.8 -                 const char *ramdisk_name,
    30.9 -                 const char *cmdline,
   30.10                   unsigned int control_evtchn,
   30.11                   unsigned long flags,
   30.12                   unsigned int vcpus,
    31.1 --- a/tools/libxc/xg_private.h	Thu Sep 29 13:35:13 2005 -0600
    31.2 +++ b/tools/libxc/xg_private.h	Thu Sep 29 16:22:02 2005 -0600
    31.3 @@ -28,25 +28,27 @@ unsigned long csum_page (void * page);
    31.4  #define _PAGE_PSE       0x080
    31.5  #define _PAGE_GLOBAL    0x100
    31.6  
    31.7 -#if defined(__i386__)
    31.8 -#define L1_PAGETABLE_SHIFT       12
    31.9 -#define L2_PAGETABLE_SHIFT       22
   31.10  #define L1_PAGETABLE_SHIFT_PAE   12
   31.11  #define L2_PAGETABLE_SHIFT_PAE   21
   31.12  #define L3_PAGETABLE_SHIFT_PAE   30
   31.13 +
   31.14 +#if defined(__i386__)
   31.15 +#define L1_PAGETABLE_SHIFT       12
   31.16 +#define L2_PAGETABLE_SHIFT       22
   31.17  #elif defined(__x86_64__)
   31.18 -#define L1_PAGETABLE_SHIFT      12
   31.19 -#define L2_PAGETABLE_SHIFT      21
   31.20 -#define L3_PAGETABLE_SHIFT      30
   31.21 -#define L4_PAGETABLE_SHIFT      39
   31.22 +#define L1_PAGETABLE_SHIFT       12
   31.23 +#define L2_PAGETABLE_SHIFT       21
   31.24 +#define L3_PAGETABLE_SHIFT       30
   31.25 +#define L4_PAGETABLE_SHIFT       39
   31.26  #endif
   31.27  
   31.28 -#if defined(__i386__) 
   31.29 -#define ENTRIES_PER_L1_PAGETABLE 1024
   31.30 -#define ENTRIES_PER_L2_PAGETABLE 1024
   31.31  #define L1_PAGETABLE_ENTRIES_PAE  512
   31.32  #define L2_PAGETABLE_ENTRIES_PAE  512
   31.33  #define L3_PAGETABLE_ENTRIES_PAE    4
   31.34 +
   31.35 +#if defined(__i386__) 
   31.36 +#define L1_PAGETABLE_ENTRIES   1024
   31.37 +#define L2_PAGETABLE_ENTRIES   1024
   31.38  #elif defined(__x86_64__)
   31.39  #define L1_PAGETABLE_ENTRIES    512
   31.40  #define L2_PAGETABLE_ENTRIES    512
   31.41 @@ -70,17 +72,18 @@ typedef unsigned long l3_pgentry_t;
   31.42  typedef unsigned long l4_pgentry_t;
   31.43  #endif
   31.44  
   31.45 -#if defined(__i386__)
   31.46 -#define l1_table_offset(_a) \
   31.47 -          (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   31.48 -#define l2_table_offset(_a) \
   31.49 -          ((_a) >> L2_PAGETABLE_SHIFT)
   31.50  #define l1_table_offset_pae(_a) \
   31.51    (((_a) >> L1_PAGETABLE_SHIFT_PAE) & (L1_PAGETABLE_ENTRIES_PAE - 1))
   31.52  #define l2_table_offset_pae(_a) \
   31.53    (((_a) >> L2_PAGETABLE_SHIFT_PAE) & (L2_PAGETABLE_ENTRIES_PAE - 1))
   31.54  #define l3_table_offset_pae(_a) \
   31.55  	(((_a) >> L3_PAGETABLE_SHIFT_PAE) & (L3_PAGETABLE_ENTRIES_PAE - 1))
   31.56 +
   31.57 +#if defined(__i386__)
   31.58 +#define l1_table_offset(_a) \
   31.59 +          (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
   31.60 +#define l2_table_offset(_a) \
   31.61 +          ((_a) >> L2_PAGETABLE_SHIFT)
   31.62  #elif defined(__x86_64__)
   31.63  #define l1_table_offset(_a) \
   31.64    (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
    32.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Thu Sep 29 13:35:13 2005 -0600
    32.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Thu Sep 29 16:22:02 2005 -0600
    32.3 @@ -17,7 +17,6 @@
    32.4  #include <arpa/inet.h>
    32.5  
    32.6  #include "xc_private.h"
    32.7 -#include "linux_boot_params.h"
    32.8  
    32.9  /* Needed for Python versions earlier than 2.3. */
   32.10  #ifndef PyMODINIT_FUNC
   32.11 @@ -310,80 +309,24 @@ static PyObject *pyxc_vmx_build(PyObject
   32.12      XcObject *xc = (XcObject *)self;
   32.13  
   32.14      u32   dom;
   32.15 -    char *image, *ramdisk = NULL, *cmdline = "";
   32.16 -    PyObject *memmap;
   32.17 +    char *image;
   32.18      int   control_evtchn, store_evtchn;
   32.19      int flags = 0, vcpus = 1;
   32.20 -    int numItems, i;
   32.21      int memsize;
   32.22 -    struct mem_map mem_map;
   32.23      unsigned long store_mfn = 0;
   32.24  
   32.25      static char *kwd_list[] = { "dom", "control_evtchn", "store_evtchn",
   32.26 -                                "memsize", "image", "memmap",
   32.27 -				"ramdisk", "cmdline", "flags",
   32.28 -				"vcpus", NULL };
   32.29 +                                "memsize", "image", "flags", "vcpus", NULL };
   32.30  
   32.31 -    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisO!|ssii", kwd_list, 
   32.32 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisii", kwd_list,
   32.33                                        &dom, &control_evtchn, &store_evtchn,
   32.34 -                                      &memsize,
   32.35 -                                      &image, &PyList_Type, &memmap,
   32.36 -				      &ramdisk, &cmdline, &flags, &vcpus) )
   32.37 +                                      &memsize, &image, &flags, &vcpus) )
   32.38          return NULL;
   32.39  
   32.40 -    memset(&mem_map, 0, sizeof(mem_map));
   32.41 -    /* Parse memmap */
   32.42 -
   32.43 -    /* get the number of lines passed to us */
   32.44 -    numItems = PyList_Size(memmap) - 1;	/* removing the line 
   32.45 -					   containing "memmap" */
   32.46 -    mem_map.nr_map = numItems;
   32.47 -   
   32.48 -    /* should raise an error here. */
   32.49 -    if (numItems < 0) return NULL; /* Not a list */
   32.50 -
   32.51 -    /* iterate over items of the list, grabbing ranges and parsing them */
   32.52 -    for (i = 1; i <= numItems; i++) {	// skip over "memmap"
   32.53 -	    PyObject *item, *f1, *f2, *f3, *f4;
   32.54 -	    int numFields;
   32.55 -	    unsigned long lf1, lf2, lf3, lf4;
   32.56 -	    char *sf1, *sf2;
   32.57 -	    
   32.58 -	    /* grab the string object from the next element of the list */
   32.59 -	    item = PyList_GetItem(memmap, i); /* Can't fail */
   32.60 -
   32.61 -	    /* get the number of lines passed to us */
   32.62 -	    numFields = PyList_Size(item);
   32.63 -
   32.64 -	    if (numFields != 4)
   32.65 -		    return NULL;
   32.66 +    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, control_evtchn,
   32.67 +                      flags, vcpus, store_evtchn, &store_mfn) != 0 )
   32.68 +        return PyErr_SetFromErrno(xc_error);
   32.69  
   32.70 -	    f1 = PyList_GetItem(item, 0);
   32.71 -	    f2 = PyList_GetItem(item, 1);
   32.72 -	    f3 = PyList_GetItem(item, 2);
   32.73 -	    f4 = PyList_GetItem(item, 3);
   32.74 -
   32.75 -	    /* Convert objects to strings/longs */
   32.76 -	    sf1 = PyString_AsString(f1);
   32.77 -	    sf2 = PyString_AsString(f2);
   32.78 -	    lf3 = PyLong_AsLong(f3);
   32.79 -	    lf4 = PyLong_AsLong(f4);
   32.80 -	    if ( sscanf(sf1, "%lx", &lf1) != 1 )
   32.81 -                return NULL;
   32.82 -	    if ( sscanf(sf2, "%lx", &lf2) != 1 )
   32.83 -                return NULL;
   32.84 -
   32.85 -            mem_map.map[i-1].addr = lf1;
   32.86 -            mem_map.map[i-1].size = lf2 - lf1;
   32.87 -            mem_map.map[i-1].type = lf3;
   32.88 -            mem_map.map[i-1].caching_attr = lf4;
   32.89 -    }
   32.90 -
   32.91 -    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, &mem_map,
   32.92 -                        ramdisk, cmdline, control_evtchn, flags,
   32.93 -                        vcpus, store_evtchn, &store_mfn) != 0 )
   32.94 -        return PyErr_SetFromErrno(xc_error);
   32.95 -    
   32.96      return Py_BuildValue("{s:i}", "store_mfn", store_mfn);
   32.97  }
   32.98  
    33.1 --- a/tools/python/xen/lowlevel/xs/xs.c	Thu Sep 29 13:35:13 2005 -0600
    33.2 +++ b/tools/python/xen/lowlevel/xs/xs.c	Thu Sep 29 16:22:02 2005 -0600
    33.3 @@ -582,9 +582,8 @@ static PyObject *xspy_unwatch(PyObject *
    33.4  }
    33.5  
    33.6  #define xspy_transaction_start_doc "\n"				\
    33.7 -	"Start a transaction on a path.\n"			\
    33.8 +	"Start a transaction.\n"				\
    33.9  	"Only one transaction can be active at a time.\n"	\
   33.10 -	" path [string]: xenstore path.\n"			\
   33.11  	"\n"							\
   33.12  	"Returns None on success.\n"				\
   33.13  	"Raises RuntimeError on error.\n"			\
   33.14 @@ -593,8 +592,8 @@ static PyObject *xspy_unwatch(PyObject *
   33.15  static PyObject *xspy_transaction_start(PyObject *self, PyObject *args,
   33.16                                          PyObject *kwds)
   33.17  {
   33.18 -    static char *kwd_spec[] = { "path", NULL };
   33.19 -    static char *arg_spec = "s|";
   33.20 +    static char *kwd_spec[] = { NULL };
   33.21 +    static char *arg_spec = "";
   33.22      char *path = NULL;
   33.23  
   33.24      struct xs_handle *xh = xshandle(self);
   33.25 @@ -606,7 +605,7 @@ static PyObject *xspy_transaction_start(
   33.26      if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, &path))
   33.27          goto exit;
   33.28      Py_BEGIN_ALLOW_THREADS
   33.29 -    xsval = xs_transaction_start(xh, path);
   33.30 +    xsval = xs_transaction_start(xh);
   33.31      Py_END_ALLOW_THREADS
   33.32      if (!xsval) {
   33.33          PyErr_SetFromErrno(PyExc_RuntimeError);
   33.34 @@ -623,7 +622,7 @@ static PyObject *xspy_transaction_start(
   33.35  	"Attempts to commit the transaction unless abort is true.\n"	\
   33.36  	" abort [int]: abort flag (default 0).\n"			\
   33.37  	"\n"								\
   33.38 -	"Returns None on success.\n"					\
   33.39 +	"Returns True on success, False if you need to try again.\n"	\
   33.40  	"Raises RuntimeError on error.\n"				\
   33.41  	"\n"
   33.42  
   33.43 @@ -646,11 +645,16 @@ static PyObject *xspy_transaction_end(Py
   33.44      xsval = xs_transaction_end(xh, abort);
   33.45      Py_END_ALLOW_THREADS
   33.46      if (!xsval) {
   33.47 +	if (errno == EAGAIN) {
   33.48 +	    Py_INCREF(Py_False);
   33.49 +	    val = Py_False;
   33.50 +	    goto exit;
   33.51 +	}
   33.52          PyErr_SetFromErrno(PyExc_RuntimeError);
   33.53          goto exit;
   33.54      }
   33.55 -    Py_INCREF(Py_None);
   33.56 -    val = Py_None;
   33.57 +    Py_INCREF(Py_True);
   33.58 +    val = Py_True;
   33.59   exit:
   33.60      return val;
   33.61  }
    34.1 --- a/tools/python/xen/util/memmap.py	Thu Sep 29 13:35:13 2005 -0600
    34.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.3 @@ -1,41 +0,0 @@
    34.4 -mem_caching_attr = {
    34.5 -    'UC' : 0,
    34.6 -    'WC' : 1,
    34.7 -    'WT' : 4,
    34.8 -    'WP' : 5,
    34.9 -    'WB' : 6,
   34.10 -    };
   34.11 -
   34.12 -e820_mem_type = {
   34.13 -    'AddressRangeMemory'    : 1,
   34.14 -    'AddressRangeReserved'  : 2,
   34.15 -    'AddressRangeACPI'      : 3,
   34.16 -    'AddressRangeNVS'       : 4,
   34.17 -    'AddressRangeIO'        : 16,
   34.18 -    'AddressRangeShared'    : 17,
   34.19 -};
   34.20 -
   34.21 -MT_COL = 2
   34.22 -MA_COL = 3
   34.23 -
   34.24 -def strmap(row):
   34.25 -   if (type(row) != type([])):
   34.26 -       return row
   34.27 -   row[MT_COL] = e820_mem_type[row[MT_COL]]
   34.28 -   row[MA_COL] = mem_caching_attr[row[MA_COL]]
   34.29 -   return row
   34.30 -
   34.31 -def memmap_parse(memmap):
   34.32 -    return map(strmap, memmap)
   34.33 -
   34.34 -if __name__ == '__main__':
   34.35 -   memmap = [ 'memmap',
   34.36 -              [ '1', '2', 'AddressRangeMemory', 'UC'],
   34.37 -              [ '1', '2', 'AddressRangeReserved', 'UC'],
   34.38 -              [ '1', '2', 'AddressRangeACPI', 'WB'],
   34.39 -              [ '1', '2', 'AddressRangeNVS', 'WB'],
   34.40 -              [ '1', '2', 'AddressRangeIO', 'WB'],
   34.41 -              [ '1', '2', 'AddressRangeShared', 'WB']]
   34.42 -   print memmap_parse(memmap);
   34.43 -
   34.44 -
    35.1 --- a/tools/python/xen/util/tempfile.py	Thu Sep 29 13:35:13 2005 -0600
    35.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.3 @@ -1,451 +0,0 @@
    35.4 -"""Temporary files.
    35.5 -
    35.6 -This module provides generic, low- and high-level interfaces for
    35.7 -creating temporary files and directories.  The interfaces listed
    35.8 -as "safe" just below can be used without fear of race conditions.
    35.9 -Those listed as "unsafe" cannot, and are provided for backward
   35.10 -compatibility only.
   35.11 -
   35.12 -This module also provides some data items to the user:
   35.13 -
   35.14 -  TMP_MAX  - maximum number of names that will be tried before
   35.15 -             giving up.
   35.16 -  template - the default prefix for all temporary names.
   35.17 -             You may change this to control the default prefix.
   35.18 -  tempdir  - If this is set to a string before the first use of
   35.19 -             any routine from this module, it will be considered as
   35.20 -             another candidate location to store temporary files.
   35.21 -"""
   35.22 -
   35.23 -__all__ = [
   35.24 -    "NamedTemporaryFile", "TemporaryFile", # high level safe interfaces
   35.25 -    "mkstemp", "mkdtemp",                  # low level safe interfaces
   35.26 -    "mktemp",                              # deprecated unsafe interface
   35.27 -    "TMP_MAX", "gettempprefix",            # constants
   35.28 -    "tempdir", "gettempdir"
   35.29 -   ]
   35.30 -
   35.31 -
   35.32 -# Imports.
   35.33 -
   35.34 -import os as _os
   35.35 -import errno as _errno
   35.36 -from random import Random as _Random
   35.37 -
   35.38 -if _os.name == 'mac':
   35.39 -    import Carbon.Folder as _Folder
   35.40 -    import Carbon.Folders as _Folders
   35.41 -
   35.42 -try:
   35.43 -    import fcntl as _fcntl
   35.44 -    # If PYTHONCASEOK is set on Windows, stinking FCNTL.py gets
   35.45 -    # imported, and we don't get an ImportError then.  Provoke
   35.46 -    # an AttributeError instead in that case.
   35.47 -    _fcntl.fcntl
   35.48 -except (ImportError, AttributeError):
   35.49 -    def _set_cloexec(fd):
   35.50 -        pass
   35.51 -else:
   35.52 -    def _set_cloexec(fd):
   35.53 -        flags = _fcntl.fcntl(fd, _fcntl.F_GETFD, 0)
   35.54 -        if flags >= 0:
   35.55 -            # flags read successfully, modify
   35.56 -            flags |= _fcntl.FD_CLOEXEC
   35.57 -            _fcntl.fcntl(fd, _fcntl.F_SETFD, flags)
   35.58 -
   35.59 -
   35.60 -try:
   35.61 -    import thread as _thread
   35.62 -except ImportError:
   35.63 -    import dummy_thread as _thread
   35.64 -_allocate_lock = _thread.allocate_lock
   35.65 -
   35.66 -_text_openflags = _os.O_RDWR | _os.O_CREAT | _os.O_EXCL
   35.67 -if hasattr(_os, 'O_NOINHERIT'):
   35.68 -    _text_openflags |= _os.O_NOINHERIT
   35.69 -if hasattr(_os, 'O_NOFOLLOW'):
   35.70 -    _text_openflags |= _os.O_NOFOLLOW
   35.71 -
   35.72 -_bin_openflags = _text_openflags
   35.73 -if hasattr(_os, 'O_BINARY'):
   35.74 -    _bin_openflags |= _os.O_BINARY
   35.75 -
   35.76 -if hasattr(_os, 'TMP_MAX'):
   35.77 -    TMP_MAX = _os.TMP_MAX
   35.78 -else:
   35.79 -    TMP_MAX = 10000
   35.80 -
   35.81 -template = "tmp"
   35.82 -
   35.83 -tempdir = None
   35.84 -
   35.85 -# Internal routines.
   35.86 -
   35.87 -_once_lock = _allocate_lock()
   35.88 -
   35.89 -class _RandomNameSequence:
   35.90 -    """An instance of _RandomNameSequence generates an endless
   35.91 -    sequence of unpredictable strings which can safely be incorporated
   35.92 -    into file names.  Each string is six characters long.  Multiple
   35.93 -    threads can safely use the same instance at the same time.
   35.94 -
   35.95 -    _RandomNameSequence is an iterator."""
   35.96 -
   35.97 -    characters = ("abcdefghijklmnopqrstuvwxyz" +
   35.98 -                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
   35.99 -                  "0123456789-_")
  35.100 -
  35.101 -    def __init__(self):
  35.102 -        self.mutex = _allocate_lock()
  35.103 -        self.rng = _Random()
  35.104 -        self.normcase = _os.path.normcase
  35.105 -
  35.106 -    def __iter__(self):
  35.107 -        return self
  35.108 -
  35.109 -    def next(self):
  35.110 -        m = self.mutex
  35.111 -        c = self.characters
  35.112 -        choose = self.rng.choice
  35.113 -
  35.114 -        m.acquire()
  35.115 -        try:
  35.116 -            letters = [choose(c) for dummy in "123456"]
  35.117 -        finally:
  35.118 -            m.release()
  35.119 -
  35.120 -        return self.normcase(''.join(letters))
  35.121 -
  35.122 -def _candidate_tempdir_list():
  35.123 -    """Generate a list of candidate temporary directories which
  35.124 -    _get_default_tempdir will try."""
  35.125 -
  35.126 -    dirlist = []
  35.127 -
  35.128 -    # First, try the environment.
  35.129 -    for envname in 'TMPDIR', 'TEMP', 'TMP':
  35.130 -        dirname = _os.getenv(envname)
  35.131 -        if dirname: dirlist.append(dirname)
  35.132 -
  35.133 -    # Failing that, try OS-specific locations.
  35.134 -    if _os.name == 'mac':
  35.135 -        try:
  35.136 -            fsr = _Folder.FSFindFolder(_Folders.kOnSystemDisk,
  35.137 -                                              _Folders.kTemporaryFolderType, 1)
  35.138 -            dirname = fsr.as_pathname()
  35.139 -            dirlist.append(dirname)
  35.140 -        except _Folder.error:
  35.141 -            pass
  35.142 -    elif _os.name == 'riscos':
  35.143 -        dirname = _os.getenv('Wimp$ScrapDir')
  35.144 -        if dirname: dirlist.append(dirname)
  35.145 -    elif _os.name == 'nt':
  35.146 -        dirlist.extend([ r'c:\temp', r'c:\tmp', r'\temp', r'\tmp' ])
  35.147 -    else:
  35.148 -        dirlist.extend([ '/tmp', '/var/tmp', '/usr/tmp' ])
  35.149 -
  35.150 -    # As a last resort, the current directory.
  35.151 -    try:
  35.152 -        dirlist.append(_os.getcwd())
  35.153 -    except (AttributeError, _os.error):
  35.154 -        dirlist.append(_os.curdir)
  35.155 -
  35.156 -    return dirlist
  35.157 -
  35.158 -def _get_default_tempdir():
  35.159 -    """Calculate the default directory to use for temporary files.
  35.160 -    This routine should be called exactly once.
  35.161 -
  35.162 -    We determine whether or not a candidate temp dir is usable by
  35.163 -    trying to create and write to a file in that directory.  If this
  35.164 -    is successful, the test file is deleted.  To prevent denial of
  35.165 -    service, the name of the test file must be randomized."""
  35.166 -
  35.167 -    namer = _RandomNameSequence()
  35.168 -    dirlist = _candidate_tempdir_list()
  35.169 -    flags = _text_openflags
  35.170 -
  35.171 -    for dir in dirlist:
  35.172 -        if dir != _os.curdir:
  35.173 -            dir = _os.path.normcase(_os.path.abspath(dir))
  35.174 -        # Try only a few names per directory.
  35.175 -        for seq in xrange(100):
  35.176 -            name = namer.next()
  35.177 -            filename = _os.path.join(dir, name)
  35.178 -            try:
  35.179 -                fd = _os.open(filename, flags, 0600)
  35.180 -                fp = _os.fdopen(fd, 'w')
  35.181 -                fp.write('blat')
  35.182 -                fp.close()
  35.183 -                _os.unlink(filename)
  35.184 -                del fp, fd
  35.185 -                return dir
  35.186 -            except (OSError, IOError), e:
  35.187 -                if e[0] != _errno.EEXIST:
  35.188 -                    break # no point trying more names in this directory
  35.189 -                pass
  35.190 -    raise IOError, (_errno.ENOENT,
  35.191 -                    ("No usable temporary directory found in %s" % dirlist))
  35.192 -
  35.193 -_name_sequence = None
  35.194 -
  35.195 -def _get_candidate_names():
  35.196 -    """Common setup sequence for all user-callable interfaces."""
  35.197 -
  35.198 -    global _name_sequence
  35.199 -    if _name_sequence is None:
  35.200 -        _once_lock.acquire()
  35.201 -        try:
  35.202 -            if _name_sequence is None:
  35.203 -                _name_sequence = _RandomNameSequence()
  35.204 -        finally:
  35.205 -            _once_lock.release()
  35.206 -    return _name_sequence
  35.207 -
  35.208 -
  35.209 -def _mkstemp_inner(dir, pre, suf, flags):
  35.210 -    """Code common to mkstemp, TemporaryFile, and NamedTemporaryFile."""
  35.211 -
  35.212 -    names = _get_candidate_names()
  35.213 -
  35.214 -    for seq in xrange(TMP_MAX):
  35.215 -        name = names.next()
  35.216 -        file = _os.path.join(dir, pre + name + suf)
  35.217 -        try:
  35.218 -            fd = _os.open(file, flags, 0600)
  35.219 -            _set_cloexec(fd)
  35.220 -            return (fd, file)
  35.221 -        except OSError, e:
  35.222 -            if e.errno == _errno.EEXIST:
  35.223 -                continue # try again
  35.224 -            raise
  35.225 -
  35.226 -    raise IOError, (_errno.EEXIST, "No usable temporary file name found")
  35.227 -
  35.228 -
  35.229 -# User visible interfaces.
  35.230 -
  35.231 -def gettempprefix():
  35.232 -    """Accessor for tempdir.template."""
  35.233 -    return template
  35.234 -
  35.235 -tempdir = None
  35.236 -
  35.237 -def gettempdir():
  35.238 -    """Accessor for tempdir.tempdir."""
  35.239 -    global tempdir
  35.240 -    if tempdir is None:
  35.241 -        _once_lock.acquire()
  35.242 -        try:
  35.243 -            if tempdir is None:
  35.244 -                tempdir = _get_default_tempdir()
  35.245 -        finally:
  35.246 -            _once_lock.release()
  35.247 -    return tempdir
  35.248 -
  35.249 -def mkstemp(suffix="", prefix=template, dir=None, text=False):
  35.250 -    """mkstemp([suffix, [prefix, [dir, [text]]]])
  35.251 -    User-callable function to create and return a unique temporary
  35.252 -    file.  The return value is a pair (fd, name) where fd is the
  35.253 -    file descriptor returned by os.open, and name is the filename.
  35.254 -
  35.255 -    If 'suffix' is specified, the file name will end with that suffix,
  35.256 -    otherwise there will be no suffix.
  35.257 -
  35.258 -    If 'prefix' is specified, the file name will begin with that prefix,
  35.259 -    otherwise a default prefix is used.
  35.260 -
  35.261 -    If 'dir' is specified, the file will be created in that directory,
  35.262 -    otherwise a default directory is used.
  35.263 -
  35.264 -    If 'text' is specified and true, the file is opened in text
  35.265 -    mode.  Else (the default) the file is opened in binary mode.  On
  35.266 -    some operating systems, this makes no difference.
  35.267 -
  35.268 -    The file is readable and writable only by the creating user ID.
  35.269 -    If the operating system uses permission bits to indicate whether a
  35.270 -    file is executable, the file is executable by no one. The file
  35.271 -    descriptor is not inherited by children of this process.
  35.272 -
  35.273 -    Caller is responsible for deleting the file when done with it.
  35.274 -    """
  35.275 -
  35.276 -    if dir is None:
  35.277 -        dir = gettempdir()
  35.278 -
  35.279 -    if text:
  35.280 -        flags = _text_openflags
  35.281 -    else:
  35.282 -        flags = _bin_openflags
  35.283 -
  35.284 -    return _mkstemp_inner(dir, prefix, suffix, flags)
  35.285 -
  35.286 -
  35.287 -def mkdtemp(suffix="", prefix=template, dir=None):
  35.288 -    """mkdtemp([suffix, [prefix, [dir]]])
  35.289 -    User-callable function to create and return a unique temporary
  35.290 -    directory.  The return value is the pathname of the directory.
  35.291 -
  35.292 -    Arguments are as for mkstemp, except that the 'text' argument is
  35.293 -    not accepted.
  35.294 -
  35.295 -    The directory is readable, writable, and searchable only by the
  35.296 -    creating user.
  35.297 -
  35.298 -    Caller is responsible for deleting the directory when done with it.
  35.299 -    """
  35.300 -
  35.301 -    if dir is None:
  35.302 -        dir = gettempdir()
  35.303 -
  35.304 -    names = _get_candidate_names()
  35.305 -
  35.306 -    for seq in xrange(TMP_MAX):
  35.307 -        name = names.next()
  35.308 -        file = _os.path.join(dir, prefix + name + suffix)
  35.309 -        try:
  35.310 -            _os.mkdir(file, 0700)
  35.311 -            return file
  35.312 -        except OSError, e:
  35.313 -            if e.errno == _errno.EEXIST:
  35.314 -                continue # try again
  35.315 -            raise
  35.316 -
  35.317 -    raise IOError, (_errno.EEXIST, "No usable temporary directory name found")
  35.318 -
  35.319 -def mktemp(suffix="", prefix=template, dir=None):
  35.320 -    """mktemp([suffix, [prefix, [dir]]])
  35.321 -    User-callable function to return a unique temporary file name.  The
  35.322 -    file is not created.
  35.323 -
  35.324 -    Arguments are as for mkstemp, except that the 'text' argument is
  35.325 -    not accepted.
  35.326 -
  35.327 -    This function is unsafe and should not be used.  The file name
  35.328 -    refers to a file that did not exist at some point, but by the time
  35.329 -    you get around to creating it, someone else may have beaten you to
  35.330 -    the punch.
  35.331 -    """
  35.332 -
  35.333 -##    from warnings import warn as _warn
  35.334 -##    _warn("mktemp is a potential security risk to your program",
  35.335 -##          RuntimeWarning, stacklevel=2)
  35.336 -
  35.337 -    if dir is None:
  35.338 -        dir = gettempdir()
  35.339 -
  35.340 -    names = _get_candidate_names()
  35.341 -    for seq in xrange(TMP_MAX):
  35.342 -        name = names.next()
  35.343 -        file = _os.path.join(dir, prefix + name + suffix)
  35.344 -        if not _os.path.exists(file):
  35.345 -            return file
  35.346 -
  35.347 -    raise IOError, (_errno.EEXIST, "No usable temporary filename found")
  35.348 -
  35.349 -class _TemporaryFileWrapper:
  35.350 -    """Temporary file wrapper
  35.351 -
  35.352 -    This class provides a wrapper around files opened for
  35.353 -    temporary use.  In particular, it seeks to automatically
  35.354 -    remove the file when it is no longer needed.
  35.355 -    """
  35.356 -
  35.357 -    def __init__(self, file, name):
  35.358 -        self.file = file
  35.359 -        self.name = name
  35.360 -        self.close_called = False
  35.361 -
  35.362 -    def __getattr__(self, name):
  35.363 -        file = self.__dict__['file']
  35.364 -        a = getattr(file, name)
  35.365 -        if type(a) != type(0):
  35.366 -            setattr(self, name, a)
  35.367 -        return a
  35.368 -
  35.369 -    # NT provides delete-on-close as a primitive, so we don't need
  35.370 -    # the wrapper to do anything special.  We still use it so that
  35.371 -    # file.name is useful (i.e. not "(fdopen)") with NamedTemporaryFile.
  35.372 -    if _os.name != 'nt':
  35.373 -
  35.374 -        # Cache the unlinker so we don't get spurious errors at
  35.375 -        # shutdown when the module-level "os" is None'd out.  Note
  35.376 -        # that this must be referenced as self.unlink, because the
  35.377 -        # name TemporaryFileWrapper may also get None'd out before
  35.378 -        # __del__ is called.
  35.379 -        unlink = _os.unlink
  35.380 -
  35.381 -        def close(self):
  35.382 -            if not self.close_called:
  35.383 -                self.close_called = True
  35.384 -                self.file.close()
  35.385 -                self.unlink(self.name)
  35.386 -
  35.387 -        def __del__(self):
  35.388 -            self.close()
  35.389 -
  35.390 -def NamedTemporaryFile(mode='w+b', bufsize=-1, suffix="",
  35.391 -                       prefix=template, dir=None):
  35.392 -    """Create and return a temporary file.
  35.393 -    Arguments:
  35.394 -    'prefix', 'suffix', 'dir' -- as for mkstemp.
  35.395 -    'mode' -- the mode argument to os.fdopen (default "w+b").
  35.396 -    'bufsize' -- the buffer size argument to os.fdopen (default -1).
  35.397 -    The file is created as mkstemp() would do it.
  35.398 -
  35.399 -    Returns a file object; the name of the file is accessible as
  35.400 -    file.name.  The file will be automatically deleted when it is
  35.401 -    closed.
  35.402 -    """
  35.403 -
  35.404 -    if dir is None:
  35.405 -        dir = gettempdir()
  35.406 -
  35.407 -    if 'b' in mode:
  35.408 -        flags = _bin_openflags
  35.409 -    else:
  35.410 -        flags = _text_openflags
  35.411 -
  35.412 -    # Setting O_TEMPORARY in the flags causes the OS to delete
  35.413 -    # the file when it is closed.  This is only supported by Windows.
  35.414 -    if _os.name == 'nt':
  35.415 -        flags |= _os.O_TEMPORARY
  35.416 -
  35.417 -    (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  35.418 -    file = _os.fdopen(fd, mode, bufsize)
  35.419 -    return _TemporaryFileWrapper(file, name)
  35.420 -
  35.421 -if _os.name != 'posix' or _os.sys.platform == 'cygwin':
  35.422 -    # On non-POSIX and Cygwin systems, assume that we cannot unlink a file
  35.423 -    # while it is open.
  35.424 -    TemporaryFile = NamedTemporaryFile
  35.425 -
  35.426 -else:
  35.427 -    def TemporaryFile(mode='w+b', bufsize=-1, suffix="",
  35.428 -                      prefix=template, dir=None):
  35.429 -        """Create and return a temporary file.
  35.430 -        Arguments:
  35.431 -        'prefix', 'suffix', 'directory' -- as for mkstemp.
  35.432 -        'mode' -- the mode argument to os.fdopen (default "w+b").
  35.433 -        'bufsize' -- the buffer size argument to os.fdopen (default -1).
  35.434 -        The file is created as mkstemp() would do it.
  35.435 -
  35.436 -        Returns a file object.  The file has no name, and will cease to
  35.437 -        exist when it is closed.
  35.438 -        """
  35.439 -
  35.440 -        if dir is None:
  35.441 -            dir = gettempdir()
  35.442 -
  35.443 -        if 'b' in mode:
  35.444 -            flags = _bin_openflags
  35.445 -        else:
  35.446 -            flags = _text_openflags
  35.447 -
  35.448 -        (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  35.449 -        try:
  35.450 -            _os.unlink(name)
  35.451 -            return _os.fdopen(fd, mode, bufsize)
  35.452 -        except:
  35.453 -            _os.close(fd)
  35.454 -            raise
    36.1 --- a/tools/python/xen/xend/Blkctl.py	Thu Sep 29 13:35:13 2005 -0600
    36.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    36.3 @@ -1,43 +0,0 @@
    36.4 -"""Xend interface to block control scripts.
    36.5 -"""
    36.6 -import os
    36.7 -import os.path
    36.8 -import sys
    36.9 -import string
   36.10 -import xen.util.process
   36.11 -
   36.12 -from xen.xend import XendRoot
   36.13 -
   36.14 -xroot = XendRoot.instance()
   36.15 -
   36.16 -"""Where network control scripts live."""
   36.17 -SCRIPT_DIR = xroot.block_script_dir
   36.18 -
   36.19 -def block(op, type, dets, script=None):
   36.20 -    """Call a block control script.
   36.21 -    Xend calls this with op 'bind' when it is about to export a block device
   36.22 -    (other than a raw partition).  The script is called with unbind when a
   36.23 -    device is no longer in use and should be removed.
   36.24 -
   36.25 -    @param op:        operation (start, stop, status)
   36.26 -    @param type:      type of block device (determines the script used)
   36.27 -    @param dets:      arguments to the control script
   36.28 -    @param script:    block script name
   36.29 -    """
   36.30 -    
   36.31 -    if op not in ['bind', 'unbind']:
   36.32 -        raise ValueError('Invalid operation:' + op)
   36.33 -
   36.34 -    # Special case phy devices - they don't require any (un)binding
   36.35 -    # Parallax also doesn't need script-based binding.
   36.36 -    if (type == 'phy') or (type == 'parallax'):
   36.37 -        return dets
   36.38 -    
   36.39 -    if script is None:
   36.40 -        script = xroot.get_block_script(type)
   36.41 -    script = os.path.join(SCRIPT_DIR, script)
   36.42 -    args = [op] + string.split(dets, ':')
   36.43 -    args = ' '.join(args)
   36.44 -    ret = xen.util.process.runscript(script + ' ' + args)
   36.45 -    if len(ret):
   36.46 -        return ret.splitlines()[0]
    37.1 --- a/tools/python/xen/xend/PrettyPrint.py	Thu Sep 29 13:35:13 2005 -0600
    37.2 +++ b/tools/python/xen/xend/PrettyPrint.py	Thu Sep 29 16:22:02 2005 -0600
    37.3 @@ -252,7 +252,7 @@ class PrettyPrinter:
    37.4          self.block = self.block.parent
    37.5  
    37.6      def prettyprint(self, out=sys.stdout):
    37.7 -        self.top.prettyprint(Line(out, self.width))
    37.8 +        self.top.prettyprint(Line(out, self.width), self.width)
    37.9  
   37.10  class SXPPrettyPrinter(PrettyPrinter):
   37.11      """An SXP prettyprinter.
    38.1 --- a/tools/python/xen/xend/XendDB.py	Thu Sep 29 13:35:13 2005 -0600
    38.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    38.3 @@ -1,127 +0,0 @@
    38.4 -#============================================================================
    38.5 -# This library is free software; you can redistribute it and/or
    38.6 -# modify it under the terms of version 2.1 of the GNU Lesser General Public
    38.7 -# License as published by the Free Software Foundation.
    38.8 -#
    38.9 -# This library is distributed in the hope that it will be useful,
   38.10 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
   38.11 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   38.12 -# Lesser General Public License for more details.
   38.13 -#
   38.14 -# You should have received a copy of the GNU Lesser General Public
   38.15 -# License along with this library; if not, write to the Free Software
   38.16 -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   38.17 -#============================================================================
   38.18 -# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
   38.19 -#============================================================================
   38.20 -
   38.21 -import os
   38.22 -import os.path
   38.23 -import errno
   38.24 -import dircache
   38.25 -import time
   38.26 -
   38.27 -import sxp
   38.28 -import XendRoot
   38.29 -xroot = XendRoot.instance()
   38.30 -
   38.31 -class XendDB:
   38.32 -    """Persistence for Xend. Stores data in files and directories.
   38.33 -    """
   38.34 -
   38.35 -    def __init__(self, path=None):
   38.36 -        self.dbpath = xroot.get_dbroot()
   38.37 -        if path:
   38.38 -            self.dbpath = os.path.join(self.dbpath, path)
   38.39 -        pass
   38.40 -
   38.41 -    def listdir(self, dpath):
   38.42 -        try:
   38.43 -            return dircache.listdir(dpath)
   38.44 -        except:
   38.45 -            return []
   38.46 -
   38.47 -    def filepath(self, path):
   38.48 -        return os.path.join(self.dbpath, path)
   38.49 -        
   38.50 -    def fetch(self, path):
   38.51 -        fpath = self.filepath(path)
   38.52 -        return self.fetchfile(fpath)
   38.53 -
   38.54 -    def fetchfile(self, fpath):
   38.55 -        pin = sxp.Parser()
   38.56 -        fin = file(fpath, "rb")
   38.57 -        try:
   38.58 -            while 1:
   38.59 -                try:
   38.60 -                    buf = fin.read(1024)
   38.61 -                except IOError, ex:
   38.62 -                    if ex.errno == errno.EINTR:
   38.63 -                        continue
   38.64 -                    else:
   38.65 -                        raise
   38.66 -                pin.input(buf)
   38.67 -                if buf == '':
   38.68 -                    pin.input_eof()
   38.69 -                    break
   38.70 -        finally:
   38.71 -            fin.close()
   38.72 -        return pin.get_val()
   38.73 -
   38.74 -    def save(self, path, sxpr):
   38.75 -        fpath = self.filepath(path)
   38.76 -        return self.savefile(fpath, sxpr)
   38.77 -    
   38.78 -    def savefile(self, fpath, sxpr):
   38.79 -        backup = False
   38.80 -        fdir = os.path.dirname(fpath)
   38.81 -        if not os.path.isdir(fdir):
   38.82 -            os.makedirs(fdir)
   38.83 -        if os.path.exists(fpath):
   38.84 -            backup = True
   38.85 -            real_fpath = fpath
   38.86 -            fpath += ".new."
   38.87 -            
   38.88 -        fout = file(fpath, "wb+")
   38.89 -        try:
   38.90 -            try:
   38.91 -                t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
   38.92 -                fout.write("# %s %s\n" % (fpath, t))
   38.93 -                sxp.show(sxpr, out=fout)
   38.94 -            finally:
   38.95 -                fout.close()
   38.96 -        except:
   38.97 -            if backup:
   38.98 -                try:
   38.99 -                    os.unlink(fpath)
  38.100 -                except:
  38.101 -                    pass
  38.102 -                raise
  38.103 -        if backup:
  38.104 -            os.rename(fpath, real_fpath)
  38.105 -
  38.106 -    def fetchall(self, path):
  38.107 -        dpath = self.filepath(path)
  38.108 -        d = {}
  38.109 -        for k in self.listdir(dpath):
  38.110 -            try:
  38.111 -                v = self.fetchfile(os.path.join(dpath, k))
  38.112 -                d[k] = v
  38.113 -            except:
  38.114 -                pass
  38.115 -        return d
  38.116 -
  38.117 -    def saveall(self, path, d):
  38.118 -        for (k, v) in d.items():
  38.119 -            self.save(os.path.join(path, k), v)
  38.120 -
  38.121 -    def delete(self, path):
  38.122 -        dpath = self.filepath(path)
  38.123 -        os.unlink(dpath)
  38.124 -
  38.125 -    def ls(self, path):
  38.126 -        dpath = self.filepath(path)
  38.127 -        return self.listdir(dpath)
  38.128 -        
  38.129 -
  38.130 -        
    39.1 --- a/tools/python/xen/xend/XendDomain.py	Thu Sep 29 13:35:13 2005 -0600
    39.2 +++ b/tools/python/xen/xend/XendDomain.py	Thu Sep 29 16:22:02 2005 -0600
    39.3 @@ -433,12 +433,11 @@ class XendDomain:
    39.4              self.domain_shutdowns()
    39.5          return val
    39.6  
    39.7 +
    39.8      def domain_sysrq(self, id, key):
    39.9 -        """Send a SysRq to a domain
   39.10 -        """
   39.11 -        dominfo = self.domain_lookup(id)
   39.12 -        val = dominfo.send_sysrq(key)
   39.13 -        return val
   39.14 +        """Send a SysRq to the specified domain."""
   39.15 +        return self.callInfo(id, XendDomainInfo.send_sysrq, key)
   39.16 +
   39.17  
   39.18      def domain_shutdowns(self):
   39.19          """Process pending domain shutdowns.
   39.20 @@ -630,73 +629,45 @@ class XendDomain:
   39.21          except Exception, ex:
   39.22              raise XendError(str(ex))
   39.23  
   39.24 -    def domain_device_create(self, id, devconfig):
   39.25 -        """Create a new device for a domain.
   39.26  
   39.27 -        @param id:       domain id
   39.28 -        @param devconfig: device configuration
   39.29 +    def domain_device_create(self, domid, devconfig):
   39.30 +        """Create a new device for the specified domain.
   39.31          """
   39.32 -        dominfo = self.domain_lookup(id)
   39.33 -        val = dominfo.device_create(devconfig)
   39.34 -        dominfo.exportToDB()
   39.35 -        return val
   39.36 +        return self.callInfo(domid, XendDomainInfo.device_create, devconfig)
   39.37 +
   39.38  
   39.39 -    def domain_device_configure(self, id, devconfig, devid):
   39.40 -        """Configure an existing device for a domain.
   39.41 -
   39.42 -        @param id:   domain id
   39.43 -        @param devconfig: device configuration
   39.44 -        @param devid:  device id
   39.45 +    def domain_device_configure(self, domid, devconfig, devid):
   39.46 +        """Configure an existing device in the specified domain.
   39.47          @return: updated device configuration
   39.48          """
   39.49 -        dominfo = self.domain_lookup(id)
   39.50 -        val = dominfo.device_configure(devconfig, devid)
   39.51 -        dominfo.exportToDB()
   39.52 -        return val
   39.53 -    
   39.54 -    def domain_device_refresh(self, id, type, devid):
   39.55 -        """Refresh a device.
   39.56 +        return self.callInfo(domid, XendDomainInfo.device_configure,
   39.57 +                             devconfig, devid)
   39.58  
   39.59 -        @param id:  domain id
   39.60 -        @param devid:  device id
   39.61 -        @param type: device type
   39.62 -        """
   39.63 -        dominfo = self.domain_lookup(id)
   39.64 -        val = dominfo.device_refresh(type, devid)
   39.65 -        dominfo.exportToDB()
   39.66 -        return val
   39.67 -
   39.68 -    def domain_device_destroy(self, id, type, devid):
   39.69 -        """Destroy a device.
   39.70 -
   39.71 -        @param id:  domain id
   39.72 -        @param devid:  device id
   39.73 -        @param type: device type
   39.74 -        """
   39.75 -        dominfo = self.domain_lookup(id)
   39.76 -        return dominfo.destroyDevice(type, devid)
   39.77 +    
   39.78 +    def domain_device_refresh(self, domid, devtype, devid):
   39.79 +        """Refresh a device."""
   39.80 +        return self.callInfo(domid, XendDomainInfo.device_refresh, devtype,
   39.81 +                             devid)
   39.82  
   39.83  
   39.84 -    def domain_devtype_ls(self, id, type):
   39.85 -        """Get list of device sxprs for a domain.
   39.86 +    def domain_device_destroy(self, domid, devtype, devid):
   39.87 +        """Destroy a device."""
   39.88 +        return self.callInfo(domid, XendDomainInfo.destroyDevice, devtype,
   39.89 +                             devid)
   39.90 +
   39.91  
   39.92 -        @param id:  domain
   39.93 -        @param type: device type
   39.94 -        @return: device sxprs
   39.95 -        """
   39.96 -        dominfo = self.domain_lookup(id)
   39.97 -        return dominfo.getDeviceSxprs(type)
   39.98 +    def domain_devtype_ls(self, domid, devtype):
   39.99 +        """Get list of device sxprs for the specified domain."""
  39.100 +        return self.callInfo(domid, XendDomainInfo.getDeviceSxprs, devtype)
  39.101  
  39.102 -    def domain_devtype_get(self, id, type, devid):
  39.103 +
  39.104 +    def domain_devtype_get(self, domid, devtype, devid):
  39.105          """Get a device from a domain.
  39.106          
  39.107 -        @param id:  domain
  39.108 -        @param type: device type
  39.109 -        @param devid:  device id
  39.110          @return: device object (or None)
  39.111          """
  39.112 -        dominfo = self.domain_lookup(id)
  39.113 -        return dominfo.getDevice(type, devid)
  39.114 +        return self.callInfo(domid, XendDomainInfo.getDevice, devtype, devid)
  39.115 +
  39.116  
  39.117      def domain_vif_limit_set(self, id, vif, credit, period):
  39.118          """Limit the vif's transmission rate
  39.119 @@ -723,7 +694,7 @@ class XendDomain:
  39.120          """Set the memory limit for a domain.
  39.121  
  39.122          @param id: domain
  39.123 -        @param mem: memory limit (in MB)
  39.124 +        @param mem: memory limit (in MiB)
  39.125          @return: 0 on success, -1 on error
  39.126          """
  39.127          dominfo = self.domain_lookup(id)
  39.128 @@ -734,42 +705,37 @@ class XendDomain:
  39.129          except Exception, ex:
  39.130              raise XendError(str(ex))
  39.131  
  39.132 -    def domain_mem_target_set(self, id, mem):
  39.133 +    def domain_mem_target_set(self, domid, mem):
  39.134          """Set the memory target for a domain.
  39.135  
  39.136 -        @param id: domain
  39.137 -        @param mem: memory target (in MB)
  39.138 -        @return: 0 on success, -1 on error
  39.139 +        @param mem: memory target (in MiB)
  39.140          """
  39.141 -        dominfo = self.domain_lookup(id)
  39.142 -        return dominfo.setMemoryTarget(mem << 10)
  39.143 +        self.callInfo(domid, XendDomainInfo.setMemoryTarget, mem << 10)
  39.144 +
  39.145  
  39.146 -    def domain_vcpu_hotplug(self, id, vcpu, state):
  39.147 -        """Enable or disable VCPU vcpu in DOM id
  39.148 +    def domain_vcpu_hotplug(self, domid, vcpu, state):
  39.149 +        """Enable or disable specified VCPU in specified domain
  39.150  
  39.151 -        @param id: domain
  39.152          @param vcpu: target VCPU in domain
  39.153          @param state: which state VCPU will become
  39.154 -        @return: 0 on success, -1 on error
  39.155          """
  39.156 +        self.callInfo(domid, XendDomainInfo.vcpu_hotplug, vcpu, state)
  39.157 +
  39.158  
  39.159 -        dominfo = self.domain_lookup(id)
  39.160 -        return dominfo.vcpu_hotplug(vcpu, state)
  39.161 +    def domain_dumpcore(self, domid):
  39.162 +        """Save a core dump for a crashed domain."""
  39.163 +        self.callInfo(domid, XendDomainInfo.dumpCore)
  39.164  
  39.165 -    def domain_dumpcore(self, id):
  39.166 -        """Save a core dump for a crashed domain.
  39.167  
  39.168 -        @param id: domain
  39.169 -        """
  39.170 -        dominfo = self.domain_lookup(id)
  39.171 -        corefile = "/var/xen/dump/%s.%s.core" % (dominfo.getName(),
  39.172 -                                                 dominfo.getDomid())
  39.173 -        try:
  39.174 -            xc.domain_dumpcore(dom=dominfo.getDomid(), corefile=corefile)
  39.175 -        except Exception, ex:
  39.176 -            log.warning("Dumpcore failed, id=%s name=%s: %s",
  39.177 -                        dominfo.getDomid(), dominfo.getName(), ex)
  39.178 -        
  39.179 +    ## private:
  39.180 +
  39.181 +    def callInfo(self, domid, fn, *args, **kwargs):
  39.182 +        self.refresh()
  39.183 +        dominfo = self.domains.get(domid)
  39.184 +        if dominfo:
  39.185 +            return fn(dominfo, *args, **kwargs)
  39.186 +
  39.187 +
  39.188  def instance():
  39.189      """Singleton constructor. Use this instead of the class constructor.
  39.190      """
    40.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Thu Sep 29 13:35:13 2005 -0600
    40.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Thu Sep 29 16:22:02 2005 -0600
    40.3 @@ -34,6 +34,7 @@ from xen.util.blkif import blkdev_uname_
    40.4  
    40.5  from xen.xend.server.channel import EventChannel
    40.6  
    40.7 +from xen.xend import image
    40.8  from xen.xend import sxp
    40.9  from xen.xend.XendBootloader import bootloader
   40.10  from xen.xend.XendLogging import log
   40.11 @@ -319,6 +320,7 @@ class XendDomainInfo:
   40.12  
   40.13          try:
   40.14              defaultInfo('name',         lambda: "Domain-%d" % self.domid)
   40.15 +            defaultInfo('ssidref',      lambda: 0)
   40.16              defaultInfo('restart_mode', lambda: RESTART_ONREBOOT)
   40.17              defaultInfo('cpu_weight',   lambda: 1.0)
   40.18              defaultInfo('bootloader',   lambda: None)
   40.19 @@ -511,6 +513,19 @@ class XendDomainInfo:
   40.20                        self.info['backend'], 0)
   40.21  
   40.22  
   40.23 +    def dumpCore(self):
   40.24 +        """Create a core dump for this domain.  Nothrow guarantee."""
   40.25 +        
   40.26 +        try:
   40.27 +            corefile = "/var/xen/dump/%s.%s.core" % (self.info['name'],
   40.28 +                                                     self.domid)
   40.29 +            xc.domain_dumpcore(dom = self.domid, corefile = corefile)
   40.30 +
   40.31 +        except Exception, exn:
   40.32 +            log.error("XendDomainInfo.dumpCore failed: id = %s name = %s: %s",
   40.33 +                      self.domid, self.info['name'], str(exn))
   40.34 +
   40.35 +
   40.36      def closeStoreChannel(self):
   40.37          """Close the store channel, if any.  Nothrow guarantee."""
   40.38          
   40.39 @@ -614,7 +629,7 @@ class XendDomainInfo:
   40.40              sxpr.append(['maxmem', self.info['maxmem_KiB'] / 1024])
   40.41  
   40.42              if self.infoIsSet('device'):
   40.43 -                for (n, c) in self.info['device']:
   40.44 +                for (_, c) in self.info['device']:
   40.45                      sxpr.append(['device', c])
   40.46  
   40.47              def stateChar(name):
   40.48 @@ -706,13 +721,6 @@ class XendDomainInfo:
   40.49          """
   40.50          # todo - add support for scheduling params?
   40.51          try:
   40.52 -            if 'image' not in self.info:
   40.53 -                raise VmError('Missing image in configuration')
   40.54 -
   40.55 -            self.image = ImageHandler.create(self,
   40.56 -                                             self.info['image'],
   40.57 -                                             self.info['device'])
   40.58 -
   40.59              self.initDomain()
   40.60  
   40.61              # Create domain devices.
   40.62 @@ -737,6 +745,14 @@ class XendDomainInfo:
   40.63  
   40.64          self.domid = xc.domain_create(dom = self.domid or 0,
   40.65                                        ssidref = self.info['ssidref'])
   40.66 +
   40.67 +        if 'image' not in self.info:
   40.68 +            raise VmError('Missing image in configuration')
   40.69 +
   40.70 +        self.image = image.create(self,
   40.71 +                                  self.info['image'],
   40.72 +                                  self.info['device'])
   40.73 +
   40.74          if self.domid <= 0:
   40.75              raise VmError('Creating domain failed: name=%s' %
   40.76                            self.info['name'])
   40.77 @@ -839,20 +855,20 @@ class XendDomainInfo:
   40.78          """Release all vm devices.
   40.79          """
   40.80  
   40.81 -        t = xstransact("%s/device" % self.path)
   40.82 -
   40.83 -        for n in controllerClasses.keys():
   40.84 -            for d in t.list(n):
   40.85 -                try:
   40.86 -                    t.remove(d)
   40.87 -                except ex:
   40.88 -                    # Log and swallow any exceptions in removal -- there's
   40.89 -                    # nothing more we can do.
   40.90 -                    log.exception(
   40.91 -                        "Device release failed: %s; %s; %s; %s" %
   40.92 -                        (self.info['name'], n, d, str(ex)))
   40.93 -        t.commit()
   40.94 -
   40.95 +        while True:
   40.96 +            t = xstransact("%s/device" % self.path)
   40.97 +            for n in controllerClasses.keys():
   40.98 +                for d in t.list(n):
   40.99 +                    try:
  40.100 +                        t.remove(d)
  40.101 +                    except ex:
  40.102 +                        # Log and swallow any exceptions in removal --
  40.103 +                        # there's nothing more we can do.
  40.104 +                        log.exception(
  40.105 +                           "Device release failed: %s; %s; %s; %s" %
  40.106 +                            (self.info['name'], n, d, str(ex)))
  40.107 +            if t.commit():
  40.108 +                break
  40.109  
  40.110      def eventChannel(self, path=None):
  40.111          """Create an event channel to the domain.
  40.112 @@ -1085,19 +1101,6 @@ class XendDomainInfo:
  40.113  
  40.114  
  40.115  #============================================================================
  40.116 -# Register image handlers.
  40.117 -
  40.118 -from image import          \
  40.119 -     addImageHandlerClass, \
  40.120 -     ImageHandler,         \
  40.121 -     LinuxImageHandler,    \
  40.122 -     VmxImageHandler
  40.123 -
  40.124 -addImageHandlerClass(LinuxImageHandler)
  40.125 -addImageHandlerClass(VmxImageHandler)
  40.126 -
  40.127 -
  40.128 -#============================================================================
  40.129  # Register device controllers and their device config types.
  40.130  
  40.131  """A map from device-class names to the subclass of DevController that
    41.1 --- a/tools/python/xen/xend/image.py	Thu Sep 29 13:35:13 2005 -0600
    41.2 +++ b/tools/python/xen/xend/image.py	Thu Sep 29 16:22:02 2005 -0600
    41.3 @@ -33,6 +33,15 @@ xc = xen.lowlevel.xc.new()
    41.4  
    41.5  MAX_GUEST_CMDLINE = 1024
    41.6  
    41.7 +
    41.8 +def create(vm, imageConfig, deviceConfig):
    41.9 +    """Create an image handler for a vm.
   41.10 +
   41.11 +    @return ImageHandler instance
   41.12 +    """
   41.13 +    return findImageHandlerClass(imageConfig)(vm, imageConfig, deviceConfig)
   41.14 +
   41.15 +
   41.16  class ImageHandler:
   41.17      """Abstract base class for image handlers.
   41.18  
   41.19 @@ -48,81 +57,39 @@ class ImageHandler:
   41.20  
   41.21      The method destroy() is called when the domain is destroyed.
   41.22      The default is to do nothing.
   41.23 -    
   41.24      """
   41.25  
   41.26 -    #======================================================================
   41.27 -    # Class vars and methods.
   41.28 -
   41.29 -    """Table of image handler classes for virtual machine images.
   41.30 -    Indexed by image type.
   41.31 -    """
   41.32 -    imageHandlerClasses = {}
   41.33 -
   41.34 -    def addImageHandlerClass(cls, h):
   41.35 -        """Add a handler class for an image type
   41.36 -        @param h:        handler: ImageHandler subclass
   41.37 -        """
   41.38 -        cls.imageHandlerClasses[h.ostype] = h
   41.39 -
   41.40 -    addImageHandlerClass = classmethod(addImageHandlerClass)
   41.41 -
   41.42 -    def findImageHandlerClass(cls, image):
   41.43 -        """Find the image handler class for an image config.
   41.44 -
   41.45 -        @param image config
   41.46 -        @return ImageHandler subclass or None
   41.47 -        """
   41.48 -        ty = sxp.name(image)
   41.49 -        if ty is None:
   41.50 -            raise VmError('missing image type')
   41.51 -        imageClass = cls.imageHandlerClasses.get(ty)
   41.52 -        if imageClass is None:
   41.53 -            raise VmError('unknown image type: ' + ty)
   41.54 -        return imageClass
   41.55 -
   41.56 -    findImageHandlerClass = classmethod(findImageHandlerClass)
   41.57 -
   41.58 -    def create(cls, vm, imageConfig, deviceConfig):
   41.59 -        """Create an image handler for a vm.
   41.60 -
   41.61 -        @return ImageHandler instance
   41.62 -        """
   41.63 -        imageClass = cls.findImageHandlerClass(imageConfig)
   41.64 -        return imageClass(vm, imageConfig, deviceConfig)
   41.65 -
   41.66 -    create = classmethod(create)
   41.67 -
   41.68 -    #======================================================================
   41.69 -    # Instance vars and methods.
   41.70 -
   41.71      ostype = None
   41.72  
   41.73 -    kernel = None
   41.74 -    ramdisk = None
   41.75 -    cmdline = None
   41.76 -
   41.77 -    flags = 0
   41.78  
   41.79      def __init__(self, vm, imageConfig, deviceConfig):
   41.80          self.vm = vm
   41.81 +
   41.82 +        self.kernel = None
   41.83 +        self.ramdisk = None
   41.84 +        self.cmdline = None
   41.85 +        self.flags = 0
   41.86 +
   41.87          self.configure(imageConfig, deviceConfig)
   41.88  
   41.89      def configure(self, imageConfig, _):
   41.90          """Config actions common to all unix-like domains."""
   41.91  
   41.92 -        self.kernel = sxp.child_value(imageConfig, "kernel")
   41.93 +        def get_cfg(name, default = None):
   41.94 +            return sxp.child_value(imageConfig, name, default)
   41.95 +
   41.96 +        self.kernel = get_cfg("kernel")
   41.97          self.cmdline = ""
   41.98 -        ip = sxp.child_value(imageConfig, "ip", None)
   41.99 +        ip = get_cfg("ip")
  41.100          if ip:
  41.101              self.cmdline += " ip=" + ip
  41.102 -        root = sxp.child_value(imageConfig, "root")
  41.103 +        root = get_cfg("root")
  41.104          if root:
  41.105              self.cmdline += " root=" + root
  41.106 -        args = sxp.child_value(imageConfig, "args")
  41.107 +        args = get_cfg("args")
  41.108          if args:
  41.109              self.cmdline += " " + args
  41.110 -        self.ramdisk = sxp.child_value(imageConfig, "ramdisk", '')
  41.111 +        self.ramdisk = get_cfg("ramdisk", '')
  41.112          
  41.113          self.vm.storeVm(("image/ostype", self.ostype),
  41.114                          ("image/kernel", self.kernel),
  41.115 @@ -130,7 +97,7 @@ class ImageHandler:
  41.116                          ("image/ramdisk", self.ramdisk))
  41.117  
  41.118  
  41.119 -    def handleBootloading():
  41.120 +    def handleBootloading(self):
  41.121          self.unlink(self.kernel)
  41.122          self.unlink(self.ramdisk)
  41.123  
  41.124 @@ -194,7 +161,6 @@ class ImageHandler:
  41.125          if d.has_key('console_mfn'):
  41.126              self.vm.setConsoleRef(d.get('console_mfn'))
  41.127  
  41.128 -addImageHandlerClass = ImageHandler.addImageHandlerClass
  41.129  
  41.130  class LinuxImageHandler(ImageHandler):
  41.131  
  41.132 @@ -238,22 +204,19 @@ class VmxImageHandler(ImageHandler):
  41.133  
  41.134      def configure(self, imageConfig, deviceConfig):
  41.135          ImageHandler.configure(self, imageConfig, deviceConfig)
  41.136 -        
  41.137 -        self.memmap = sxp.child_value(imageConfig, 'memmap')
  41.138 +
  41.139          self.dmargs = self.parseDeviceModelArgs(imageConfig, deviceConfig)
  41.140          self.device_model = sxp.child_value(imageConfig, 'device_model')
  41.141          if not self.device_model:
  41.142              raise VmError("vmx: missing device model")
  41.143          self.display = sxp.child_value(imageConfig, 'display')
  41.144  
  41.145 -        self.vm.storeVm(("image/memmap", self.memmap),
  41.146 -                        ("image/dmargs", " ".join(self.dmargs)),
  41.147 +        self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)),
  41.148                          ("image/device-model", self.device_model),
  41.149                          ("image/display", self.display))
  41.150  
  41.151          self.device_channel = None
  41.152          self.pid = 0
  41.153 -        self.memmap_value = []
  41.154  
  41.155          self.dmargs += self.configVNC(imageConfig)
  41.156  
  41.157 @@ -261,7 +224,6 @@ class VmxImageHandler(ImageHandler):
  41.158      def createImage(self):
  41.159          """Create a VM for the VMX environment.
  41.160          """
  41.161 -        self.parseMemmap()
  41.162          self.createDomain()
  41.163  
  41.164      def buildDomain(self):
  41.165 @@ -278,9 +240,6 @@ class VmxImageHandler(ImageHandler):
  41.166          log.debug("control_evtchn = %d", self.device_channel.port2)
  41.167          log.debug("store_evtchn   = %d", store_evtchn)
  41.168          log.debug("memsize        = %d", self.vm.getMemoryTarget() / 1024)
  41.169 -        log.debug("memmap         = %s", self.memmap_value)
  41.170 -        log.debug("cmdline        = %s", self.cmdline)
  41.171 -        log.debug("ramdisk        = %s", self.ramdisk)
  41.172          log.debug("flags          = %d", self.flags)
  41.173          log.debug("vcpus          = %d", self.vm.getVCpuCount())
  41.174  
  41.175 @@ -289,9 +248,6 @@ class VmxImageHandler(ImageHandler):
  41.176                             control_evtchn = self.device_channel.port2,
  41.177                             store_evtchn   = store_evtchn,
  41.178                             memsize        = self.vm.getMemoryTarget() / 1024,
  41.179 -                           memmap         = self.memmap_value,
  41.180 -                           cmdline        = self.cmdline,
  41.181 -                           ramdisk        = self.ramdisk,
  41.182                             flags          = self.flags,
  41.183                             vcpus          = self.vm.getVCpuCount())
  41.184          if isinstance(ret, dict):
  41.185 @@ -299,18 +255,11 @@ class VmxImageHandler(ImageHandler):
  41.186              return 0
  41.187          return ret
  41.188  
  41.189 -    def parseMemmap(self):
  41.190 -        if self.memmap is None:
  41.191 -            return
  41.192 -        memmap = sxp.parse(open(self.memmap))[0]
  41.193 -        from xen.util.memmap import memmap_parse
  41.194 -        self.memmap_value = memmap_parse(memmap)
  41.195 -        
  41.196      # Return a list of cmd line args to the device models based on the
  41.197      # xm config file
  41.198      def parseDeviceModelArgs(self, imageConfig, deviceConfig):
  41.199          dmargs = [ 'cdrom', 'boot', 'fda', 'fdb',
  41.200 -                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ] 
  41.201 +                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ]
  41.202          ret = []
  41.203          for a in dmargs:
  41.204              v = sxp.child_value(imageConfig, a)
  41.205 @@ -439,3 +388,28 @@ class VmxImageHandler(ImageHandler):
  41.206              return 16 * 1024
  41.207          else:
  41.208              return (1 + ((mem_mb + 3) >> 2)) * 4
  41.209 +
  41.210 +
  41.211 +"""Table of image handler classes for virtual machine images.  Indexed by
  41.212 +image type.
  41.213 +"""
  41.214 +imageHandlerClasses = {}
  41.215 +
  41.216 +
  41.217 +for h in LinuxImageHandler, VmxImageHandler:
  41.218 +    imageHandlerClasses[h.ostype] = h
  41.219 +
  41.220 +
  41.221 +def findImageHandlerClass(image):
  41.222 +    """Find the image handler class for an image config.
  41.223 +
  41.224 +    @param image config
  41.225 +    @return ImageHandler subclass or None
  41.226 +    """
  41.227 +    ty = sxp.name(image)
  41.228 +    if ty is None:
  41.229 +        raise VmError('missing image type')
  41.230 +    imageClass = imageHandlerClasses.get(ty)
  41.231 +    if imageClass is None:
  41.232 +        raise VmError('unknown image type: ' + ty)
  41.233 +    return imageClass
    42.1 --- a/tools/python/xen/xend/server/DevController.py	Thu Sep 29 13:35:13 2005 -0600
    42.2 +++ b/tools/python/xen/xend/server/DevController.py	Thu Sep 29 16:22:02 2005 -0600
    42.3 @@ -126,20 +126,21 @@ class DevController:
    42.4          compulsory to use it; subclasses may prefer to allocate IDs based upon
    42.5          the device configuration instead.
    42.6          """
    42.7 -        path = self.frontendMiscPath()
    42.8 -        t = xstransact(path)
    42.9 -        try:
   42.10 -            result = t.read("nextDeviceID")
   42.11 -            if result:
   42.12 -                result = int(result)
   42.13 -            else:
   42.14 -                result = 1
   42.15 -            t.write("nextDeviceID", str(result + 1))
   42.16 -            t.commit()
   42.17 -            return result
   42.18 -        except:
   42.19 -            t.abort()
   42.20 -            raise
   42.21 +        while True:
   42.22 +            path = self.frontendMiscPath()
   42.23 +            t = xstransact(path)
   42.24 +            try:
   42.25 +                result = t.read("nextDeviceID")
   42.26 +                if result:
   42.27 +                    result = int(result)
   42.28 +                else:
   42.29 +                    result = 1
   42.30 +                t.write("nextDeviceID", str(result + 1))
   42.31 +                if t.commit():
   42.32 +                    return result
   42.33 +            except:
   42.34 +                t.abort()
   42.35 +                raise
   42.36  
   42.37  
   42.38      ## private:
    43.1 --- a/tools/python/xen/xend/xenstore/xsnode.py	Thu Sep 29 13:35:13 2005 -0600
    43.2 +++ b/tools/python/xen/xend/xenstore/xsnode.py	Thu Sep 29 16:22:02 2005 -0600
    43.3 @@ -280,8 +280,8 @@ class XenStore:
    43.4                                 (', while writing %s : %s' % (str(path),
    43.5                                                               str(data))))
    43.6  
    43.7 -    def begin(self, path):
    43.8 -        self.getxs().transaction_start(path)
    43.9 +    def begin(self):
   43.10 +        self.getxs().transaction_start()
   43.11  
   43.12      def commit(self, abandon=False):
   43.13          self.getxs().transaction_end(abort=abandon)
    44.1 --- a/tools/python/xen/xend/xenstore/xstransact.py	Thu Sep 29 13:35:13 2005 -0600
    44.2 +++ b/tools/python/xen/xend/xenstore/xstransact.py	Thu Sep 29 16:22:02 2005 -0600
    44.3 @@ -14,16 +14,8 @@ class xstransact:
    44.4      def __init__(self, path):
    44.5          self.in_transaction = False
    44.6          self.path = path.rstrip("/")
    44.7 -        while True:
    44.8 -            try:
    44.9 -                xshandle().transaction_start(path)
   44.10 -                self.in_transaction = True
   44.11 -                return
   44.12 -            except RuntimeError, ex:
   44.13 -                if ex.args[0] == errno.ENOENT and path != "/":
   44.14 -                    path = "/".join(path.split("/")[0:-1]) or "/"
   44.15 -                else:
   44.16 -                    raise
   44.17 +        xshandle().transaction_start()
   44.18 +        self.in_transaction = True
   44.19  
   44.20      def __del__(self):
   44.21          if self.in_transaction:
   44.22 @@ -175,14 +167,8 @@ class xstransact:
   44.23              t = cls(path)
   44.24              try:
   44.25                  v = t.read(*args)
   44.26 -                t.commit()
   44.27 +                t.abort()
   44.28                  return v
   44.29 -            except RuntimeError, ex:
   44.30 -                t.abort()
   44.31 -                if ex.args[0] == errno.ETIMEDOUT:
   44.32 -                    pass
   44.33 -                else:
   44.34 -                    raise
   44.35              except:
   44.36                  t.abort()
   44.37                  raise
   44.38 @@ -194,14 +180,8 @@ class xstransact:
   44.39              t = cls(path)
   44.40              try:
   44.41                  t.write(*args, **opts)
   44.42 -                t.commit()
   44.43 -                return
   44.44 -            except RuntimeError, ex:
   44.45 -                t.abort()
   44.46 -                if ex.args[0] == errno.ETIMEDOUT:
   44.47 -                    pass
   44.48 -                else:
   44.49 -                    raise
   44.50 +                if t.commit():
   44.51 +                    return
   44.52              except:
   44.53                  t.abort()
   44.54                  raise
   44.55 @@ -217,14 +197,8 @@ class xstransact:
   44.56              t = cls(path)
   44.57              try:
   44.58                  t.remove(*args)
   44.59 -                t.commit()
   44.60 -                return
   44.61 -            except RuntimeError, ex:
   44.62 -                t.abort()
   44.63 -                if ex.args[0] == errno.ETIMEDOUT:
   44.64 -                    pass
   44.65 -                else:
   44.66 -                    raise
   44.67 +                if t.commit():
   44.68 +                    return
   44.69              except:
   44.70                  t.abort()
   44.71                  raise
   44.72 @@ -236,14 +210,8 @@ class xstransact:
   44.73              t = cls(path)
   44.74              try:
   44.75                  v = t.list(*args)
   44.76 -                t.commit()
   44.77 -                return v
   44.78 -            except RuntimeError, ex:
   44.79 -                t.abort()
   44.80 -                if ex.args[0] == errno.ETIMEDOUT:
   44.81 -                    pass
   44.82 -                else:
   44.83 -                    raise
   44.84 +                if t.commit():
   44.85 +                    return v
   44.86              except:
   44.87                  t.abort()
   44.88                  raise
   44.89 @@ -255,14 +223,8 @@ class xstransact:
   44.90              t = cls(path)
   44.91              try:
   44.92                  v = t.gather(*args)
   44.93 -                t.commit()
   44.94 -                return v
   44.95 -            except RuntimeError, ex:
   44.96 -                t.abort()
   44.97 -                if ex.args[0] == errno.ETIMEDOUT:
   44.98 -                    pass
   44.99 -                else:
  44.100 -                    raise
  44.101 +                if t.commit():
  44.102 +                    return v
  44.103              except:
  44.104                  t.abort()
  44.105                  raise
  44.106 @@ -274,14 +236,8 @@ class xstransact:
  44.107              t = cls(path)
  44.108              try:
  44.109                  v = t.store(*args)
  44.110 -                t.commit()
  44.111 -                return v
  44.112 -            except RuntimeError, ex:
  44.113 -                t.abort()
  44.114 -                if ex.args[0] == errno.ETIMEDOUT:
  44.115 -                    pass
  44.116 -                else:
  44.117 -                    raise
  44.118 +                if t.commit():
  44.119 +                    return v
  44.120              except:
  44.121                  t.abort()
  44.122                  raise
    45.1 --- a/tools/python/xen/xm/main.py	Thu Sep 29 13:35:13 2005 -0600
    45.2 +++ b/tools/python/xen/xm/main.py	Thu Sep 29 16:22:02 2005 -0600
    45.3 @@ -1,5 +1,6 @@
    45.4  # (C) Copyright IBM Corp. 2005
    45.5  # Copyright (C) 2004 Mike Wray
    45.6 +# Copyright (c) 2005 XenSource Ltd
    45.7  #
    45.8  # Authors:
    45.9  #     Sean Dague <sean at dague dot net>
   45.10 @@ -169,12 +170,6 @@ def handle_xend_error(cmd, dom, ex):
   45.11  #
   45.12  #########################################################################
   45.13  
   45.14 -def xm_create(args):
   45.15 -    from xen.xm import create
   45.16 -    # ugly hack because the opt parser apparently wants
   45.17 -    # the subcommand name just to throw it away!
   45.18 -    create.main(["bogus"] + args)
   45.19 -
   45.20  def xm_save(args):
   45.21      arg_check(args,2,"save")
   45.22  
   45.23 @@ -196,13 +191,6 @@ def xm_restore(args):
   45.24      if id is not None:
   45.25          server.xend_domain_unpause(domid)
   45.26  
   45.27 -def xm_migrate(args):
   45.28 -    # TODO: arg_check
   45.29 -    from xen.xm import migrate
   45.30 -    # ugly hack because the opt parser apparently wants
   45.31 -    # the subcommand name just to throw it away!
   45.32 -    migrate.main(["bogus"] + args)
   45.33 -
   45.34  def xm_list(args):
   45.35      use_long = 0
   45.36      show_vcpus = 0
   45.37 @@ -290,14 +278,6 @@ def xm_show_vcpus(domsinfo):
   45.38  def xm_vcpu_list(args):
   45.39      xm_list(["-v"] + args)
   45.40  
   45.41 -def xm_destroy(args):
   45.42 -    arg_check(args,1,"destroy")
   45.43 -
   45.44 -    from xen.xm import destroy
   45.45 -    # ugly hack because the opt parser apparently wants
   45.46 -    # the subcommand name just to throw it away!
   45.47 -    destroy.main(["bogus"] + args)
   45.48 -            
   45.49  def xm_reboot(args):
   45.50      arg_check(args,1,"reboot")
   45.51      from xen.xm import shutdown
   45.52 @@ -305,20 +285,6 @@ def xm_reboot(args):
   45.53      # the subcommand name just to throw it away!
   45.54      shutdown.main(["bogus", "-R"] + args)
   45.55  
   45.56 -def xm_shutdown(args):
   45.57 -    arg_check(args,1,"shutdown")
   45.58 -
   45.59 -    from xen.xm import shutdown
   45.60 -    # ugly hack because the opt parser apparently wants
   45.61 -    # the subcommand name just to throw it away!
   45.62 -    shutdown.main(["bogus"] + args)
   45.63 -
   45.64 -def xm_sysrq(args):
   45.65 -    from xen.xm import sysrq
   45.66 -    # ugly hack because the opt parser apparently wants
   45.67 -    # the subcommand name just to throw it away!
   45.68 -    sysrq.main(["bogus"] + args)
   45.69 -
   45.70  def xm_pause(args):
   45.71      arg_check(args, 1, "pause")
   45.72      dom = args[0]
   45.73 @@ -333,6 +299,11 @@ def xm_unpause(args):
   45.74      from xen.xend.XendClient import server
   45.75      server.xend_domain_unpause(dom)
   45.76  
   45.77 +def xm_subcommand(command, args):
   45.78 +    cmd = __import__(command, globals(), locals(), 'xen.xm')
   45.79 +    cmd.main(["bogus"] + args)
   45.80 +
   45.81 +
   45.82  #############################################################
   45.83  
   45.84  def cpu_make_map(cpulist):
   45.85 @@ -506,14 +477,6 @@ def xm_network_list(args):
   45.86          sxp.show(x)
   45.87          print
   45.88  
   45.89 -def xm_network_attach(args):
   45.90 -
   45.91 -    print "Not implemented"
   45.92 -
   45.93 -def xm_network_detach(args):
   45.94 -
   45.95 -    print "Not implemented"
   45.96 -    
   45.97  def xm_block_list(args):
   45.98      arg_check(args,1,"block-list")
   45.99      dom = args[0]
  45.100 @@ -609,11 +572,8 @@ commands = {
  45.101      # domain commands
  45.102      "domid": xm_domid,
  45.103      "domname": xm_domname,
  45.104 -    "create": xm_create,
  45.105 -    "destroy": xm_destroy,
  45.106      "restore": xm_restore,
  45.107      "save": xm_save,
  45.108 -    "shutdown": xm_shutdown,
  45.109      "reboot": xm_reboot,
  45.110      "list": xm_list,
  45.111      # memory commands
  45.112 @@ -625,10 +585,7 @@ commands = {
  45.113      "vcpu-enable": xm_vcpu_enable,
  45.114      "vcpu-disable": xm_vcpu_disable,
  45.115      "vcpu-list": xm_vcpu_list,
  45.116 -    # migration
  45.117 -    "migrate": xm_migrate,
  45.118      # special
  45.119 -    "sysrq": xm_sysrq,
  45.120      "pause": xm_pause,
  45.121      "unpause": xm_unpause,
  45.122      # host commands
  45.123 @@ -647,14 +604,24 @@ commands = {
  45.124      # network
  45.125      "network-limit": xm_network_limit,
  45.126      "network-list": xm_network_list,
  45.127 -    "network-attach": xm_network_attach,
  45.128 -    "network-detach": xm_network_detach,
  45.129      # vnet
  45.130      "vnet-list": xm_vnet_list,
  45.131      "vnet-create": xm_vnet_create,
  45.132      "vnet-delete": xm_vnet_delete,
  45.133      }
  45.134  
  45.135 +## The commands supported by a separate argument parser in xend.xm.
  45.136 +subcommands = [
  45.137 +    'create',
  45.138 +    'destroy',
  45.139 +    'migrate',
  45.140 +    'sysrq',
  45.141 +    'shutdown'
  45.142 +    ]
  45.143 +
  45.144 +for c in subcommands:
  45.145 +    commands[c] = eval('lambda args: xm_subcommand("%s", args)' % c)
  45.146 +
  45.147  aliases = {
  45.148      "balloon": "mem-set",
  45.149      "vif-list": "network-list",
  45.150 @@ -669,6 +636,7 @@ help = {
  45.151      "--long": longhelp
  45.152     }
  45.153  
  45.154 +
  45.155  def xm_lookup_cmd(cmd):
  45.156      if commands.has_key(cmd):
  45.157          return commands[cmd]
  45.158 @@ -688,9 +656,7 @@ def deprecated(old,new):
  45.159      err('Option %s is the new replacement, see "xm help %s" for more info' % (new, new))
  45.160  
  45.161  def usage(cmd=None):
  45.162 -    if cmd == "full":
  45.163 -        print fullhelp
  45.164 -    elif help.has_key(cmd):
  45.165 +    if help.has_key(cmd):
  45.166          print help[cmd]
  45.167      else:
  45.168          print shorthelp
  45.169 @@ -701,7 +667,7 @@ def main(argv=sys.argv):
  45.170          usage()
  45.171      
  45.172      if re.compile('-*help').match(argv[1]):
  45.173 -	if len(argv) > 2 and help.has_key(argv[2]):
  45.174 +	if len(argv) > 2:
  45.175  	    usage(argv[2])
  45.176  	else:
  45.177  	    usage()
    46.1 --- a/tools/xenstore/Makefile	Thu Sep 29 13:35:13 2005 -0600
    46.2 +++ b/tools/xenstore/Makefile	Thu Sep 29 16:22:02 2005 -0600
    46.3 @@ -28,11 +28,11 @@ CLIENTS := xenstore-exists xenstore-list
    46.4  CLIENTS += xenstore-write
    46.5  CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
    46.6  
    46.7 -all: libxenstore.so xenstored $(CLIENTS)
    46.8 +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump
    46.9  
   46.10  testcode: xs_test xenstored_test xs_random xs_dom0_test
   46.11  
   46.12 -xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o
   46.13 +xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o tdb.o
   46.14  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
   46.15  
   46.16  $(CLIENTS): libxenstore.so
   46.17 @@ -42,7 +42,10 @@ xenstored: xenstored_core.o xenstored_wa
   46.18  $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
   46.19  	$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
   46.20  
   46.21 -xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o
   46.22 +xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
   46.23 +	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   46.24 +
   46.25 +xs_tdb_dump: xs_tdb_dump.o utils.o tdb.o talloc.o
   46.26  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   46.27  
   46.28  xs_test: xs_test.o xs_lib.o utils.o
   46.29 @@ -50,6 +53,11 @@ xs_random: xs_random.o xs_test_lib.o xs_
   46.30  xs_stress: xs_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o
   46.31  xs_crashme: xs_crashme.o xs_lib.o talloc.o utils.o
   46.32  
   46.33 +speedtest: speedtest.o xs.o xs_lib.o utils.o talloc.o
   46.34 +
   46.35 +check-speed: speedtest xenstored_test $(TESTDIR)
   46.36 +	$(TESTENV) time ./speedtest 100
   46.37 +
   46.38  xs_test.o xs_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o xs_crashme.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS)
   46.39  
   46.40  xenstored_%_test.o: xenstored_%.c
   46.41 @@ -98,7 +106,7 @@ RANDSEED=$(shell date +%s)
   46.42  randomcheck: xs_random xenstored_test $(TESTDIR)
   46.43  	$(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) && echo
   46.44  	$(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) && echo
   46.45 -	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   46.46 +#	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   46.47  
   46.48  crashme:  xs_crashme xenstored_test $(TESTDIR)
   46.49  	rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace
    47.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    47.2 +++ b/tools/xenstore/speedtest.c	Thu Sep 29 16:22:02 2005 -0600
    47.3 @@ -0,0 +1,130 @@
    47.4 +/* 
    47.5 +    Xen Store Daemon Speed test
    47.6 +    Copyright (C) 2005 Rusty Russell IBM Corporation
    47.7 +
    47.8 +    This program is free software; you can redistribute it and/or modify
    47.9 +    it under the terms of the GNU General Public License as published by
   47.10 +    the Free Software Foundation; either version 2 of the License, or
   47.11 +    (at your option) any later version.
   47.12 +
   47.13 +    This program is distributed in the hope that it will be useful,
   47.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   47.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   47.16 +    GNU General Public License for more details.
   47.17 +
   47.18 +    You should have received a copy of the GNU General Public License
   47.19 +    along with this program; if not, write to the Free Software
   47.20 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   47.21 +*/
   47.22 +
   47.23 +#include <stdlib.h>
   47.24 +#include <sys/types.h>
   47.25 +#include <sys/wait.h>
   47.26 +#include <stdio.h>
   47.27 +#include <stdarg.h>
   47.28 +#include <unistd.h>
   47.29 +#include <fcntl.h>
   47.30 +#include <errno.h>
   47.31 +#include "utils.h"
   47.32 +#include "xs.h"
   47.33 +#include "list.h"
   47.34 +#include "talloc.h"
   47.35 +/* Run cmd through system(); barf unless it exits with status 0. */
   47.36 +static void do_command(const char *cmd)
   47.37 +{
   47.38 +	int ret = system(cmd);
   47.39 +	if (ret == -1)	/* only in this case does errno describe the failure */
   47.40 +		barf_perror("Failed '%s': %i", cmd, ret);
   47.41 +	else if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0)
   47.42 +		barf("Failed '%s': %i", cmd, ret);
   47.43 +}
   47.44 +/* Clear testsuite/tmp, fork ./xenstored_test and return the child's pid. */
   47.45 +static int start_daemon(void)
   47.46 +{
   47.47 +	int fds[2], pid;
   47.48 +	char buffer[20];
   47.49 +
   47.50 +	do_command("rm -rf testsuite/tmp/*");
   47.51 +
   47.52 +	/* Child writes PID to the pipe when it's ready: we wait for that. */
   47.53 +	if (pipe(fds) != 0)
   47.54 +		barf_perror("Creating pipe");
   47.55 +
   47.56 +	pid = fork();
   47.57 +	/* Bail out now: a pid of -1 passed to kill() would signal every process. */
   47.58 +	if (pid < 0)
   47.59 +		barf_perror("Forking daemon");
   47.60 +
   47.61 +	if (pid == 0) {
   47.62 +		dup2(fds[1], STDOUT_FILENO);
   47.63 +		close(fds[0]);
   47.64 +		execlp("./xenstored_test", "xenstored_test", "--output-pid", "--no-fork", NULL);
   47.65 +		exit(1);
   47.66 +	}
   47.67 +	close(fds[1]);
   47.68 +	/* 0 is EOF: the child went away (e.g. exec failed) without reporting. */
   47.69 +	if (read(fds[0], buffer, sizeof(buffer)) <= 0)
   47.70 +		barf("Failed to summon daemon");
   47.71 +	close(fds[0]);
   47.72 +	return pid;
   47.73 +}
   47.74 +/* SIGTERM the daemon; errno is restored so a later barf_perror() is accurate. */
   47.75 +static void kill_daemon(int pid)
   47.76 +{
   47.77 +	const int err = errno;
   47.78 +	(void)kill(pid, SIGTERM);
   47.79 +	errno = err;
   47.80 +}
   47.81 +
   47.82 +#define NUM_ENTRIES 50
   47.83 +
   47.84 +/* We create the given number of trees, each with NUM_ENTRIES, using
   47.85 + * transactions.  Progress is shown as roughly 76 dots on stdout. */
   47.86 +int main(int argc, char *argv[])
   47.87 +{
   47.88 +	int i, j, pid, print, numdomains;
   47.89 +	struct xs_handle *h;
   47.90 +
   47.91 +	if (argc != 2)
   47.92 +		barf("Usage: speedtest <numdomains>");
   47.93 +	numdomains = atoi(argv[1]);	/* parse once, not on every loop test */
   47.94 +	print = numdomains >= 76 ? numdomains / 76 : 1;
   47.95 +
   47.96 +	pid = start_daemon();
   47.97 +	h = xs_daemon_open();
   47.98 +	if (!h) {	/* no connection: stop the daemon we started, then bail */
   47.99 +		kill_daemon(pid);
  47.100 +		barf_perror("Opening connection to daemon");
  47.101 +	}
  47.102 +
  47.103 +	for (i = 0; i < numdomains; i++) {
  47.104 +		char name[64];
  47.105 +
  47.106 +		if (i % print == 0)
  47.107 +			write(1, ".", 1);
  47.108 +		if (!xs_transaction_start(h, "/")) {
  47.109 +			kill_daemon(pid);
  47.110 +			barf_perror("Starting transaction");
  47.111 +		}
  47.112 +		sprintf(name, "/%i", i);
  47.113 +		if (!xs_mkdir(h, name)) {
  47.114 +			kill_daemon(pid);
  47.115 +			barf_perror("Making directory %s", name);
  47.116 +		}
  47.117 +		for (j = 0; j < NUM_ENTRIES; j++) {
  47.118 +			sprintf(name, "/%i/%i", i, j);
  47.119 +			if (!xs_write(h, name, name, strlen(name))) {
  47.120 +				kill_daemon(pid);
  47.121 +				barf_perror("Writing %s", name);
  47.122 +			}
  47.123 +		}
  47.124 +		if (!xs_transaction_end(h, false)) {
  47.125 +			kill_daemon(pid);
  47.126 +			barf_perror("Ending transaction");
  47.127 +		}
  47.128 +	}
  47.129 +	write(1, "\n", 1);
  47.130 +	kill_daemon(pid);
  47.131 +	wait(NULL);
  47.132 +	return 0;
  47.133 +}
    48.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    48.2 +++ b/tools/xenstore/tdb.c	Thu Sep 29 16:22:02 2005 -0600
    48.3 @@ -0,0 +1,2151 @@
    48.4 + /* 
    48.5 +   Unix SMB/CIFS implementation.
    48.6 +
    48.7 +   trivial database library
    48.8 +
    48.9 +   Copyright (C) Andrew Tridgell              1999-2004
   48.10 +   Copyright (C) Paul `Rusty' Russell		   2000
   48.11 +   Copyright (C) Jeremy Allison			   2000-2003
   48.12 +   
   48.13 +     ** NOTE! The following LGPL license applies to the tdb
   48.14 +     ** library. This does NOT imply that all of Samba is released
   48.15 +     ** under the LGPL
   48.16 +   
   48.17 +   This library is free software; you can redistribute it and/or
   48.18 +   modify it under the terms of the GNU Lesser General Public
   48.19 +   License as published by the Free Software Foundation; either
   48.20 +   version 2 of the License, or (at your option) any later version.
   48.21 +
   48.22 +   This library is distributed in the hope that it will be useful,
   48.23 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   48.24 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   48.25 +   Lesser General Public License for more details.
   48.26 +
   48.27 +   You should have received a copy of the GNU Lesser General Public
   48.28 +   License along with this library; if not, write to the Free Software
   48.29 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   48.30 +*/
   48.31 +
   48.32 +
   48.33 +#ifndef _SAMBA_BUILD_
   48.34 +#if HAVE_CONFIG_H
   48.35 +#include <config.h>
   48.36 +#endif
   48.37 +
   48.38 +#include <stdlib.h>
   48.39 +#include <stdio.h>
   48.40 +#include <stdint.h>
   48.41 +#include <fcntl.h>
   48.42 +#include <unistd.h>
   48.43 +#include <string.h>
   48.44 +#include <fcntl.h>
   48.45 +#include <errno.h>
   48.46 +#include <sys/mman.h>
   48.47 +#include <sys/stat.h>
   48.48 +#include "tdb.h"
   48.49 +#include <stdarg.h>
   48.50 +#include "talloc.h"
   48.51 +#define HAVE_MMAP
   48.52 +#else
   48.53 +#include "includes.h"
   48.54 +#include "lib/tdb/include/tdb.h"
   48.55 +#include "system/time.h"
   48.56 +#include "system/shmem.h"
   48.57 +#include "system/filesys.h"
   48.58 +#endif
   48.59 +
   48.60 +#define TDB_MAGIC_FOOD "TDB file\n"
   48.61 +#define TDB_VERSION (0x26011967 + 6)
   48.62 +#define TDB_MAGIC (0x26011999U)
   48.63 +#define TDB_FREE_MAGIC (~TDB_MAGIC)
   48.64 +#define TDB_DEAD_MAGIC (0xFEE1DEAD)
   48.65 +#define TDB_ALIGNMENT 4
   48.66 +#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
   48.67 +#define DEFAULT_HASH_SIZE 131
   48.68 +#define TDB_PAGE_SIZE 0x2000
   48.69 +#define FREELIST_TOP (sizeof(struct tdb_header))
   48.70 +#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
   48.71 +#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
   48.72 +#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
   48.73 +#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
   48.74 +#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
   48.75 +#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
   48.76 +
   48.77 +
   48.78 +/* NB assumes there is a local variable called "tdb" that is the
   48.79 + * current context, also takes doubly-parenthesized print-style
   48.80 + * argument. */
   48.81 +#define TDB_LOG(x) tdb->log_fn x
   48.82 +
   48.83 +/* lock offsets */
   48.84 +#define GLOBAL_LOCK 0
   48.85 +#define ACTIVE_LOCK 4
   48.86 +
   48.87 +#ifndef MAP_FILE
   48.88 +#define MAP_FILE 0
   48.89 +#endif
   48.90 +
   48.91 +#ifndef MAP_FAILED
   48.92 +#define MAP_FAILED ((void *)-1)
   48.93 +#endif
   48.94 +
   48.95 +#ifndef discard_const_p
   48.96 +# if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
   48.97 +#  define discard_const(ptr) ((void *)((intptr_t)(ptr)))
   48.98 +# else
   48.99 +#  define discard_const(ptr) ((void *)(ptr))
  48.100 +# endif
  48.101 +# define discard_const_p(type, ptr) ((type *)discard_const(ptr))
  48.102 +#endif
  48.103 +
  48.104 +/* free memory if the pointer is valid and zero the pointer */
  48.105 +#ifndef SAFE_FREE
  48.106 +#define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
  48.107 +#endif
  48.108 +
  48.109 +#define BUCKET(hash) ((hash) % tdb->header.hash_size)
  48.110 +TDB_DATA tdb_null;
  48.111 +
  48.112 +/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
  48.113 +static TDB_CONTEXT *tdbs = NULL;
  48.114 +/* Drop the file mapping, if any; returns munmap()'s non-zero result or 0. */
  48.115 +static int tdb_munmap(TDB_CONTEXT *tdb)
  48.116 +{
  48.117 +	int rc = 0;
  48.118 +
  48.119 +	if (tdb->flags & TDB_INTERNAL)
  48.120 +		return 0;
  48.121 +#ifdef HAVE_MMAP
  48.122 +	if (tdb->map_ptr != NULL)
  48.123 +		rc = munmap(tdb->map_ptr, tdb->map_size);
  48.124 +	if (rc != 0)
  48.125 +		return rc;
  48.126 +#endif
  48.127 +	tdb->map_ptr = NULL;
  48.128 +	return 0;
  48.129 +}
  48.130 +/* mmap the database file (not for TDB_INTERNAL); map_ptr is NULL if off or failed. */
  48.131 +static void tdb_mmap(TDB_CONTEXT *tdb)
  48.132 +{
  48.133 +	if (tdb->flags & TDB_INTERNAL)
  48.134 +		return;
  48.135 +
  48.136 +#ifdef HAVE_MMAP
  48.137 +	if (!(tdb->flags & TDB_NOMMAP)) {
  48.138 +		tdb->map_ptr = mmap(NULL, tdb->map_size, 
  48.139 +				    PROT_READ|(tdb->read_only? 0:PROT_WRITE), 
  48.140 +				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
  48.141 +
  48.142 +		/*
  48.143 +		 * NB. mmap signals failure with MAP_FAILED, *not* NULL.  Store NULL
  48.144 +		 * instead so that other code can simply test map_ptr.
  48.145 +		 */
  48.146 +		if (tdb->map_ptr == MAP_FAILED) {
  48.147 +			tdb->map_ptr = NULL;
  48.148 +			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n", 
  48.149 +				 tdb->map_size, strerror(errno)));
  48.150 +		}
  48.151 +	} else {
  48.152 +		tdb->map_ptr = NULL;
  48.153 +	}
  48.154 +#else
  48.155 +	tdb->map_ptr = NULL;
  48.156 +#endif
  48.157 +}
  48.158 +
  48.159 +/* Endian conversion: byte-swap each whole 4-byte word in buf; returns buf. */
  48.160 +static void *convert(void *buf, u32 size)
  48.161 +{
  48.162 +	u32 *word = buf;
  48.163 +	for (size /= 4; size > 0; size--, word++)
  48.164 +		*word = TDB_BYTEREV(*word);
  48.165 +	return buf;
  48.166 +}
  48.167 +#define DOCONV() (tdb->flags & TDB_CONVERT)
  48.168 +#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
  48.169 +
  48.170 +/* Record header.  The body of the database is made of one list of these
  48.171 +   for the free space plus a separate data list for each hash value */
  48.172 +struct list_struct {
  48.173 +	tdb_off next; /* offset of the next record in the list (0 ends it) */
  48.174 +	tdb_len rec_len; /* byte length of the area after this header */
  48.175 +	tdb_len key_len; /* byte length of key */
  48.176 +	tdb_len data_len; /* byte length of data */
  48.177 +	u32 full_hash; /* the full 32 bit hash of the key */
  48.178 +	u32 magic;   /* TDB_MAGIC, TDB_FREE_MAGIC or TDB_DEAD_MAGIC */
  48.179 +	/* the following union is implied:
  48.180 +		union {
  48.181 +			char record[rec_len];
  48.182 +			struct {
  48.183 +				char key[key_len];
  48.184 +				char data[data_len];
  48.185 +			}
  48.186 +			u32 totalsize; (tailer: header size + rec_len, stored last)
  48.187 +		}
  48.188 +	*/
  48.189 +};
  48.190 +
  48.191 +/* a byte range locking function - return 0 on success (always 0 when
  48.192 +   TDB_NOLOCK is set).  Applies rw_type (e.g. F_WRLCK, F_UNLCK) to the 1
  48.193 +   byte at the specified offset using fcntl command lck_type.  A write lock
  48.194 +   on a read-only tdb fails with errno = EACCES.  On error, errno is also set
  48.195 +   so that errors are passed back properly through tdb_open(). */
  48.196 +static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset, 
  48.197 +		      int rw_type, int lck_type, int probe)
  48.198 +{
  48.199 +	struct flock fl;
  48.200 +	int ret;
  48.201 +
  48.202 +	if (tdb->flags & TDB_NOLOCK)
  48.203 +		return 0;
  48.204 +	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
  48.205 +		errno = EACCES;
  48.206 +		return -1;
  48.207 +	}
  48.208 +
  48.209 +	fl.l_type = rw_type;
  48.210 +	fl.l_whence = SEEK_SET;
  48.211 +	fl.l_start = offset;
  48.212 +	fl.l_len = 1;
  48.213 +	fl.l_pid = 0;
  48.214 +	/* Retry for as long as fcntl is interrupted by a signal. */
  48.215 +	do {
  48.216 +		ret = fcntl(tdb->fd,lck_type,&fl);
  48.217 +	} while (ret == -1 && errno == EINTR);
  48.218 +
  48.219 +	if (ret == -1) {
  48.220 +		if (!probe && lck_type != F_SETLK) {
  48.221 +			/* Ensure error code is set for log fun to examine. */
  48.222 +			tdb->ecode = TDB_ERR_LOCK;
  48.223 +			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n", 
  48.224 +				 tdb->fd, offset, rw_type, lck_type));
  48.225 +		}
  48.226 +		/* Generic lock error. errno set by fcntl.
  48.227 +		 * EAGAIN is an expected return from non-blocking
  48.228 +		 * locks. */
  48.229 +		if (errno != EAGAIN) {
  48.230 +		TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n", 
  48.231 +				 tdb->fd, offset, rw_type, lck_type, 
  48.232 +				 strerror(errno)));
  48.233 +		}
  48.234 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
  48.235 +	}
  48.236 +	return 0;
  48.237 +}
  48.238 +
  48.239 +/* lock a list in the database. list -1 is the alloc list */
  48.240 +static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
  48.241 +{
  48.242 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  48.243 +		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n", 
  48.244 +			   list, ltype));
  48.245 +		return -1;
  48.246 +	}
  48.247 +	if (tdb->flags & TDB_NOLOCK)
  48.248 +		return 0;
  48.249 +
  48.250 +	/* Since fcntl locks don't nest, we do a lock for the first one,
  48.251 +	   and simply bump the count for future ones */
  48.252 +	if (tdb->locked[list+1].count == 0) {
  48.253 +		if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
  48.254 +			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n", 
  48.255 +					   list, ltype, strerror(errno)));
  48.256 +			return -1;
  48.257 +		}
  48.258 +		tdb->locked[list+1].ltype = ltype;
  48.259 +	}
  48.260 +	tdb->locked[list+1].count++;
  48.261 +	return 0;
  48.262 +}
  48.263 +
  48.264 +/* Release one level of the lock on `list` (-1 is the alloc list).  Returns
  48.265 +   0 on success and non-zero on error (it used to return void; changed
  48.266 +   because it may be interesting to know there has been an error  --simo) */
  48.267 +static int tdb_unlock(TDB_CONTEXT *tdb, int list,
  48.268 +		      int ltype __attribute__((unused)))
  48.269 +{
  48.270 +	int ret = -1;
  48.271 +
  48.272 +	if (tdb->flags & TDB_NOLOCK)
  48.273 +		return 0;
  48.274 +
  48.275 +	/* Sanity checks */
  48.276 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  48.277 +		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
  48.278 +		return ret;
  48.279 +	}
  48.280 +
  48.281 +	if (tdb->locked[list+1].count==0) {
  48.282 +		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
  48.283 +		return ret;
  48.284 +	}
  48.285 +
  48.286 +	if (tdb->locked[list+1].count == 1) {
  48.287 +		/* Down to last nested lock: unlock underneath */
  48.288 +		ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
  48.289 +	} else {
  48.290 +		ret = 0;
  48.291 +	}
  48.292 +	tdb->locked[list+1].count--;
  48.293 +	/* NB: the count has been dropped even if the unlock above failed. */
  48.294 +	if (ret)
  48.295 +		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n")); 
  48.296 +	return ret;
  48.297 +}
  48.298 +
  48.299 +/* Default key hash: this is based on the hash algorithm from gdbm */
  48.300 +static u32 default_tdb_hash(TDB_DATA *key)
  48.301 +{
  48.302 +	u32 value;	/* Used to compute the hash value.  */
  48.303 +	u32   i;	/* Index of the key byte being mixed in. */
  48.304 +
  48.305 +	/* Set the initial value from the key size. */
  48.306 +	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
  48.307 +		value = (value + (key->dptr[i] << (i*5 % 24)));
  48.308 +	/* Final linear scramble of the accumulated value. */
  48.309 +	return (1103515243 * value + 12345);  
  48.310 +}
  48.311 +
  48.312 +/* check for an out of bounds access - if it is out of bounds then
  48.313 +   see if the database has been expanded by someone else and remap
  48.314 +   if necessary.
  48.315 +   "len" is the minimum length needed for the db; a non-zero "probe"
  48.316 +   suppresses the error log.  Returns 0 if len is within the db. */
  48.317 +static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
  48.318 +{
  48.319 +	struct stat st;
  48.320 +	if (len <= tdb->map_size)
  48.321 +		return 0;
  48.322 +	if (tdb->flags & TDB_INTERNAL) {
  48.323 +		if (!probe) {
  48.324 +			/* Ensure ecode is set for log fn. */
  48.325 +			tdb->ecode = TDB_ERR_IO;
  48.326 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
  48.327 +				 (int)len, (int)tdb->map_size));
  48.328 +		}
  48.329 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.330 +	}
  48.331 +	/* The file may have grown since map_size was recorded: ask the fd. */
  48.332 +	if (fstat(tdb->fd, &st) == -1)
  48.333 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.334 +
  48.335 +	if (st.st_size < (off_t)len) {
  48.336 +		if (!probe) {
  48.337 +			/* Ensure ecode is set for log fn. */
  48.338 +			tdb->ecode = TDB_ERR_IO;
  48.339 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
  48.340 +				 (int)len, (int)st.st_size));
  48.341 +		}
  48.342 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.343 +	}
  48.344 +
  48.345 +	/* Unmap, update size, remap */
  48.346 +	if (tdb_munmap(tdb) == -1)
  48.347 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.348 +	tdb->map_size = st.st_size;
  48.349 +	tdb_mmap(tdb);
  48.350 +	return 0;
  48.351 +}
  48.352 +
  48.353 +/* write len bytes from buf at offset off: via the mmap if present, else the fd */
  48.354 +static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
  48.355 +{
  48.356 +	if (tdb_oob(tdb, off + len, 0) != 0)
  48.357 +		return -1;
  48.358 +	/* The else-branch below is chosen at compile time by HAVE_PWRITE. */
  48.359 +	if (tdb->map_ptr)
  48.360 +		memcpy(off + (char *)tdb->map_ptr, buf, len);
  48.361 +#ifdef HAVE_PWRITE
  48.362 +	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
  48.363 +#else
  48.364 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  48.365 +		 || write(tdb->fd, buf, len) != (off_t)len) {
  48.366 +#endif
  48.367 +		/* Ensure ecode is set for log fn. */
  48.368 +		tdb->ecode = TDB_ERR_IO;
  48.369 +		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
  48.370 +			   off, len, strerror(errno)));
  48.371 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.372 +	}
  48.373 +	return 0;
  48.374 +}
  48.375 +
  48.376 +/* read len bytes at offset off into buf; if cv is set, byte-swap the result */
  48.377 +static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
  48.378 +{
  48.379 +	if (tdb_oob(tdb, off + len, 0) != 0)
  48.380 +		return -1;
  48.381 +	/* The else-branch below is chosen at compile time by HAVE_PREAD. */
  48.382 +	if (tdb->map_ptr)
  48.383 +		memcpy(buf, off + (char *)tdb->map_ptr, len);
  48.384 +#ifdef HAVE_PREAD
  48.385 +	else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
  48.386 +#else
  48.387 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  48.388 +		 || read(tdb->fd, buf, len) != (off_t)len) {
  48.389 +#endif
  48.390 +		/* Ensure ecode is set for log fn. */
  48.391 +		tdb->ecode = TDB_ERR_IO;
  48.392 +		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
  48.393 +			   off, len, strerror(errno)));
  48.394 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  48.395 +	}
  48.396 +	if (cv)
  48.397 +		convert(buf, len);
  48.398 +	return 0;
  48.399 +}
  48.400 +
  48.401 +/* Compare key against the bytes stored at off without allocating memory
  48.402 +   (used in tdb_delete path).  Returns 1 when equal, 0 when different and
  48.403 +   -1 when the stored bytes cannot be read. */
  48.404 +static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
  48.405 +{
  48.406 +	char chunk[64];
  48.407 +	u32 n;
  48.408 +
  48.409 +	if (tdb_oob(tdb, off + key.dsize, 0) != 0)
  48.410 +		return -1;
  48.411 +
  48.412 +	if (tdb->map_ptr)
  48.413 +		return memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize) == 0;
  48.414 +
  48.415 +	/* No mapping: read the stored key through a small stack buffer. */
  48.416 +	for (; key.dsize; key.dptr += n, key.dsize -= n, off += n) {
  48.417 +		n = key.dsize;
  48.418 +		if (n > sizeof(chunk))
  48.419 +			n = sizeof(chunk);
  48.420 +		if (tdb_read(tdb, off, chunk, n, 0) != 0)
  48.421 +			return -1;
  48.422 +		if (memcmp(chunk, key.dptr, n) != 0)
  48.423 +			return 0;
  48.424 +	}
  48.425 +	return 1;
  48.426 +}
  48.427 +
  48.428 +/* Read len bytes at offset into a fresh talloc_size(tdb, len) buffer.
  48.429 +   Returns the buffer; a failed read frees it and returns NULL. */
  48.430 +static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
  48.431 +{
  48.432 +	char *data = talloc_size(tdb, len);
  48.433 +
  48.434 +	if (data == NULL) {
  48.435 +		/* Ensure ecode is set for log fn. */
  48.436 +		tdb->ecode = TDB_ERR_OOM;
  48.437 +		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
  48.438 +			   len, strerror(errno)));
  48.439 +		return TDB_ERRCODE(TDB_ERR_OOM, data);
  48.440 +	}
  48.441 +	if (tdb_read(tdb, offset, data, len, 0) != -1)
  48.442 +		return data;
  48.443 +	SAFE_FREE(data);
  48.444 +	return NULL;
  48.445 +}
  48.446 +
  48.447 +/* Read the tdb_off stored at offset into *d, converting it if DOCONV(). */
  48.448 +static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  48.449 +{
  48.450 +	return tdb_read(tdb, offset, d, sizeof(tdb_off), DOCONV());
  48.451 +}
  48.452 +static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  48.453 +{
  48.454 +	tdb_off copy = *d;	/* CONVERT may byte-swap in place; leave *d alone */
  48.455 +	return tdb_write(tdb, offset, CONVERT(copy), sizeof(copy));
  48.456 +}
  48.457 +
  48.458 +/* Read the record header at offset into *rec, rejecting bad magic values. */
  48.459 +static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  48.460 +{
  48.461 +	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
  48.462 +		return -1;
  48.463 +	/* With good magic, the result is whether rec->next is within the db. */
  48.464 +	if (!TDB_BAD_MAGIC(rec))
  48.465 +		return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
  48.466 +	/* Ensure ecode is set for log fn. */
  48.467 +	tdb->ecode = TDB_ERR_CORRUPT;
  48.468 +	TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
  48.469 +	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  48.470 +}
  48.471 +static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  48.472 +{
  48.473 +	struct list_struct copy = *rec;	/* CONVERT may byte-swap in place */
  48.474 +	return tdb_write(tdb, offset, CONVERT(copy), sizeof(copy));
  48.475 +}
  48.476 +
  48.477 +/* read a freelist record into *rec and check for simple errors; 0 if it is ok */
  48.478 +static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
  48.479 +{
  48.480 +	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
  48.481 +		return -1;
  48.482 +	if (rec->magic == TDB_MAGIC) {
  48.483 +		/* this happens when an app is shut down while deleting a record - we
  48.484 +		   should not completely fail when this happens */
  48.485 +		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n", 
  48.486 +			 rec->magic, off));
  48.487 +		rec->magic = TDB_FREE_MAGIC;
  48.488 +		/* *rec was converted on read, so write it back through
  48.489 +		   rec_write(), which byte-swaps for TDB_CONVERT databases */
  48.490 +		if (rec_write(tdb, off, rec) == -1)
  48.491 +			return -1;
  48.492 +	}
  48.493 +	if (rec->magic != TDB_FREE_MAGIC) {
  48.494 +		/* Ensure ecode is set for log fn. */
  48.495 +		tdb->ecode = TDB_ERR_CORRUPT;
  48.496 +		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n", 
  48.497 +			   rec->magic, off));
  48.498 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  48.499 +	}
  48.500 +	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
  48.501 +		return -1;
  48.502 +	return 0;
  48.503 +}
  48.504 +
  48.505 +/* Store the record's total size in its tailer (must hold allocation
  48.506 +   lock).  Returns the result of ofs_write(). */
  48.507 +static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
  48.508 +			 const struct list_struct *rec)
  48.509 +{
  48.510 +	/* Header plus rec_len bytes; the tailer is the last tdb_off of that. */
  48.511 +	tdb_off total = sizeof(*rec) + rec->rec_len;
  48.512 +	tdb_off tailer_at = offset + total - sizeof(tdb_off);
  48.513 +
  48.514 +	return ofs_write(tdb, tailer_at, &total);
  48.515 +}
  48.516 +/* Print the record header at offset and check it against its tailer.
  48.517 +   Returns the record's next offset, or 0 if the header cannot be read. */
  48.518 +static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
  48.519 +{
  48.520 +	struct list_struct hdr;
  48.521 +	tdb_off tail_pos, tail;
  48.522 +
  48.523 +	if (tdb_read(tdb, offset, (char *)&hdr, sizeof(hdr), DOCONV()) == -1) {
  48.524 +		printf("ERROR: failed to read record at %u\n", offset);
  48.525 +		return 0;
  48.526 +	}
  48.527 +
  48.528 +	printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
  48.529 +	       offset, hdr.next, hdr.rec_len, hdr.key_len, hdr.data_len, hdr.full_hash, hdr.magic);
  48.530 +
  48.531 +	/* The tailer is the last tdb_off inside the record. */
  48.532 +	tail_pos = offset + sizeof(hdr) + hdr.rec_len - sizeof(tdb_off);
  48.533 +	if (ofs_read(tdb, tail_pos, &tail) == -1) {
  48.534 +		printf("ERROR: failed to read tailer at %u\n", tail_pos);
  48.535 +	} else if (tail != hdr.rec_len + sizeof(hdr)) {
  48.536 +		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
  48.537 +				(unsigned int)tail, (unsigned int)(hdr.rec_len + sizeof(hdr)));
  48.538 +	}
  48.539 +
  48.540 +	return hdr.next;
  48.541 +}
  48.542 +/* Dump hash chain i under its lock; i == -1 selects the freelist. */
  48.543 +static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
  48.544 +{
  48.545 +	tdb_off rec_ptr, top;
  48.546 +
  48.547 +	/* TDB_HASH_TOP() reduces its argument modulo hash_size, so it cannot
  48.548 +	   map -1 onto the freelist head: pick FREELIST_TOP explicitly. */
  48.549 +	top = (i == -1) ? FREELIST_TOP : TDB_HASH_TOP(i);
  48.550 +
  48.551 +	if (tdb_lock(tdb, i, F_WRLCK) != 0)
  48.552 +		return -1;
  48.553 +
  48.554 +	if (ofs_read(tdb, top, &rec_ptr) == -1)
  48.555 +		return tdb_unlock(tdb, i, F_WRLCK);
  48.556 +
  48.557 +	if (rec_ptr)
  48.558 +		printf("hash=%d\n", i);
  48.559 +
  48.560 +	while (rec_ptr)
  48.561 +		rec_ptr = tdb_dump_record(tdb, rec_ptr);
  48.562 +	return tdb_unlock(tdb, i, F_WRLCK);
  48.563 +}
  48.564 +/* Dump every hash chain, then the freelist, using printf. */
  48.565 +void tdb_dump_all(TDB_CONTEXT *tdb)
  48.566 +{
  48.567 +	unsigned int chain = 0;
  48.568 +
  48.569 +	while (chain < tdb->header.hash_size)
  48.570 +		tdb_dump_chain(tdb, chain++);
  48.571 +	printf("freelist:\n");
  48.572 +	tdb_dump_chain(tdb, -1);
  48.573 +}
  48.574 +/* Walk the freelist under its lock, printing each entry and the total rec_len. */
  48.575 +int tdb_printfreelist(TDB_CONTEXT *tdb)
  48.576 +{
  48.577 +	int ret;
  48.578 +	long total_free = 0;
  48.579 +	tdb_off offset, rec_ptr;
  48.580 +	struct list_struct rec;
  48.581 +
  48.582 +	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
  48.583 +		return ret;
  48.584 +
  48.585 +	offset = FREELIST_TOP;
  48.586 +
  48.587 +	/* read in the freelist top; NB a failure here returns 0, not -1 */
  48.588 +	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
  48.589 +		tdb_unlock(tdb, -1, F_WRLCK);
  48.590 +		return 0;
  48.591 +	}
  48.592 +
  48.593 +	printf("freelist top=[0x%08x]\n", rec_ptr );
  48.594 +	while (rec_ptr) {
  48.595 +		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
  48.596 +			tdb_unlock(tdb, -1, F_WRLCK);
  48.597 +			return -1;
  48.598 +		}
  48.599 +		/* every entry on this list must carry the free magic */
  48.600 +		if (rec.magic != TDB_FREE_MAGIC) {
  48.601 +			printf("bad magic 0x%08x in free list\n", rec.magic);
  48.602 +			tdb_unlock(tdb, -1, F_WRLCK);
  48.603 +			return -1;
  48.604 +		}
  48.605 +
  48.606 +		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n", 
  48.607 +		       rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
  48.608 +		total_free += rec.rec_len;
  48.609 +
  48.610 +		/* move to the next record */
  48.611 +		rec_ptr = rec.next;
  48.612 +	}
  48.613 +	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free, 
  48.614 +               (int)total_free);
  48.615 +
  48.616 +	return tdb_unlock(tdb, -1, F_WRLCK);
  48.617 +}
  48.618 +
  48.619 +/* Unlink the free record at off, whose successor is next, from the
  48.620 +   freelist.  Must have alloc lock. */
  48.621 +static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
  48.622 +{
  48.623 +	tdb_off slot = FREELIST_TOP;	/* where the current "next" offset lives */
  48.624 +	tdb_off cur;
  48.625 +	for (;;) {
  48.626 +		if (ofs_read(tdb, slot, &cur) == -1 || cur == 0)
  48.627 +			break;
  48.628 +		/* Found it: make the predecessor point past it. */
  48.629 +		if (cur == off)
  48.630 +			return ofs_write(tdb, slot, &next);
  48.631 +		/* Follow chain (next offset is at start of record) */
  48.632 +		slot = cur;
  48.633 +	}
  48.634 +	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
  48.635 +	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  48.636 +}
  48.637 +
  48.638 +/* Add an element into the freelist. Merge adjacent records if
  48.639 +   necessary. */
  48.640 +static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  48.641 +{
  48.642 +	tdb_off right, left;
  48.643 +
  48.644 +	/* Allocation and tailer lock */
  48.645 +	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
  48.646 +		return -1;
  48.647 +
  48.648 +	/* set an initial tailer, so if we fail we don't leave a bogus record */
  48.649 +	if (update_tailer(tdb, offset, rec) != 0) {
  48.650 +		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
  48.651 +		goto fail;
  48.652 +	}
  48.653 +
  48.654 +	/* Look right first (I'm an Australian, dammit) */
  48.655 +	right = offset + sizeof(*rec) + rec->rec_len;
  48.656 +	if (right + sizeof(*rec) <= tdb->map_size) {
  48.657 +		struct list_struct r;
  48.658 +
  48.659 +		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
  48.660 +			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
  48.661 +			goto left;
  48.662 +		}
  48.663 +
  48.664 +		/* If it's free, expand to include it. */
  48.665 +		if (r.magic == TDB_FREE_MAGIC) {
  48.666 +			if (remove_from_freelist(tdb, right, r.next) == -1) {
  48.667 +				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
  48.668 +				goto left;
  48.669 +			}
  48.670 +			rec->rec_len += sizeof(r) + r.rec_len;
  48.671 +		}
  48.672 +	}
  48.673 +
  48.674 +left:
  48.675 +	/* Look left */
  48.676 +	left = offset - sizeof(tdb_off);
  48.677 +	if (left > TDB_DATA_START(tdb->header.hash_size)) {
  48.678 +		struct list_struct l;
  48.679 +		tdb_off leftsize;
  48.680 +		
  48.681 +		/* Read in tailer and jump back to header */
  48.682 +		if (ofs_read(tdb, left, &leftsize) == -1) {
  48.683 +			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
  48.684 +			goto update;
  48.685 +		}
  48.686 +		left = offset - leftsize;
  48.687 +
  48.688 +		/* Now read in record */
  48.689 +		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
  48.690 +			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
  48.691 +			goto update;
  48.692 +		}
  48.693 +
  48.694 +		/* If it's free, expand to include it. */
  48.695 +		if (l.magic == TDB_FREE_MAGIC) {
  48.696 +			if (remove_from_freelist(tdb, left, l.next) == -1) {
  48.697 +				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
  48.698 +				goto update;
  48.699 +			} else {
  48.700 +				offset = left;
  48.701 +				rec->rec_len += leftsize;
  48.702 +			}
  48.703 +		}
  48.704 +	}
  48.705 +
  48.706 +update:
  48.707 +	if (update_tailer(tdb, offset, rec) == -1) {
  48.708 +		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
  48.709 +		goto fail;
  48.710 +	}
  48.711 +
  48.712 +	/* Now, prepend to free list */
  48.713 +	rec->magic = TDB_FREE_MAGIC;
  48.714 +
  48.715 +	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
  48.716 +	    rec_write(tdb, offset, rec) == -1 ||
  48.717 +	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
  48.718 +		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
  48.719 +		goto fail;
  48.720 +	}
  48.721 +
  48.722 +	/* And we're done. */
  48.723 +	tdb_unlock(tdb, -1, F_WRLCK);
  48.724 +	return 0;
  48.725 +
  48.726 + fail:
  48.727 +	tdb_unlock(tdb, -1, F_WRLCK);
  48.728 +	return -1;
  48.729 +}
  48.730 +
  48.731 +
  48.732 +/* expand a file.  we prefer to use ftruncate, as that is what posix
  48.733 +  says to use for mmap expansion.  Grows the file behind tdb->fd from size to size+addition bytes, then overwrites the added region with 0x42 bytes.  Returns 0 on success, -1 on failure. */
  48.734 +static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
  48.735 +{
  48.736 +	char buf[1024]; /* fill pattern buffer, written repeatedly in the loop below */
  48.737 +#if HAVE_FTRUNCATE_EXTEND
  48.738 +	if (ftruncate(tdb->fd, size+addition) != 0) {
  48.739 +		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n", 
  48.740 +			   size+addition, strerror(errno)));
  48.741 +		return -1;
  48.742 +	}
  48.743 +#else
  48.744 +	char b = 0; /* single byte written at the new last position to extend the file */
  48.745 +
  48.746 +#ifdef HAVE_PWRITE
  48.747 +	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
  48.748 +#else
  48.749 +	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 || 
  48.750 +	    write(tdb->fd, &b, 1) != 1) {
  48.751 +#endif
  48.752 +		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n", 
  48.753 +			   size+addition, strerror(errno)));
  48.754 +		return -1;
  48.755 +	}
  48.756 +#endif
  48.757 +
  48.758 +	/* now fill the file with something. This ensures that the file isn't sparse, which would be
  48.759 +	   very bad if we ran out of disk. This must be done with write, not via mmap */
  48.760 +	memset(buf, 0x42, sizeof(buf));
  48.761 +	while (addition) {
  48.762 +		int n = addition>sizeof(buf)?sizeof(buf):addition; /* chunk size: at most one buffer per write */
  48.763 +#ifdef HAVE_PWRITE
  48.764 +		int ret = pwrite(tdb->fd, buf, n, size);
  48.765 +#else
  48.766 +		int ret;
  48.767 +		if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
  48.768 +			return -1;
  48.769 +		ret = write(tdb->fd, buf, n);
  48.770 +#endif
  48.771 +		if (ret != n) { /* a short write is treated as failure */
  48.772 +			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n", 
  48.773 +				   n, strerror(errno)));
  48.774 +			return -1;
  48.775 +		}
  48.776 +		addition -= n;
  48.777 +		size += n; /* size doubles as the current write position */
  48.778 +	}
  48.779 +	return 0;
  48.780 +}
  48.781 +
  48.782 +
  48.783 +/* expand the database at least size bytes by expanding the underlying
  48.784 +   file and doing the mmap again if necessary.  The added space is handed to tdb_free() as one free record.  Holds the allocation lock throughout.  Returns 0 on success, -1 on failure. */
  48.785 +static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
  48.786 +{
  48.787 +	struct list_struct rec;
  48.788 +	tdb_off offset;
  48.789 +
  48.790 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
  48.791 +		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
  48.792 +		return -1;
  48.793 +	}
  48.794 +
  48.795 +	/* must know about any previous expansions by another process */
  48.796 +	tdb_oob(tdb, tdb->map_size + 1, 1); /* return value deliberately ignored; presumably this refreshes map_size - confirm in tdb_oob */
  48.797 +
  48.798 +	/* always make room for at least 10 more records, and round
  48.799 +           the database up to a multiple of TDB_PAGE_SIZE */
  48.800 +	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size; /* from here on size is the number of bytes actually added */
  48.801 +
  48.802 +	if (!(tdb->flags & TDB_INTERNAL))
  48.803 +		tdb_munmap(tdb);
  48.804 +
  48.805 +	/*
  48.806 +	 * We must ensure the file is unmapped before doing this
  48.807 +	 * to ensure consistency with systems like OpenBSD where
  48.808 +	 * writes and mmaps are not consistent.
  48.809 +	 */
  48.810 +
  48.811 +	/* expand the file itself */
  48.812 +	if (!(tdb->flags & TDB_INTERNAL)) {
  48.813 +		if (expand_file(tdb, tdb->map_size, size) != 0)
  48.814 +			goto fail;
  48.815 +	}
  48.816 +
  48.817 +	tdb->map_size += size;
  48.818 +
  48.819 +	if (tdb->flags & TDB_INTERNAL) {
  48.820 +		char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
  48.821 +							tdb->map_size);
  48.822 +		if (!new_map_ptr) {
  48.823 +			tdb->map_size -= size; /* undo the size bump; map_ptr is left untouched */
  48.824 +			goto fail;
  48.825 +		}
  48.826 +		tdb->map_ptr = new_map_ptr;
  48.827 +	} else {
  48.828 +		/*
  48.829 +		 * We must ensure the file is remapped before adding the space
  48.830 +		 * to ensure consistency with systems like OpenBSD where
  48.831 +		 * writes and mmaps are not consistent.
  48.832 +		 */
  48.833 +
  48.834 +		/* We're ok if the mmap fails as we'll fallback to read/write */
  48.835 +		tdb_mmap(tdb);
  48.836 +	}
  48.837 +
  48.838 +	/* form a new freelist record */
  48.839 +	memset(&rec,'\0',sizeof(rec));
  48.840 +	rec.rec_len = size - sizeof(rec); /* payload is the added space minus its own header */
  48.841 +
  48.842 +	/* link it into the free list */
  48.843 +	offset = tdb->map_size - size; /* start of the newly added region */
  48.844 +	if (tdb_free(tdb, offset, &rec) == -1)
  48.845 +		goto fail;
  48.846 +
  48.847 +	tdb_unlock(tdb, -1, F_WRLCK);
  48.848 +	return 0;
  48.849 + fail:
  48.850 +	tdb_unlock(tdb, -1, F_WRLCK);
  48.851 +	return -1;
  48.852 +}
  48.853 +
  48.854 +
  48.855 +/* 
  48.856 +   the core of tdb_allocate - called when we have decided which
  48.857 +   free list entry to use.  rec is the free record at rec_ptr and last_ptr is the offset whose link points at it.  Splits off and frees the unused remainder when it is large enough.  Returns rec_ptr on success, 0 on failure.
  48.858 + */
  48.859 +static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
  48.860 +				struct list_struct *rec, tdb_off last_ptr)
  48.861 +{
  48.862 +	struct list_struct newrec;
  48.863 +	tdb_off newrec_ptr;
  48.864 +
  48.865 +	memset(&newrec, '\0', sizeof(newrec));
  48.866 +
  48.867 +	/* found it - now possibly split it up  */
  48.868 +	if (rec->rec_len > length + MIN_REC_SIZE) { /* split only when the leftover exceeds MIN_REC_SIZE */
  48.869 +		/* Length of left piece */
  48.870 +		length = TDB_ALIGN(length, TDB_ALIGNMENT);
  48.871 +		
  48.872 +		/* Right piece to go on free list */
  48.873 +		newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
  48.874 +		newrec_ptr = rec_ptr + sizeof(*rec) + length;
  48.875 +		
  48.876 +		/* And left record is shortened */
  48.877 +		rec->rec_len = length;
  48.878 +	} else {
  48.879 +		newrec_ptr = 0; /* 0 means no split happened; tested below */
  48.880 +	}
  48.881 +	
  48.882 +	/* Remove allocated record from the free list */
  48.883 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
  48.884 +		return 0;
  48.885 +	}
  48.886 +	
  48.887 +	/* Update header: do this before we drop alloc
  48.888 +	   lock, otherwise tdb_free() might try to
  48.889 +	   merge with us, thinking we're free.
  48.890 +	   (Thanks Jeremy Allison). */
  48.891 +	rec->magic = TDB_MAGIC;
  48.892 +	if (rec_write(tdb, rec_ptr, rec) == -1) {
  48.893 +		return 0;
  48.894 +	}
  48.895 +	
  48.896 +	/* Did we create new block? */
  48.897 +	if (newrec_ptr) {
  48.898 +		/* Update allocated record tailer (we
  48.899 +		   shortened it). */
  48.900 +		if (update_tailer(tdb, rec_ptr, rec) == -1) {
  48.901 +			return 0;
  48.902 +		}
  48.903 +		
  48.904 +		/* Free new record */
  48.905 +		if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
  48.906 +			return 0;
  48.907 +		}
  48.908 +	}
  48.909 +	
  48.910 +	/* all done - return the new record offset */
  48.911 +	return rec_ptr;
  48.912 +}
  48.913 +
  48.914 +/* allocate some space from the free list. The offset returned points
  48.915 +   to a unconnected list_struct within the database with room for at
  48.916 +   least length bytes of total data
  48.917 +
  48.918 +   0 is returned if the space could not be allocated.  rec is used as scratch while scanning and holds the header of the chosen record on success.  When nothing fits, the database is expanded and the scan is retried.
  48.919 + */
  48.920 +static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
  48.921 +			    struct list_struct *rec)
  48.922 +{
  48.923 +	tdb_off rec_ptr, last_ptr, newrec_ptr;
  48.924 +	struct {
  48.925 +		tdb_off rec_ptr, last_ptr;
  48.926 +		tdb_len rec_len;
  48.927 +	} bestfit = { 0, 0, 0 }; /* smallest adequate free record seen so far; rec_ptr == 0 means none */
  48.928 +
  48.929 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
  48.930 +		return 0;
  48.931 +
  48.932 +	/* Extra bytes required for tailer */
  48.933 +	length += sizeof(tdb_off);
  48.934 +
  48.935 + again:
  48.936 +	last_ptr = FREELIST_TOP;
  48.937 +
  48.938 +	/* read in the freelist top */
  48.939 +	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
  48.940 +		goto fail;
  48.941 +
  48.942 +	bestfit.rec_ptr = 0; /* reset for each pass, including retries after an expand */
  48.943 +
  48.944 +	/* 
  48.945 +	   this is a best fit allocation strategy. Originally we used
  48.946 +	   a first fit strategy, but it suffered from massive fragmentation
  48.947 +	   issues when faced with a slowly increasing record size.
  48.948 +	 */
  48.949 +	while (rec_ptr) {
  48.950 +		if (rec_free_read(tdb, rec_ptr, rec) == -1) {
  48.951 +			goto fail;
  48.952 +		}
  48.953 +
  48.954 +		if (rec->rec_len >= length) {
  48.955 +			if (bestfit.rec_ptr == 0 ||
  48.956 +			    rec->rec_len < bestfit.rec_len) {
  48.957 +				bestfit.rec_len = rec->rec_len;
  48.958 +				bestfit.rec_ptr = rec_ptr;
  48.959 +				bestfit.last_ptr = last_ptr; /* remembered so the chosen record can be unlinked later */
  48.960 +				/* consider a fit to be good enough if we aren't wasting more than half the space */
  48.961 +				if (bestfit.rec_len < 2*length) {
  48.962 +					break;
  48.963 +				}
  48.964 +			}
  48.965 +		}
  48.966 +
  48.967 +		/* move to the next record */
  48.968 +		last_ptr = rec_ptr;
  48.969 +		rec_ptr = rec->next;
  48.970 +	}
  48.971 +
  48.972 +	if (bestfit.rec_ptr != 0) {
  48.973 +		if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) { /* re-read: rec may hold a later record from the scan */
  48.974 +			goto fail;
  48.975 +		}
  48.976 +
  48.977 +		newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
  48.978 +		tdb_unlock(tdb, -1, F_WRLCK);
  48.979 +		return newrec_ptr;
  48.980 +	}
  48.981 +
  48.982 +	/* we didn't find enough space. See if we can expand the
  48.983 +	   database and if we can then try again */
  48.984 +	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
  48.985 +		goto again;
  48.986 + fail:
  48.987 +	tdb_unlock(tdb, -1, F_WRLCK);
  48.988 +	return 0;
  48.989 +}
  48.990 +
  48.991 +/* initialise a new database with a specified hash size.  Builds a zeroed header plus hash_size+1 offset slots in memory.  For TDB_INTERNAL that buffer becomes tdb->map_ptr; otherwise it is written to a truncated tdb->fd and then freed.  Returns 0 on success, -1 on failure. */
  48.992 +static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
  48.993 +{
  48.994 +	struct tdb_header *newdb;
  48.995 +	int size, ret = -1;
  48.996 +
  48.997 +	/* We make it up in memory, then write it out if not internal */
  48.998 +	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
  48.999 +	if (!(newdb = talloc_zero_size(tdb, size)))
 48.1000 +		return TDB_ERRCODE(TDB_ERR_OOM, -1);
 48.1001 +
 48.1002 +	/* Fill in the header */
 48.1003 +	newdb->version = TDB_VERSION;
 48.1004 +	newdb->hash_size = hash_size;
 48.1005 +	if (tdb->flags & TDB_INTERNAL) {
 48.1006 +		tdb->map_size = size;
 48.1007 +		tdb->map_ptr = (char *)newdb; /* the buffer is kept as the in-memory database, so it is not freed here */
 48.1008 +		memcpy(&tdb->header, newdb, sizeof(tdb->header));
 48.1009 +		/* Convert the `ondisk' version if asked. */
 48.1010 +		CONVERT(*newdb);
 48.1011 +		return 0;
 48.1012 +	}
 48.1013 +	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 48.1014 +		goto fail;
 48.1015 +
 48.1016 +	if (ftruncate(tdb->fd, 0) == -1)
 48.1017 +		goto fail;
 48.1018 +
 48.1019 +	/* This creates an endian-converted header, as if read from disk */
 48.1020 +	CONVERT(*newdb);
 48.1021 +	memcpy(&tdb->header, newdb, sizeof(tdb->header));
 48.1022 +	/* Don't endian-convert the magic food! */
 48.1023 +	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 48.1024 +	if (write(tdb->fd, newdb, size) != size) /* a short write counts as failure */
 48.1025 +		ret = -1;
 48.1026 +	else
 48.1027 +		ret = 0;
 48.1028 +
 48.1029 +  fail:
 48.1030 +	SAFE_FREE(newdb);
 48.1031 +	return ret;
 48.1032 +}
 48.1033 +
 48.1034 +/* Returns 0 on fail.  On success, return offset of record, and fills
 48.1035 +   in rec.  Walks the chain for hash, skipping dead records; no locking is done here.  A missing key is reported through TDB_ERRCODE(TDB_ERR_NOEXIST, 0). */
 48.1036 +static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 48.1037 +			struct list_struct *r)
 48.1038 +{
 48.1039 +	tdb_off rec_ptr;
 48.1040 +	
 48.1041 +	/* read in the hash top */
 48.1042 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 48.1043 +		return 0;
 48.1044 +
 48.1045 +	/* keep looking until we find the right record */
 48.1046 +	while (rec_ptr) {
 48.1047 +		if (rec_read(tdb, rec_ptr, r) == -1)
 48.1048 +			return 0;
 48.1049 +
 48.1050 +		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) { /* cheap header checks before reading the key bytes */
 48.1051 +			/* a very likely hit - read the key */
 48.1052 +			int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
 48.1053 +			if (cmp < 0) /* negative is treated as an error */
 48.1054 +				return 0;
 48.1055 +			else if (cmp > 0) /* positive means the stored key matches */
 48.1056 +				return rec_ptr;
 48.1057 +		}
 48.1058 +		rec_ptr = r->next;
 48.1059 +	}
 48.1060 +	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 48.1061 +}
 48.1062 +
 48.1063 +/* As tdb_find, but if you succeed, keep the lock.  Locks chain BUCKET(hash) with locktype first; when the lookup fails that lock is released and 0 is returned. */
 48.1064 +static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
 48.1065 +			     struct list_struct *rec)
 48.1066 +{
 48.1067 +	u32 rec_ptr;
 48.1068 +
 48.1069 +	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 48.1070 +		return 0;
 48.1071 +	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 48.1072 +		tdb_unlock(tdb, BUCKET(hash), locktype); /* not found or error: do not leave the chain locked */
 48.1073 +	return rec_ptr;
 48.1074 +}
 48.1075 +
 48.1076 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb) /* accessor for the error code stored in the context */
 48.1077 +{
 48.1078 +	return tdb->ecode;
 48.1079 +}
 48.1080 +
 48.1081 +static struct tdb_errname { /* table pairing each TDB_ERROR code with its message; searched linearly by tdb_errorstr() */
 48.1082 +	enum TDB_ERROR ecode; const char *estring;
 48.1083 +} emap[] = { {TDB_SUCCESS, "Success"},
 48.1084 +	     {TDB_ERR_CORRUPT, "Corrupt database"},
 48.1085 +	     {TDB_ERR_IO, "IO Error"},
 48.1086 +	     {TDB_ERR_LOCK, "Locking error"},
 48.1087 +	     {TDB_ERR_OOM, "Out of memory"},
 48.1088 +	     {TDB_ERR_EXISTS, "Record exists"},
 48.1089 +	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
 48.1090 +	     {TDB_ERR_NOEXIST, "Record does not exist"} };
 48.1091 +
 48.1092 +/* Error string for the last tdb error.  Looks tdb->ecode up in emap and returns the matching static string, or a fixed fallback string when the code is not in the table. */
 48.1093 +const char *tdb_errorstr(TDB_CONTEXT *tdb)
 48.1094 +{
 48.1095 +	u32 i;
 48.1096 +	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
 48.1097 +		if (tdb->ecode == emap[i].ecode)
 48.1098 +			return emap[i].estring;
 48.1099 +	return "Invalid error code";
 48.1100 +}
 48.1101 +
 48.1102 +/* update an entry in place - this only works if the new data size
 48.1103 +   is <= the old data size and the key exists.
 48.1104 +   on failure return -1.  When the record is simply too small, ecode is reset to TDB_SUCCESS before returning -1.  No locking is done in this function.
 48.1105 +*/
 48.1106 +
 48.1107 +static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
 48.1108 +{
 48.1109 +	struct list_struct rec;
 48.1110 +	tdb_off rec_ptr;
 48.1111 +
 48.1112 +	/* find entry */
 48.1113 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 48.1114 +		return -1;
 48.1115 +
 48.1116 +	/* must be long enough key, data and tailer */
 48.1117 +	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
 48.1118 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 48.1119 +		return -1;
 48.1120 +	}
 48.1121 +
 48.1122 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len, /* the data is stored right after the header and key */
 48.1123 +		      dbuf.dptr, dbuf.dsize) == -1)
 48.1124 +		return -1;
 48.1125 +
 48.1126 +	if (dbuf.dsize != rec.data_len) {
 48.1127 +		/* update size */
 48.1128 +		rec.data_len = dbuf.dsize;
 48.1129 +		return rec_write(tdb, rec_ptr, &rec);
 48.1130 +	}
 48.1131 + 
 48.1132 +	return 0;
 48.1133 +}
 48.1134 +
 48.1135 +/* find an entry in the database given a key.  Returns tdb_null when the lookup fails; otherwise dptr is the buffer returned by tdb_alloc_read and dsize is the stored data length. */
 48.1136 +/* If an entry doesn't exist tdb_err will be set to
 48.1137 + * TDB_ERR_NOEXIST. If a key has no data attached
 48.1138 + * then the TDB_DATA will have zero length but
 48.1139 + * a non-zero pointer
 48.1140 + */
 48.1141 +
 48.1142 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
 48.1143 +{
 48.1144 +	tdb_off rec_ptr;
 48.1145 +	struct list_struct rec;
 48.1146 +	TDB_DATA ret;
 48.1147 +	u32 hash;
 48.1148 +
 48.1149 +	/* find which hash bucket it is in */
 48.1150 +	hash = tdb->hash_fn(&key);
 48.1151 +	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 48.1152 +		return tdb_null;
 48.1153 +
 48.1154 +	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, /* data follows the header and key */
 48.1155 +				  rec.data_len);
 48.1156 +	ret.dsize = rec.data_len; /* NOTE(review): set even if the read above returned NULL - callers presumably test dptr; confirm */
 48.1157 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK); /* release the chain lock kept by tdb_find_lock_hash */
 48.1158 +	return ret;
 48.1159 +}
 48.1160 +
 48.1161 +/* check if an entry in the database exists 
 48.1162 +
 48.1163 +   note that 1 is returned if the key is found and 0 is returned if not found
 48.1164 +   this doesn't match the conventions in the rest of this module, but is
 48.1165 +   compatible with gdbm.  A failed lookup of any kind also yields 0.
 48.1166 +*/
 48.1167 +static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 48.1168 +{
 48.1169 +	struct list_struct rec;
 48.1170 +	
 48.1171 +	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 48.1172 +		return 0;
 48.1173 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK); /* found: drop the read lock that the lookup kept */
 48.1174 +	return 1;
 48.1175 +}
 48.1176 +
 48.1177 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key) /* public wrapper: hashes key with tdb->hash_fn and defers to tdb_exists_hash (1 = found, 0 = not found) */
 48.1178 +{
 48.1179 +	u32 hash = tdb->hash_fn(&key);
 48.1180 +	return tdb_exists_hash(tdb, key, hash);
 48.1181 +}
 48.1182 +
 48.1183 +/* record lock stops delete underneath.  Takes a blocking (F_SETLKW) read lock at offset off via tdb_brlock; off == 0 is a no-op that returns 0. */
 48.1184 +static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
 48.1185 +{
 48.1186 +	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
 48.1187 +}
 48.1188 +/*
 48.1189 +  Write locks override our own fcntl readlocks, so check it here.
 48.1190 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 48.1191 +  an error to fail to get the lock here.  Returns -1 without calling tdb_brlock when any entry on tdb->travlocks already refers to off.
 48.1192 +*/
 48.1193 + 
 48.1194 +static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
 48.1195 +{
 48.1196 +	struct tdb_traverse_lock *i;
 48.1197 +	for (i = &tdb->travlocks; i; i = i->next) /* scan the traverse entries of this context */
 48.1198 +		if (i->off == off)
 48.1199 +			return -1;
 48.1200 +	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
 48.1201 +}
 48.1202 +
 48.1203 +/*
 48.1204 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 48.1205 +  an error to fail to get the lock here.  Releases (F_UNLCK) the lock at offset off and returns the tdb_brlock result.
 48.1206 +*/
 48.1207 +
 48.1208 +static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 48.1209 +{
 48.1210 +	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
 48.1211 +}
 48.1212 +/* fcntl locks don't stack: avoid unlocking someone else's.  The lock at off is released only when exactly one entry on tdb->travlocks refers to it; otherwise nothing is done and 0 is returned.  off == 0 also returns 0. */
 48.1213 +static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 48.1214 +{
 48.1215 +	struct tdb_traverse_lock *i;
 48.1216 +	u32 count = 0;
 48.1217 +
 48.1218 +	if (off == 0)
 48.1219 +		return 0;
 48.1220 +	for (i = &tdb->travlocks; i; i = i->next) /* count how many traverse entries point at this record */
 48.1221 +		if (i->off == off)
 48.1222 +			count++;
 48.1223 +	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
 48.1224 +}
 48.1225 +
 48.1226 +/* actually delete an entry in the database given the offset.  If the record cannot be write-locked it is only marked dead; otherwise it is unlinked from its hash chain and passed to tdb_free().  Returns 0 on success, -1 on failure or when the database is read-only. */
 48.1227 +static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
 48.1228 +{
 48.1229 +	tdb_off last_ptr, i;
 48.1230 +	struct list_struct lastrec;
 48.1231 +
 48.1232 +	if (tdb->read_only) return -1;
 48.1233 +
 48.1234 +	if (write_lock_record(tdb, rec_ptr) == -1) {
 48.1235 +		/* Someone traversing here: mark it as dead */
 48.1236 +		rec->magic = TDB_DEAD_MAGIC;
 48.1237 +		return rec_write(tdb, rec_ptr, rec);
 48.1238 +	}
 48.1239 +	if (write_unlock_record(tdb, rec_ptr) != 0) /* the write lock was only a probe; release it straight away */
 48.1240 +		return -1;
 48.1241 +
 48.1242 +	/* find previous record in hash chain */
 48.1243 +	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 48.1244 +		return -1;
 48.1245 +	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next) /* last_ptr stays 0 when rec_ptr is the first entry in the chain */
 48.1246 +		if (rec_read(tdb, i, &lastrec) == -1)
 48.1247 +			return -1;
 48.1248 +
 48.1249 +	/* unlink it: next ptr is at start of record. */
 48.1250 +	if (last_ptr == 0)
 48.1251 +		last_ptr = TDB_HASH_TOP(rec->full_hash);
 48.1252 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 48.1253 +		return -1;
 48.1254 +
 48.1255 +	/* recover the space */
 48.1256 +	if (tdb_free(tdb, rec_ptr, rec) == -1)
 48.1257 +		return -1;
 48.1258 +	return 0;
 48.1259 +}
 48.1260 +
 48.1261 +/* Uses traverse lock: 0 = finish, -1 = error, other = record offset.  On a positive return both the chain tlock->hash (F_WRLCK) and the returned record are left locked; the caller must release them.  rec receives the header of the returned record.  Dead records met on the way are deleted unless the database is read-only. */
 48.1262 +static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
 48.1263 +			 struct list_struct *rec)
 48.1264 +{
 48.1265 +	int want_next = (tlock->off != 0); /* non-zero when resuming after a previously returned record */
 48.1266 +
 48.1267 +	/* Lock each chain from the start one. */
 48.1268 +	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
 48.1269 +
 48.1270 +		/* this is an optimisation for the common case where
 48.1271 +		   the hash chain is empty, which is particularly
 48.1272 +		   common for the use of tdb with ldb, where large
 48.1273 +		   hashes are used. In that case we spend most of our
 48.1274 +		   time in tdb_brlock(), locking empty hash chains.
 48.1275 +
 48.1276 +		   To avoid this, we do an unlocked pre-check to see
 48.1277 +		   if the hash chain is empty before starting to look
 48.1278 +		   inside it. If it is empty then we can avoid that
 48.1279 +		   hash chain. If it isn't empty then we can't believe
 48.1280 +		   the value we get back, as we read it without a
 48.1281 +		   lock, so instead we get the lock and re-fetch the
 48.1282 +		   value below.
 48.1283 +
 48.1284 +		   Notice that not doing this optimisation on the
 48.1285 +		   first hash chain is critical. We must guarantee
 48.1286 +		   that we have done at least one fcntl lock at the
 48.1287 +		   start of a search to guarantee that memory is
 48.1288 +		   coherent on SMP systems. If records are added by
 48.1289 +		   others during the search then thats OK, and we
 48.1290 +		   could possibly miss those with this trick, but we
 48.1291 +		   could miss them anyway without this trick, so the
 48.1292 +		   semantics don't change.
 48.1293 +
 48.1294 +		   With a non-indexed ldb search this trick gains us a
 48.1295 +		   factor of around 80 in speed on a linux 2.6.x
 48.1296 +		   system (testing using ldbtest).
 48.1297 +		 */
 48.1298 +		if (!tlock->off && tlock->hash != 0) {
 48.1299 +			u32 off;
 48.1300 +			if (tdb->map_ptr) {
 48.1301 +				for (;tlock->hash < tdb->header.hash_size;tlock->hash++) { /* skip ahead over empty chains straight from the mapping */
 48.1302 +					if (0 != *(u32 *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
 48.1303 +						break;
 48.1304 +					}
 48.1305 +				}
 48.1306 +				if (tlock->hash == tdb->header.hash_size) {
 48.1307 +					continue; /* ran off the end; the outer loop condition then ends the iteration */
 48.1308 +				}
 48.1309 +			} else {
 48.1310 +				if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
 48.1311 +				    off == 0) {
 48.1312 +					continue;
 48.1313 +				}
 48.1314 +			}
 48.1315 +		}
 48.1316 +
 48.1317 +		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
 48.1318 +			return -1;
 48.1319 +
 48.1320 +		/* No previous record?  Start at top of chain. */
 48.1321 +		if (!tlock->off) {
 48.1322 +			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
 48.1323 +				     &tlock->off) == -1)
 48.1324 +				goto fail;
 48.1325 +		} else {
 48.1326 +			/* Otherwise unlock the previous record. */
 48.1327 +			if (unlock_record(tdb, tlock->off) != 0)
 48.1328 +				goto fail;
 48.1329 +		}
 48.1330 +
 48.1331 +		if (want_next) {
 48.1332 +			/* We have offset of old record: grab next */
 48.1333 +			if (rec_read(tdb, tlock->off, rec) == -1)
 48.1334 +				goto fail;
 48.1335 +			tlock->off = rec->next;
 48.1336 +		}
 48.1337 +
 48.1338 +		/* Iterate through chain */
 48.1339 +		while( tlock->off) {
 48.1340 +			tdb_off current;
 48.1341 +			if (rec_read(tdb, tlock->off, rec) == -1)
 48.1342 +				goto fail;
 48.1343 +
 48.1344 +			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
 48.1345 +			if (tlock->off == rec->next) { /* only catches a record that points at itself */
 48.1346 +				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
 48.1347 +				goto fail;
 48.1348 +			}
 48.1349 +
 48.1350 +			if (!TDB_DEAD(rec)) {
 48.1351 +				/* Woohoo: we found one! */
 48.1352 +				if (lock_record(tdb, tlock->off) != 0)
 48.1353 +					goto fail;
 48.1354 +				return tlock->off;
 48.1355 +			}
 48.1356 +
 48.1357 +			/* Try to clean dead ones from old traverses */
 48.1358 +			current = tlock->off;
 48.1359 +			tlock->off = rec->next; /* advance before deleting, since do_delete is handed rec */
 48.1360 +			if (!tdb->read_only && 
 48.1361 +			    do_delete(tdb, current, rec) != 0)
 48.1362 +				goto fail;
 48.1363 +		}
 48.1364 +		tdb_unlock(tdb, tlock->hash, F_WRLCK);
 48.1365 +		want_next = 0; /* later chains are always walked from their top */
 48.1366 +	}
 48.1367 +	/* We finished iteration without finding anything */
 48.1368 +	return TDB_ERRCODE(TDB_SUCCESS, 0);
 48.1369 +
 48.1370 + fail:
 48.1371 +	tlock->off = 0;
 48.1372 +	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
 48.1373 +		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
 48.1374 +	return -1;
 48.1375 +}
 48.1376 +
 48.1377 +/* traverse the entire database - calling fn(tdb, key, data) on each element.
 48.1378 +   return -1 on error or the record count traversed
 48.1379 +   if fn is NULL then it is not called
 48.1380 +   a non-zero return value from fn() indicates that the traversal should stop; the count so far is returned then, or -1 if the record lock cannot be released.  private is passed through to fn unchanged.
 48.1381 +  */
 48.1382 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
 48.1383 +{
 48.1384 +	TDB_DATA key, dbuf;
 48.1385 +	struct list_struct rec;
 48.1386 +	struct tdb_traverse_lock tl = { NULL, 0, 0 };
 48.1387 +	int ret, count = 0;
 48.1388 +
 48.1389 +	/* This was in the initialization, above, but the IRIX compiler
 48.1390 +	 * did not like it.  crh
 48.1391 +	 */
 48.1392 +	tl.next = tdb->travlocks.next;
 48.1393 +
 48.1394 +	/* fcntl locks don't stack: beware traverse inside traverse */
 48.1395 +	tdb->travlocks.next = &tl; /* register this traversal so the record lock helpers can see it */
 48.1396 +
 48.1397 +	/* tdb_next_lock places locks on the record returned, and its chain */
 48.1398 +	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
 48.1399 +		count++;
 48.1400 +		/* now read the full record (key and data in one buffer) */
 48.1401 +		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec), 
 48.1402 +					  rec.key_len + rec.data_len);
 48.1403 +		if (!key.dptr) {
 48.1404 +			ret = -1;
 48.1405 +			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
 48.1406 +				goto out;
 48.1407 +			if (unlock_record(tdb, tl.off) != 0)
 48.1408 +				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
 48.1409 +			goto out;
 48.1410 +		}
 48.1411 +		key.dsize = rec.key_len;
 48.1412 +		dbuf.dptr = key.dptr + rec.key_len; /* data shares the key buffer, so only key.dptr is freed */
 48.1413 +		dbuf.dsize = rec.data_len;
 48.1414 +
 48.1415 +		/* Drop chain lock, call out */
 48.1416 +		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
 48.1417 +			ret = -1;
 48.1418 +			goto out;
 48.1419 +		}
 48.1420 +		if (fn && fn(tdb, key, dbuf, private)) {
 48.1421 +			/* They want us to terminate traversal */
 48.1422 +			ret = count;
 48.1423 +			if (unlock_record(tdb, tl.off) != 0) {
 48.1424 +				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));
 48.1425 +				ret = -1;
 48.1426 +			}
 48.1427 +			tdb->travlocks.next = tl.next;
 48.1428 +			SAFE_FREE(key.dptr);
 48.1429 +			return ret; /* was count, which silently dropped the unlock failure recorded just above */
 48.1430 +		}
 48.1431 +		SAFE_FREE(key.dptr);
 48.1432 +	}
 48.1433 +out:
 48.1434 +	tdb->travlocks.next = tl.next; /* unregister this traversal */
 48.1435 +	if (ret < 0)
 48.1436 +		return -1;
 48.1437 +	else
 48.1438 +		return count;
 48.1439 +}
 48.1440 +
 48.1441 +/* find the first entry in the database and return its key.  Uses the traverse state built into the context (tdb->travlocks).  Returns tdb_null when the database is empty or on error; otherwise key.dptr is the buffer returned by tdb_alloc_read. */
 48.1442 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
 48.1443 +{
 48.1444 +	TDB_DATA key;
 48.1445 +	struct list_struct rec;
 48.1446 +
 48.1447 +	/* release any old lock */
 48.1448 +	if (unlock_record(tdb, tdb->travlocks.off) != 0)
 48.1449 +		return tdb_null;
 48.1450 +	tdb->travlocks.off = tdb->travlocks.hash = 0; /* restart from the first chain */
 48.1451 +
 48.1452 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
 48.1453 +		return tdb_null;
 48.1454 +	/* now read the key */
 48.1455 +	key.dsize = rec.key_len;
 48.1456 +	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
 48.1457 +	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0) /* drop the chain lock; the record lock from tdb_next_lock is not released here */
 48.1458 +		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
 48.1459 +	return key;
 48.1460 +}
 48.1461 +
 48.1462 +/* find the next entry in the database, returning its key */
 48.1463 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
 48.1464 +{
 48.1465 +	u32 oldhash;
 48.1466 +	TDB_DATA key = tdb_null;
 48.1467 +	struct list_struct rec;
 48.1468 +	char *k = NULL;
 48.1469 +
 48.1470 +	/* Is locked key the old key?  If so, traverse will be reliable. */
 48.1471 +	if (tdb->travlocks.off) {
 48.1472 +		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
 48.1473 +			return tdb_null;
 48.1474 +		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
 48.1475 +		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
 48.1476 +					    rec.key_len))
 48.1477 +		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
 48.1478 +			/* No, it wasn't: unlock it and start from scratch */
 48.1479 +			if (unlock_record(tdb, tdb->travlocks.off) != 0)
 48.1480 +				return tdb_null;
 48.1481 +			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 48.1482 +				return tdb_null;
 48.1483 +			tdb->travlocks.off = 0;
 48.1484 +		}
 48.1485 +
 48.1486 +		SAFE_FREE(k);
 48.1487 +	}
 48.1488 +
 48.1489 +	if (!tdb->travlocks.off) {
 48.1490 +		/* No previous element: do normal find, and lock record */
 48.1491 +		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
 48.1492 +		if (!tdb->travlocks.off)
 48.1493 +			return tdb_null;
 48.1494 +		tdb->travlocks.hash = BUCKET(rec.full_hash);
 48.1495 +		if (lock_record(tdb, tdb->travlocks.off) != 0) {
 48.1496 +			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
 48.1497 +			return tdb_null;
 48.1498 +		}
 48.1499 +	}
 48.1500 +	oldhash = tdb->travlocks.hash;
 48.1501 +
 48.1502 +	/* Grab next record: locks chain and returned record,
 48.1503 +	   unlocks old record */
 48.1504 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
 48.1505 +		key.dsize = rec.key_len;
 48.1506 +		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
 48.1507 +					  key.dsize);
 48.1508 +		/* Unlock the chain of this new record */
 48.1509 +		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 48.1510 +			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 48.1511 +	}
 48.1512 +	/* Unlock the chain of old record */
 48.1513 +	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
 48.1514 +		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 48.1515 +	return key;
 48.1516 +}
 48.1517 +
 48.1518 +/* delete an entry in the database given a key */
 48.1519 +static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 48.1520 +{
 48.1521 +	tdb_off rec_ptr;
 48.1522 +	struct list_struct rec;
 48.1523 +	int ret;
 48.1524 +
 48.1525 +	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
 48.1526 +		return -1;
 48.1527 +	ret = do_delete(tdb, rec_ptr, &rec);
 48.1528 +	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
 48.1529 +		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
 48.1530 +	return ret;
 48.1531 +}
 48.1532 +
 48.1533 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
 48.1534 +{
 48.1535 +	u32 hash = tdb->hash_fn(&key);
 48.1536 +	return tdb_delete_hash(tdb, key, hash);
 48.1537 +}
 48.1538 +
 48.1539 +/* store an element in the database, replacing any existing element
 48.1540 +   with the same key 
 48.1541 +
 48.1542 +   return 0 on success, -1 on failure
 48.1543 +*/
 48.1544 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 48.1545 +{
 48.1546 +	struct list_struct rec;
 48.1547 +	u32 hash;
 48.1548 +	tdb_off rec_ptr;
 48.1549 +	char *p = NULL;
 48.1550 +	int ret = 0;
 48.1551 +
 48.1552 +	/* find which hash bucket it is in */
 48.1553 +	hash = tdb->hash_fn(&key);
 48.1554 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 48.1555 +		return -1;
 48.1556 +
 48.1557 +	/* check for it existing, on insert. */
 48.1558 +	if (flag == TDB_INSERT) {
 48.1559 +		if (tdb_exists_hash(tdb, key, hash)) {
 48.1560 +			tdb->ecode = TDB_ERR_EXISTS;
 48.1561 +			goto fail;
 48.1562 +		}
 48.1563 +	} else {
 48.1564 +		/* first try in-place update, on modify or replace. */
 48.1565 +		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
 48.1566 +			goto out;
 48.1567 +		if (tdb->ecode == TDB_ERR_NOEXIST &&
 48.1568 +		    flag == TDB_MODIFY) {
 48.1569 +			/* if the record doesn't exist and we are in TDB_MODIFY mode then
 48.1570 +			 we should fail the store */
 48.1571 +			goto fail;
 48.1572 +		}
 48.1573 +	}
 48.1574 +	/* reset the error code potentially set by tdb_update_hash() */
 48.1575 +	tdb->ecode = TDB_SUCCESS;
 48.1576 +
 48.1577 +	/* delete any existing record - if it doesn't exist we don't
 48.1578 +           care.  Doing this first reduces fragmentation, and avoids
 48.1579 +           coalescing with `allocated' block before it's updated. */
 48.1580 +	if (flag != TDB_INSERT)
 48.1581 +		tdb_delete_hash(tdb, key, hash);
 48.1582 +
 48.1583 +	/* Copy key+value *before* allocating free space in case malloc
 48.1584 +	   fails and we are left with a dead spot in the tdb. */
 48.1585 +
 48.1586 +	if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
 48.1587 +		tdb->ecode = TDB_ERR_OOM;
 48.1588 +		goto fail;
 48.1589 +	}
 48.1590 +
 48.1591 +	memcpy(p, key.dptr, key.dsize);
 48.1592 +	if (dbuf.dsize)
 48.1593 +		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 48.1594 +
 48.1595 +	/* we have to allocate some space */
 48.1596 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
 48.1597 +		goto fail;
 48.1598 +
 48.1599 +	/* Read hash top into next ptr */
 48.1600 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 48.1601 +		goto fail;
 48.1602 +
 48.1603 +	rec.key_len = key.dsize;
 48.1604 +	rec.data_len = dbuf.dsize;
 48.1605 +	rec.full_hash = hash;
 48.1606 +	rec.magic = TDB_MAGIC;
 48.1607 +
 48.1608 +	/* write out and point the top of the hash chain at it */
 48.1609 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 48.1610 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 48.1611 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 48.1612 +		/* Need to tdb_unallocate() here */
 48.1613 +		goto fail;
 48.1614 +	}
 48.1615 + out:
 48.1616 +	SAFE_FREE(p); 
 48.1617 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 48.1618 +	return ret;
 48.1619 +fail:
 48.1620 +	ret = -1;
 48.1621 +	goto out;
 48.1622 +}
 48.1623 +
 48.1624 +/* Attempt to append data to an entry in place - this only works if the key exists
 48.1625 +   and the record has room for the key, the old data, the new data and the tailer.
 48.1626 +   On failure return -1. Record must be locked before calling.
 48.1627 +*/
 48.1628 +static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
 48.1629 +{
 48.1630 +	struct list_struct rec;
 48.1631 +	tdb_off rec_ptr;
 48.1632 +
 48.1633 +	/* find entry */
 48.1634 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 48.1635 +		return -1;
 48.1636 +
 48.1637 +	/* Append of 0 is always ok. */
 48.1638 +	if (new_dbuf.dsize == 0)
 48.1639 +		return 0;
 48.1640 +
 48.1641 +	/* must be long enough for key, old data + new data and tailer */
 48.1642 +	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
 48.1643 +		/* No room. */
 48.1644 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 48.1645 +		return -1;
 48.1646 +	}
 48.1647 +
 48.1648 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
 48.1649 +		      new_dbuf.dptr, new_dbuf.dsize) == -1)
 48.1650 +		return -1;
 48.1651 +
 48.1652 +	/* update size */
 48.1653 +	rec.data_len += new_dbuf.dsize;
 48.1654 +	return rec_write(tdb, rec_ptr, &rec);
 48.1655 +}
 48.1656 +
 48.1657 +/* Append to an entry. Create if not exist. */
 48.1658 +
 48.1659 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 48.1660 +{
 48.1661 +	struct list_struct rec;
 48.1662 +	u32 hash;
 48.1663 +	tdb_off rec_ptr;
 48.1664 +	char *p = NULL;
 48.1665 +	int ret = 0;
 48.1666 +	size_t new_data_size = 0;
 48.1667 +
 48.1668 +	/* find which hash bucket it is in */
 48.1669 +	hash = tdb->hash_fn(&key);
 48.1670 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 48.1671 +		return -1;
 48.1672 +
 48.1673 +	/* first try in-place. */
 48.1674 +	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
 48.1675 +		goto out;
 48.1676 +
 48.1677 +	/* reset the error code potentially set by the tdb_append_inplace() */
 48.1678 +	tdb->ecode = TDB_SUCCESS;
 48.1679 +
 48.1680 +	/* find entry */
 48.1681 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
 48.1682 +		if (tdb->ecode != TDB_ERR_NOEXIST)
 48.1683 +			goto fail;
 48.1684 +
 48.1685 +		/* Not found - create. */
 48.1686 +
 48.1687 +		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
 48.1688 +		goto out;
 48.1689 +	}
 48.1690 +
 48.1691 +	new_data_size = rec.data_len + new_dbuf.dsize;
 48.1692 +
 48.1693 +	/* Copy key+old_value+value *before* allocating free space in case malloc
 48.1694 +	   fails and we are left with a dead spot in the tdb. */
 48.1695 +
 48.1696 +	if (!(p = (char *)talloc_size(tdb, key.dsize + new_data_size))) {
 48.1697 +		tdb->ecode = TDB_ERR_OOM;
 48.1698 +		goto fail;
 48.1699 +	}
 48.1700 +
 48.1701 +	/* Copy the key in place. */
 48.1702 +	memcpy(p, key.dptr, key.dsize);
 48.1703 +
 48.1704 +	/* Now read the old data into place. */
 48.1705 +	if (rec.data_len &&
 48.1706 +		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
 48.1707 +			goto fail;
 48.1708 +
 48.1709 +	/* Finally append the new data. */
 48.1710 +	if (new_dbuf.dsize)
 48.1711 +		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
 48.1712 +
 48.1713 +	/* delete any existing record - if it doesn't exist we don't
 48.1714 +           care.  Doing this first reduces fragmentation, and avoids
 48.1715 +           coalescing with `allocated' block before it's updated. */
 48.1716 +
 48.1717 +	tdb_delete_hash(tdb, key, hash);
 48.1718 +
 48.1719 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
 48.1720 +		goto fail;
 48.1721 +
 48.1722 +	/* Read hash top into next ptr */
 48.1723 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 48.1724 +		goto fail;
 48.1725 +
 48.1726 +	rec.key_len = key.dsize;
 48.1727 +	rec.data_len = new_data_size;
 48.1728 +	rec.full_hash = hash;
 48.1729 +	rec.magic = TDB_MAGIC;
 48.1730 +
 48.1731 +	/* write out and point the top of the hash chain at it */
 48.1732 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 48.1733 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
 48.1734 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 48.1735 +		/* Need to tdb_unallocate() here */
 48.1736 +		goto fail;
 48.1737 +	}
 48.1738 +
 48.1739 + out:
 48.1740 +	SAFE_FREE(p); 
 48.1741 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 48.1742 +	return ret;
 48.1743 +
 48.1744 +fail:
 48.1745 +	ret = -1;
 48.1746 +	goto out;
 48.1747 +}
 48.1748 +
 48.1749 +static int tdb_already_open(dev_t device,
 48.1750 +			    ino_t ino)
 48.1751 +{
 48.1752 +	TDB_CONTEXT *i;
 48.1753 +	
 48.1754 +	for (i = tdbs; i; i = i->next) {
 48.1755 +		if (i->device == device && i->inode == ino) {
 48.1756 +			return 1;
 48.1757 +		}
 48.1758 +	}
 48.1759 +
 48.1760 +	return 0;
 48.1761 +}
 48.1762 +
 48.1763 +/* open the database, creating it if necessary 
 48.1764 +
 48.1765 +   The open_flags and mode are passed straight to the open call on the
 48.1766 +   database file. A flags value of O_WRONLY is invalid. The hash size
 48.1767 +   is advisory, use zero for a default value.
 48.1768 +
 48.1769 +   Return is NULL on error, in which case errno is also set.  Don't 
 48.1770 +   try to call tdb_error or tdb_errname, just do strerror(errno).
 48.1771 +
 48.1772 +   @param name may be NULL for internal databases. */
 48.1773 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
 48.1774 +		      int open_flags, mode_t mode)
 48.1775 +{
 48.1776 +	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
 48.1777 +}
 48.1778 +
 48.1779 +/* a default logging function */
 48.1780 +static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
 48.1781 +			int level __attribute__((unused)),
 48.1782 +			const char *fmt __attribute__((unused)), ...)
 48.1783 +{
 48.1784 +}
 48.1785 +
 48.1786 +
 48.1787 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
 48.1788 +			 int open_flags, mode_t mode,
 48.1789 +			 tdb_log_func log_fn,
 48.1790 +			 tdb_hash_func hash_fn)
 48.1791 +{
 48.1792 +	TDB_CONTEXT *tdb;
 48.1793 +	struct stat st;
 48.1794 +	int rev = 0, locked = 0;
 48.1795 +	uint8_t *vp;
 48.1796 +	u32 vertest;
 48.1797 +
 48.1798 +	if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
 48.1799 +		/* Can't log this */
 48.1800 +		errno = ENOMEM;
 48.1801 +		goto fail;
 48.1802 +	}
 48.1803 +	tdb->fd = -1;
 48.1804 +	tdb->name = NULL;
 48.1805 +	tdb->map_ptr = NULL;
 48.1806 +	tdb->flags = tdb_flags;
 48.1807 +	tdb->open_flags = open_flags;
 48.1808 +	tdb->log_fn = log_fn?log_fn:null_log_fn;
 48.1809 +	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
 48.1810 +
 48.1811 +	if ((open_flags & O_ACCMODE) == O_WRONLY) {
 48.1812 +		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
 48.1813 +			 name));
 48.1814 +		errno = EINVAL;
 48.1815 +		goto fail;
 48.1816 +	}
 48.1817 +	
 48.1818 +	if (hash_size == 0)
 48.1819 +		hash_size = DEFAULT_HASH_SIZE;
 48.1820 +	if ((open_flags & O_ACCMODE) == O_RDONLY) {
 48.1821 +		tdb->read_only = 1;
 48.1822 +		/* read only databases don't do locking or clear if first */
 48.1823 +		tdb->flags |= TDB_NOLOCK;
 48.1824 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 48.1825 +	}
 48.1826 +
 48.1827 +	/* internal databases don't mmap or lock, and start off cleared */
 48.1828 +	if (tdb->flags & TDB_INTERNAL) {
 48.1829 +		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 48.1830 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 48.1831 +		if (tdb_new_database(tdb, hash_size) != 0) {
 48.1832 +			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
 48.1833 +			goto fail;
 48.1834 +		}
 48.1835 +		goto internal;
 48.1836 +	}
 48.1837 +
 48.1838 +	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
 48.1839 +		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
 48.1840 +			 name, strerror(errno)));
 48.1841 +		goto fail;	/* errno set by open(2) */
 48.1842 +	}
 48.1843 +
 48.1844 +	/* ensure there is only one process initialising at once */
 48.1845 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
 48.1846 +		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
 48.1847 +			 name, strerror(errno)));
 48.1848 +		goto fail;	/* errno set by tdb_brlock */
 48.1849 +	}
 48.1850 +
 48.1851 +	/* we need to zero database if we are the only one with it open */
 48.1852 +	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
 48.1853 +		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
 48.1854 +		open_flags |= O_CREAT;
 48.1855 +		if (ftruncate(tdb->fd, 0) == -1) {
 48.1856 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 48.1857 +				 "failed to truncate %s: %s\n",
 48.1858 +				 name, strerror(errno)));
 48.1859 +			goto fail; /* errno set by ftruncate */
 48.1860 +		}
 48.1861 +	}
 48.1862 +
 48.1863 +	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
 48.1864 +	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
 48.1865 +	    || (tdb->header.version != TDB_VERSION
 48.1866 +		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
 48.1867 +		/* it's not a valid database - possibly initialise it */
 48.1868 +		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
 48.1869 +			errno = EIO; /* ie bad format or something */
 48.1870 +			goto fail;
 48.1871 +		}
 48.1872 +		rev = (tdb->flags & TDB_CONVERT);
 48.1873 +	}
 48.1874 +	vp = (uint8_t *)&tdb->header.version;
 48.1875 +	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
 48.1876 +		  (((u32)vp[2]) << 8) | (u32)vp[3];
 48.1877 +	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
 48.1878 +	if (!rev)
 48.1879 +		tdb->flags &= ~TDB_CONVERT;
 48.1880 +	else {
 48.1881 +		tdb->flags |= TDB_CONVERT;
 48.1882 +		convert(&tdb->header, sizeof(tdb->header));
 48.1883 +	}
 48.1884 +	if (fstat(tdb->fd, &st) == -1)
 48.1885 +		goto fail;
 48.1886 +
 48.1887 +	/* Is it already in the open list?  If so, fail. */
 48.1888 +	if (tdb_already_open(st.st_dev, st.st_ino)) {
 48.1889 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 48.1890 +			 "%s (%d,%d) is already open in this process\n",
 48.1891 +			 name, (int)st.st_dev, (int)st.st_ino));
 48.1892 +		errno = EBUSY;
 48.1893 +		goto fail;
 48.1894 +	}
 48.1895 +
 48.1896 +	if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
 48.1897 +		errno = ENOMEM;
 48.1898 +		goto fail;
 48.1899 +	}
 48.1900 +
 48.1901 +	tdb->map_size = st.st_size;
 48.1902 +	tdb->device = st.st_dev;
 48.1903 +	tdb->inode = st.st_ino;
 48.1904 +	tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
 48.1905 +					tdb->header.hash_size+1);
 48.1906 +	if (!tdb->locked) {
 48.1907 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 48.1908 +			 "failed to allocate lock structure for %s\n",
 48.1909 +			 name));
 48.1910 +		errno = ENOMEM;
 48.1911 +		goto fail;
 48.1912 +	}
 48.1913 +	tdb_mmap(tdb);
 48.1914 +	if (locked) {
 48.1915 +		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
 48.1916 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 48.1917 +				 "failed to take ACTIVE_LOCK on %s: %s\n",
 48.1918 +				 name, strerror(errno)));
 48.1919 +			goto fail;
 48.1920 +		}
 48.1921 +
 48.1922 +	}
 48.1923 +
 48.1924 +	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
 48.1925 +	   we didn't get the initial exclusive lock as we need to let all other
 48.1926 +	   users know we're using it. */
 48.1927 +
 48.1928 +	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
 48.1929 +	/* leave this lock in place to indicate it's in use */
 48.1930 +	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
 48.1931 +		goto fail;
 48.1932 +	}
 48.1933 +
 48.1934 +
 48.1935 + internal:
 48.1936 +	/* Internal (memory-only) databases skip all the code above to
 48.1937 +	 * do with disk files, and resume here by releasing their
 48.1938 +	 * global lock and hooking into the active list. */
 48.1939 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
 48.1940 +		goto fail;
 48.1941 +	tdb->next = tdbs;
 48.1942 +	tdbs = tdb;
 48.1943 +	return tdb;
 48.1944 +
 48.1945 + fail:
 48.1946 +	{ int save_errno = errno;
 48.1947 +
 48.1948 +	if (!tdb)
 48.1949 +		return NULL;
 48.1950 +	
 48.1951 +	if (tdb->map_ptr) {
 48.1952 +		if (tdb->flags & TDB_INTERNAL)
 48.1953 +			SAFE_FREE(tdb->map_ptr);
 48.1954 +		else
 48.1955 +			tdb_munmap(tdb);
 48.1956 +	}
 48.1957 +	SAFE_FREE(tdb->name);
 48.1958 +	if (tdb->fd != -1)
 48.1959 +		if (close(tdb->fd) != 0)
 48.1960 +			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
 48.1961 +	SAFE_FREE(tdb->locked);
 48.1962 +	SAFE_FREE(tdb);
 48.1963 +	errno = save_errno;
 48.1964 +	return NULL;
 48.1965 +	}
 48.1966 +}
 48.1967 +
 48.1968 +/**
 48.1969 + * Close a database.
 48.1970 + *
 48.1971 + * @returns -1 for error; 0 for success.
 48.1972 + **/
 48.1973 +int tdb_close(TDB_CONTEXT *tdb)
 48.1974 +{
 48.1975 +	TDB_CONTEXT **i;
 48.1976 +	int ret = 0;
 48.1977 +
 48.1978 +	if (tdb->map_ptr) {
 48.1979 +		if (tdb->flags & TDB_INTERNAL)
 48.1980 +			SAFE_FREE(tdb->map_ptr);
 48.1981 +		else
 48.1982 +			tdb_munmap(tdb);
 48.1983 +	}
 48.1984 +	SAFE_FREE(tdb->name);
 48.1985 +	if (tdb->fd != -1)
 48.1986 +		ret = close(tdb->fd);
 48.1987 +	SAFE_FREE(tdb->locked);
 48.1988 +
 48.1989 +	/* Remove from contexts list */
 48.1990 +	for (i = &tdbs; *i; i = &(*i)->next) {
 48.1991 +		if (*i == tdb) {
 48.1992 +			*i = tdb->next;
 48.1993 +			break;
 48.1994 +		}
 48.1995 +	}
 48.1996 +
 48.1997 +	memset(tdb, 0, sizeof(*tdb));
 48.1998 +	SAFE_FREE(tdb);
 48.1999 +
 48.2000 +	return ret;
 48.2001 +}
 48.2002 +
 48.2003 +/* lock/unlock entire database */
 48.2004 +int tdb_lockall(TDB_CONTEXT *tdb)
 48.2005 +{
 48.2006 +	u32 i;
 48.2007 +
 48.2008 +	/* There are no locks on read-only dbs */
 48.2009 +	if (tdb->read_only)
 48.2010 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 48.2011 +	for (i = 0; i < tdb->header.hash_size; i++) 
 48.2012 +		if (tdb_lock(tdb, i, F_WRLCK))
 48.2013 +			break;
 48.2014 +
 48.2015 +	/* If error, release locks we have... */
 48.2016 +	if (i < tdb->header.hash_size) {
 48.2017 +		u32 j;
 48.2018 +
 48.2019 +		for ( j = 0; j < i; j++)
 48.2020 +			tdb_unlock(tdb, j, F_WRLCK);
 48.2021 +		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
 48.2022 +	}
 48.2023 +
 48.2024 +	return 0;
 48.2025 +}
 48.2026 +void tdb_unlockall(TDB_CONTEXT *tdb)
 48.2027 +{
 48.2028 +	u32 i;
 48.2029 +	for (i=0; i < tdb->header.hash_size; i++)
 48.2030 +		tdb_unlock(tdb, i, F_WRLCK);
 48.2031 +}
 48.2032 +
 48.2033 +/* lock/unlock one hash chain. This is meant to be used to reduce
 48.2034 +   contention - it cannot guarantee how many records will be locked */
 48.2035 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
 48.2036 +{
 48.2037 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 48.2038 +}
 48.2039 +
 48.2040 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
 48.2041 +{
 48.2042 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 48.2043 +}
 48.2044 +
 48.2045 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 48.2046 +{
 48.2047 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 48.2048 +}
 48.2049 +
 48.2050 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 48.2051 +{
 48.2052 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 48.2053 +}
 48.2054 +
 48.2055 +
 48.2056 +/* register a logging function */
 48.2057 +void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
 48.2058 +{
 48.2059 +	tdb->log_fn = fn?fn:null_log_fn;
 48.2060 +}
 48.2061 +
 48.2062 +
 48.2063 +/* reopen a tdb - this can be used after a fork to ensure that we have an independent
 48.2064 +   seek pointer from our parent and to re-establish locks */
 48.2065 +int tdb_reopen(TDB_CONTEXT *tdb)
 48.2066 +{
 48.2067 +	struct stat st;
 48.2068 +
 48.2069 +	if (tdb->flags & TDB_INTERNAL)
 48.2070 +		return 0; /* Nothing to do. */
 48.2071 +	if (tdb_munmap(tdb) != 0) {
 48.2072 +		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
 48.2073 +		goto fail;
 48.2074 +	}
 48.2075 +	if (close(tdb->fd) != 0)
 48.2076 +		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
 48.2077 +	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
 48.2078 +	if (tdb->fd == -1) {
 48.2079 +		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
 48.2080 +		goto fail;
 48.2081 +	}
 48.2082 +	if (fstat(tdb->fd, &st) != 0) {
 48.2083 +		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
 48.2084 +		goto fail;
 48.2085 +	}
 48.2086 +	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
 48.2087 +		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
 48.2088 +		goto fail;
 48.2089 +	}
 48.2090 +	tdb_mmap(tdb);
 48.2091 +	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
 48.2092 +		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
 48.2093 +		goto fail;
 48.2094 +	}
 48.2095 +
 48.2096 +	return 0;
 48.2097 +
 48.2098 +fail:
 48.2099 +	tdb_close(tdb);
 48.2100 +	return -1;
 48.2101 +}
 48.2102 +
 48.2103 +/* Not general: only works if single writer. */
 48.2104 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile)
 48.2105 +{
 48.2106 +	int fd, saved_errno;
 48.2107 +	TDB_CONTEXT *copy;
 48.2108 +
 48.2109 +	fd = open(outfile, O_TRUNC|O_CREAT|O_WRONLY, 0640);
 48.2110 +	if (fd < 0)
 48.2111 +		return NULL;
 48.2112 +	if (tdb->map_ptr) {
 48.2113 +		if (write(fd,tdb->map_ptr,tdb->map_size) != (int)tdb->map_size)
 48.2114 +			goto fail;
 48.2115 +	} else {
 48.2116 +		char buf[65536];
 48.2117 +		int r;
 48.2118 +
 48.2119 +		lseek(tdb->fd, 0, SEEK_SET);
 48.2120 +		while ((r = read(tdb->fd, buf, sizeof(buf))) > 0) {
 48.2121 +			if (write(fd, buf, r) != r)
 48.2122 +				goto fail;
 48.2123 +		}
 48.2124 +		if (r < 0)
 48.2125 +			goto fail;
 48.2126 +	}
 48.2127 +	copy = tdb_open(outfile, 0, 0, O_RDWR, 0);
 48.2128 +	if (!copy)
 48.2129 +		goto fail;
 48.2130 +	close(fd);
 48.2131 +	return copy;
 48.2132 +
 48.2133 +fail:
 48.2134 +	saved_errno = errno;
 48.2135 +	close(fd);
 48.2136 +	unlink(outfile);
 48.2137 +	errno = saved_errno;
 48.2138 +	return NULL;
 48.2139 +}
 48.2140 +
 48.2141 +/* reopen all tdb's */
 48.2142 +int tdb_reopen_all(void)
 48.2143 +{
 48.2144 +	TDB_CONTEXT *tdb;
 48.2145 +
 48.2146 +	for (tdb=tdbs; tdb; tdb = tdb->next) {
 48.2147 +		/* Ensure no clear-if-first. */
 48.2148 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 48.2149 +		if (tdb_reopen(tdb) != 0)
 48.2150 +			return -1;
 48.2151 +	}
 48.2152 +
 48.2153 +	return 0;
 48.2154 +}
    49.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    49.2 +++ b/tools/xenstore/tdb.h	Thu Sep 29 16:22:02 2005 -0600
    49.3 @@ -0,0 +1,157 @@
    49.4 +#ifndef __TDB_H__
    49.5 +#define __TDB_H__
    49.6 +
    49.7 +/* 
    49.8 +   Unix SMB/CIFS implementation.
    49.9 +
   49.10 +   trivial database library
   49.11 +
   49.12 +   Copyright (C) Andrew Tridgell 1999-2004
   49.13 +   
   49.14 +     ** NOTE! The following LGPL license applies to the tdb
   49.15 +     ** library. This does NOT imply that all of Samba is released
   49.16 +     ** under the LGPL
   49.17 +   
   49.18 +   This library is free software; you can redistribute it and/or
   49.19 +   modify it under the terms of the GNU Lesser General Public
   49.20 +   License as published by the Free Software Foundation; either
   49.21 +   version 2 of the License, or (at your option) any later version.
   49.22 +
   49.23 +   This library is distributed in the hope that it will be useful,
   49.24 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   49.25 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   49.26 +   Lesser General Public License for more details.
   49.27 +
   49.28 +   You should have received a copy of the GNU Lesser General Public
   49.29 +   License along with this library; if not, write to the Free Software
   49.30 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   49.31 +*/
   49.32 +
   49.33 +#ifdef  __cplusplus
   49.34 +extern "C" {
   49.35 +#endif
   49.36 +
   49.37 +
   49.38 +/* flags to tdb_store() */
   49.39 +#define TDB_REPLACE 1
   49.40 +#define TDB_INSERT 2
   49.41 +#define TDB_MODIFY 3
   49.42 +
   49.43 +/* flags for tdb_open() */
   49.44 +#define TDB_DEFAULT 0 /* just a readability place holder */
   49.45 +#define TDB_CLEAR_IF_FIRST 1
   49.46 +#define TDB_INTERNAL 2 /* don't store on disk */
   49.47 +#define TDB_NOLOCK   4 /* don't do any locking */
   49.48 +#define TDB_NOMMAP   8 /* don't use mmap */
   49.49 +#define TDB_CONVERT 16 /* convert endian (internal use) */
   49.50 +#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
   49.51 +
   49.52 +#define TDB_ERRCODE(code, ret) ((tdb->ecode = (code)), ret)
   49.53 +
   49.54 +/* error codes */
   49.55 +enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
   49.56 +		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
   49.57 +		TDB_ERR_NOEXIST};
   49.58 +
   49.59 +#ifndef u32
   49.60 +#define u32 unsigned
   49.61 +#endif
   49.62 +
   49.63 +typedef struct TDB_DATA {
   49.64 +	char *dptr;
   49.65 +	size_t dsize;
   49.66 +} TDB_DATA;
   49.67 +
   49.68 +typedef u32 tdb_len;
   49.69 +typedef u32 tdb_off;
   49.70 +
   49.71 +/* this is stored at the front of every database */
   49.72 +struct tdb_header {
   49.73 +	char magic_food[32]; /* for /etc/magic */
   49.74 +	u32 version; /* version of the code */
   49.75 +	u32 hash_size; /* number of hash entries */
   49.76 +	tdb_off rwlocks;
   49.77 +	tdb_off reserved[31];
   49.78 +};
   49.79 +
   49.80 +struct tdb_lock_type {
   49.81 +	u32 count;
   49.82 +	u32 ltype;
   49.83 +};
   49.84 +
   49.85 +struct tdb_traverse_lock {
   49.86 +	struct tdb_traverse_lock *next;
   49.87 +	u32 off;
   49.88 +	u32 hash;
   49.89 +};
   49.90 +
   49.91 +#ifndef PRINTF_ATTRIBUTE
   49.92 +#define PRINTF_ATTRIBUTE(a,b)
   49.93 +#endif
   49.94 +
   49.95 +/* this is the context structure that is returned from a db open */
   49.96 +typedef struct tdb_context {
   49.97 +	char *name; /* the name of the database */
   49.98 +	void *map_ptr; /* where it is currently mapped */
   49.99 +	int fd; /* open file descriptor for the database */
  49.100 +	tdb_len map_size; /* how much space has been mapped */
  49.101 +	int read_only; /* opened read-only */
  49.102 +	struct tdb_lock_type *locked; /* array of chain locks */
  49.103 +	enum TDB_ERROR ecode; /* error code for last tdb error */
  49.104 +	struct tdb_header header; /* a cached copy of the header */
  49.105 +	u32 flags; /* the flags passed to tdb_open */
  49.106 +	struct tdb_traverse_lock travlocks; /* current traversal locks */
  49.107 +	struct tdb_context *next; /* all tdbs to avoid multiple opens */
  49.108 +	dev_t device;	/* uniquely identifies this tdb */
  49.109 +	ino_t inode;	/* uniquely identifies this tdb */
  49.110 +	void (*log_fn)(struct tdb_context *tdb, int level, const char *, ...) PRINTF_ATTRIBUTE(3,4); /* logging function */
  49.111 +	u32 (*hash_fn)(TDB_DATA *key);
  49.112 +	int open_flags; /* flags used in the open - needed by reopen */
  49.113 +} TDB_CONTEXT;
  49.114 +
  49.115 +typedef int (*tdb_traverse_func)(TDB_CONTEXT *, TDB_DATA, TDB_DATA, void *);
  49.116 +typedef void (*tdb_log_func)(TDB_CONTEXT *, int , const char *, ...);
  49.117 +typedef u32 (*tdb_hash_func)(TDB_DATA *key);
  49.118 +
  49.119 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
  49.120 +		      int open_flags, mode_t mode);
  49.121 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
  49.122 +			 int open_flags, mode_t mode,
  49.123 +			 tdb_log_func log_fn,
  49.124 +			 tdb_hash_func hash_fn);
  49.125 +
  49.126 +int tdb_reopen(TDB_CONTEXT *tdb);
  49.127 +int tdb_reopen_all(void);
  49.128 +void tdb_logging_function(TDB_CONTEXT *tdb, tdb_log_func);
  49.129 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb);
  49.130 +const char *tdb_errorstr(TDB_CONTEXT *tdb);
  49.131 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key);
  49.132 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key);
  49.133 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
  49.134 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf);
  49.135 +int tdb_close(TDB_CONTEXT *tdb);
  49.136 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb);
  49.137 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key);
  49.138 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *);
  49.139 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);
  49.140 +int tdb_lockall(TDB_CONTEXT *tdb);
  49.141 +void tdb_unlockall(TDB_CONTEXT *tdb);
  49.142 +
  49.143 +/* Low level locking functions: use with care */
  49.144 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key);
  49.145 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key);
  49.146 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  49.147 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  49.148 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile);
  49.149 +
  49.150 +/* Debug functions. Not used in production. */
  49.151 +void tdb_dump_all(TDB_CONTEXT *tdb);
  49.152 +int tdb_printfreelist(TDB_CONTEXT *tdb);
  49.153 +
  49.154 +extern TDB_DATA tdb_null;
  49.155 +
  49.156 +#ifdef  __cplusplus
  49.157 +}
  49.158 +#endif
  49.159 +
  49.160 +#endif /* tdb.h */
    50.1 --- a/tools/xenstore/testsuite/04rm.test	Thu Sep 29 13:35:13 2005 -0600
    50.2 +++ b/tools/xenstore/testsuite/04rm.test	Thu Sep 29 16:22:02 2005 -0600
    50.3 @@ -6,6 +6,8 @@ rm /dir/test
    50.4  # Create file and remove it
    50.5  write /test contents
    50.6  rm /test
    50.7 +expect tool
    50.8 +dir /
    50.9  
   50.10  # Create directory and remove it.
   50.11  mkdir /dir
   50.12 @@ -15,3 +17,4 @@ rm /dir
   50.13  mkdir /dir
   50.14  write /dir/test contents
   50.15  rm /dir
   50.16 +
    51.1 --- a/tools/xenstore/testsuite/08transaction.slowtest	Thu Sep 29 13:35:13 2005 -0600
    51.2 +++ b/tools/xenstore/testsuite/08transaction.slowtest	Thu Sep 29 16:22:02 2005 -0600
    51.3 @@ -1,21 +1,43 @@
    51.4 -# Test transaction timeouts.  Take a second each.
    51.5 +# Test transaction clashes.
    51.6  
    51.7  mkdir /test
    51.8  write /test/entry1 contents
    51.9  
   51.10 -# Transactions can take as long as the want...
   51.11 -start /test
   51.12 -sleep 1100
   51.13 -rm /test/entry1
   51.14 -commit
   51.15 -dir /test
   51.16 +# Start transaction, do read-only op, transaction succeeds
   51.17 +1 start
   51.18 +1 write /test/entry1 contents2
   51.19 +expect contents
   51.20 +read /test/entry1
   51.21 +1 commit
   51.22 +expect contents2
   51.23 +read /test/entry1
   51.24 +
   51.25 +# Start transaction, abort other transaction, transaction succeeds.
   51.26 +1 start
   51.27 +1 write /test/entry1 contents3
   51.28 +start
   51.29 +write /test/entry1 contents
   51.30 +abort
   51.31 +1 commit
   51.32 +expect contents3
   51.33 +read /test/entry1
   51.34  
   51.35 -# ... as long as noone is waiting.
   51.36 -1 start /test
   51.37 -notimeout
   51.38 -2 mkdir /test/dir
   51.39 -1 mkdir /test/dir
   51.40 -expect 1:dir
   51.41 -1 dir /test
   51.42 -expect 1: commit failed: Connection timed out
   51.43 +# Start transaction, do write op, transaction fails
   51.44 +1 start
   51.45 +1 write /test/entry1 contents4
   51.46 +write /test/entry1 contents
   51.47 +expect 1: commit failed: Resource temporarily unavailable
   51.48  1 commit
   51.49 +expect contents
   51.50 +read /test/entry1
   51.51 +
   51.52 +# Start transaction, do other transaction, transaction fails
   51.53 +1 start
   51.54 +1 write /test/entry1 contents4
   51.55 +start
   51.56 +write /test/entry1 contents5
   51.57 +commit
   51.58 +expect 1: commit failed: Resource temporarily unavailable
   51.59 +1 commit
   51.60 +expect contents5
   51.61 +read /test/entry1
    52.1 --- a/tools/xenstore/testsuite/08transaction.test	Thu Sep 29 13:35:13 2005 -0600
    52.2 +++ b/tools/xenstore/testsuite/08transaction.test	Thu Sep 29 16:22:02 2005 -0600
    52.3 @@ -3,7 +3,7 @@
    52.4  mkdir /test
    52.5  
    52.6  # Simple transaction: create a file inside transaction.
    52.7 -1 start /test
    52.8 +1 start
    52.9  1 write /test/entry1 contents
   52.10  2 dir /test
   52.11  expect 1:entry1
   52.12 @@ -15,7 +15,7 @@ 2 read /test/entry1
   52.13  rm /test/entry1
   52.14  
   52.15  # Create a file and abort transaction.
   52.16 -1 start /test
   52.17 +1 start
   52.18  1 write /test/entry1 contents
   52.19  2 dir /test
   52.20  expect 1:entry1
   52.21 @@ -25,7 +25,7 @@ 2 dir /test
   52.22  
   52.23  write /test/entry1 contents
   52.24  # Delete in transaction, commit
   52.25 -1 start /test
   52.26 +1 start
   52.27  1 rm /test/entry1
   52.28  expect 2:entry1
   52.29  2 dir /test
   52.30 @@ -35,7 +35,7 @@ 2 dir /test
   52.31  
   52.32  # Delete in transaction, abort.
   52.33  write /test/entry1 contents
   52.34 -1 start /test
   52.35 +1 start
   52.36  1 rm /test/entry1
   52.37  expect 2:entry1
   52.38  2 dir /test
   52.39 @@ -47,7 +47,7 @@ 2 dir /test
   52.40  # Events inside transactions don't trigger watches until (successful) commit.
   52.41  mkdir /test/dir
   52.42  1 watch /test token
   52.43 -2 start /test
   52.44 +2 start
   52.45  2 mkdir /test/dir/sub
   52.46  expect 1: waitwatch failed: Connection timed out
   52.47  1 waitwatch
   52.48 @@ -55,7 +55,7 @@ 2 close
   52.49  1 close
   52.50  
   52.51  1 watch /test token
   52.52 -2 start /test
   52.53 +2 start
   52.54  2 mkdir /test/dir/sub
   52.55  2 abort
   52.56  expect 1: waitwatch failed: Connection timed out
   52.57 @@ -63,7 +63,7 @@ 1 waitwatch
   52.58  1 close
   52.59  
   52.60  1 watch /test token
   52.61 -2 start /test
   52.62 +2 start
   52.63  2 mkdir /test/dir/sub
   52.64  2 commit
   52.65  expect 1:/test/dir/sub:token
   52.66 @@ -73,7 +73,7 @@ 1 close
   52.67  
   52.68  # Rm inside transaction works like rm outside: children get notified.
   52.69  1 watch /test/dir/sub token
   52.70 -2 start /test
   52.71 +2 start
   52.72  2 rm /test/dir
   52.73  2 commit
   52.74  expect 1:/test/dir/sub:token
   52.75 @@ -83,7 +83,7 @@ 1 close
   52.76  
   52.77  # Multiple events from single transaction don't trigger assert
   52.78  1 watch /test token
   52.79 -2 start /test
   52.80 +2 start
   52.81  2 write /test/1 contents
   52.82  2 write /test/2 contents
   52.83  2 commit
    53.1 --- a/tools/xenstore/testsuite/12readonly.test	Thu Sep 29 13:35:13 2005 -0600
    53.2 +++ b/tools/xenstore/testsuite/12readonly.test	Thu Sep 29 16:22:02 2005 -0600
    53.3 @@ -13,23 +13,23 @@ expect 0 READ
    53.4  getperm /test
    53.5  watch /test token
    53.6  unwatch /test token 
    53.7 -start /
    53.8 +start
    53.9  commit
   53.10 -start /
   53.11 +start
   53.12  abort
   53.13  
   53.14  # These don't work
   53.15 -expect write failed: Read-only file system
   53.16 +expect write failed: Permission denied
   53.17  write /test2 contents
   53.18 -expect write failed: Read-only file system
   53.19 +expect write failed: Permission denied
   53.20  write /test contents
   53.21 -expect setperm failed: Read-only file system
   53.22 +expect setperm failed: Permission denied
   53.23  setperm /test 100 NONE
   53.24 -expect setperm failed: Read-only file system
   53.25 +expect setperm failed: Permission denied
   53.26  setperm /test 100 NONE
   53.27 -expect shutdown failed: Read-only file system
   53.28 +expect shutdown failed: Permission denied
   53.29  shutdown
   53.30 -expect introduce failed: Read-only file system
   53.31 +expect introduce failed: Permission denied
   53.32  introduce 1 100 7 /home
   53.33  
   53.34  # Check that watches work like normal.
    54.1 --- a/tools/xenstore/testsuite/14complexperms.test	Thu Sep 29 13:35:13 2005 -0600
    54.2 +++ b/tools/xenstore/testsuite/14complexperms.test	Thu Sep 29 16:22:02 2005 -0600
    54.3 @@ -33,14 +33,6 @@ unwatch /dir/file token
    54.4  expect *No such file or directory
    54.5  unwatch /dir/file token 
    54.6  expect *Permission denied
    54.7 -start /dir/file
    54.8 -expect *No such file or directory
    54.9 -abort
   54.10 -expect *Permission denied
   54.11 -start /dir/file
   54.12 -expect *No such file or directory
   54.13 -commit
   54.14 -expect *Permission denied
   54.15  introduce 2 100 7 /dir/file
   54.16  
   54.17  # Now it exists
   54.18 @@ -73,12 +65,4 @@ unwatch /dir/file token
   54.19  expect *No such file or directory
   54.20  unwatch /dir/file token 
   54.21  expect *Permission denied
   54.22 -start /dir/file
   54.23 -expect *No such file or directory
   54.24 -abort
   54.25 -expect *Permission denied
   54.26 -start /dir/file
   54.27 -expect *No such file or directory
   54.28 -commit
   54.29 -expect *Permission denied
   54.30  introduce 2 100 7 /dir/file
    55.1 --- a/tools/xenstore/testsuite/16block-watch-crash.test	Thu Sep 29 13:35:13 2005 -0600
    55.2 +++ b/tools/xenstore/testsuite/16block-watch-crash.test	Thu Sep 29 16:22:02 2005 -0600
    55.3 @@ -1,13 +1,14 @@
    55.4  # Test case where blocked connection gets sent watch.
    55.5  
    55.6 -mkdir /test
    55.7 -watch /test token
    55.8 -1 start /test
    55.9 -# This will block on above
   55.10 -noackwrite /test/entry contents
   55.11 -1 write /test/entry2 contents
   55.12 -1 commit
   55.13 -readack
   55.14 -expect /test/entry2:token
   55.15 -waitwatch
   55.16 -ackwatch token
   55.17 +# FIXME: We no longer block connections 
   55.18 +# mkdir /test
   55.19 +# watch /test token
   55.20 +# 1 start
   55.21 +# # This will block on above
   55.22 +# noackwrite /test/entry contents
   55.23 +# 1 write /test/entry2 contents
   55.24 +# 1 commit
   55.25 +# readack
   55.26 +# expect /test/entry2:token
   55.27 +# waitwatch
   55.28 +# ackwatch token
    56.1 --- a/tools/xenstore/xenstore_client.c	Thu Sep 29 13:35:13 2005 -0600
    56.2 +++ b/tools/xenstore/xenstore_client.c	Thu Sep 29 16:22:02 2005 -0600
    56.3 @@ -14,6 +14,7 @@
    56.4  #include <stdlib.h>
    56.5  #include <string.h>
    56.6  #include <xs.h>
    56.7 +#include <errno.h>
    56.8  
    56.9  static void
   56.10  usage(const char *progname)
   56.11 @@ -82,8 +83,8 @@ main(int argc, char **argv)
   56.12      }
   56.13  #endif
   56.14  
   56.15 -    /* XXX maybe find longest common prefix */
   56.16 -    success = xs_transaction_start(xsh, "/");
   56.17 +  again:
   56.18 +    success = xs_transaction_start(xsh);
   56.19      if (!success)
   56.20  	errx(1, "couldn't start transaction");
   56.21  
   56.22 @@ -145,8 +146,10 @@ main(int argc, char **argv)
   56.23  
   56.24   out:
   56.25      success = xs_transaction_end(xsh, ret ? true : false);
   56.26 -    if (!success)
   56.27 +    if (!success) {
   56.28 +	if (ret == 0 && errno == EAGAIN)
   56.29 +	    goto again;
   56.30  	errx(1, "couldn't end transaction");
   56.31 -
   56.32 +    }
   56.33      return ret;
   56.34  }
    57.1 --- a/tools/xenstore/xenstored.h	Thu Sep 29 13:35:13 2005 -0600
    57.2 +++ b/tools/xenstore/xenstored.h	Thu Sep 29 16:22:02 2005 -0600
    57.3 @@ -75,7 +75,7 @@ static struct xsd_errors xsd_errors[] __
    57.4  	XSD_ERROR(ENOSYS),
    57.5  	XSD_ERROR(EROFS),
    57.6  	XSD_ERROR(EBUSY),
    57.7 -	XSD_ERROR(ETIMEDOUT),
    57.8 +	XSD_ERROR(EAGAIN),
    57.9  	XSD_ERROR(EISCONN),
   57.10  };
   57.11  struct xsd_sockmsg
    58.1 --- a/tools/xenstore/xenstored_core.c	Thu Sep 29 13:35:13 2005 -0600
    58.2 +++ b/tools/xenstore/xenstored_core.c	Thu Sep 29 16:22:02 2005 -0600
    58.3 @@ -50,10 +50,12 @@
    58.4  #include "xenstored_transaction.h"
    58.5  #include "xenstored_domain.h"
    58.6  #include "xenctrl.h"
    58.7 +#include "tdb.h"
    58.8  
    58.9  static bool verbose;
   58.10  LIST_HEAD(connections);
   58.11  static int tracefd = -1;
   58.12 +static TDB_CONTEXT *tdb_ctx;
   58.13  
   58.14  #ifdef TESTING
   58.15  static bool failtest = false;
   58.16 @@ -126,6 +128,23 @@ void __attribute__((noreturn)) corrupt(s
   58.17  	_exit(2);
   58.18  }
   58.19  
   58.20 +TDB_CONTEXT *tdb_context(struct connection *conn)
   58.21 +{
   58.22 +	/* conn = NULL used in manual_node at setup. */
   58.23 +	if (!conn || !conn->transaction)
   58.24 +		return tdb_ctx;
   58.25 +	return tdb_transaction_context(conn->transaction);
   58.26 +}
   58.27 +
   58.28 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
   58.29 +{
   58.30 +	if (rename(newname, xs_daemon_tdb()) != 0)
   58.31 +		return false;
   58.32 +	tdb_close(tdb_ctx);
   58.33 +	tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
   58.34 +	return true;
   58.35 +}
   58.36 +
   58.37  static char *sockmsg_string(enum xsd_sockmsg_type type)
   58.38  {
   58.39  	switch (type) {
   58.40 @@ -202,37 +221,6 @@ void trace_destroy(const void *data, con
   58.41  	write(tracefd, string, strlen(string));
   58.42  }
   58.43  
   58.44 -void trace_watch_timeout(const struct connection *conn, const char *node, const char *token)
   58.45 -{
   58.46 -	char string[64];
   58.47 -	if (tracefd < 0)
   58.48 -		return;
   58.49 -	write(tracefd, "WATCH_TIMEOUT ", strlen("WATCH_TIMEOUT "));
   58.50 -	sprintf(string, " %p ", conn);
   58.51 -	write(tracefd, string, strlen(string));
   58.52 -	write(tracefd, " (", 2);
   58.53 -	write(tracefd, node, strlen(node));
   58.54 -	write(tracefd, " ", 1);
   58.55 -	write(tracefd, token, strlen(token));
   58.56 -	write(tracefd, ")\n", 2);
   58.57 -}
   58.58 -
   58.59 -static void trace_blocked(const struct connection *conn,
   58.60 -			  const struct buffered_data *data)
   58.61 -{
   58.62 -	char string[64];
   58.63 -
   58.64 -	if (tracefd < 0)
   58.65 -		return;
   58.66 -
   58.67 -	write(tracefd, "BLOCKED", strlen("BLOCKED"));
   58.68 -	sprintf(string, " %p (", conn);
   58.69 -	write(tracefd, string, strlen(string));
   58.70 -	write(tracefd, sockmsg_string(data->hdr.msg.type),
   58.71 -	      strlen(sockmsg_string(data->hdr.msg.type)));
   58.72 -	write(tracefd, ")\n", 2);
   58.73 -}
   58.74 -
   58.75  void trace(const char *fmt, ...)
   58.76  {
   58.77  	va_list arglist;
   58.78 @@ -253,7 +241,6 @@ static bool write_message(struct connect
   58.79  	int ret;
   58.80  	struct buffered_data *out = conn->out;
   58.81  
   58.82 -	assert(conn->state != BLOCKED);
   58.83  	if (out->inhdr) {
   58.84  		if (verbose)
   58.85  			xprintf("Writing msg %s (%s) out to %p\n",
   58.86 @@ -351,24 +338,6 @@ static int initialize_set(fd_set *inset,
   58.87  	return max;
   58.88  }
   58.89  
   58.90 -/* Read everything from a talloc_open'ed fd. */
   58.91 -void *read_all(int *fd, unsigned int *size)
   58.92 -{
   58.93 -	unsigned int max = 4;
   58.94 -	int ret;
   58.95 -	void *buffer = talloc_size(fd, max);
   58.96 -
   58.97 -	*size = 0;
   58.98 -	while ((ret = read(*fd, buffer + *size, max - *size)) > 0) {
   58.99 -		*size += ret;
  58.100 -		if (*size == max)
  58.101 -			buffer = talloc_realloc_size(fd, buffer, max *= 2);
  58.102 -	}
  58.103 -	if (ret < 0)
  58.104 -		return NULL;
  58.105 -	return buffer;
  58.106 -}
  58.107 -
  58.108  static int destroy_fd(void *_fd)
  58.109  {
  58.110  	int *fd = _fd;
  58.111 @@ -409,42 +378,167 @@ bool is_child(const char *child, const c
  58.112  	return child[len] == '/' || child[len] == '\0';
  58.113  }
  58.114  
  58.115 -/* Answer never ends in /. */
  58.116 -char *node_dir_outside_transaction(const char *node)
  58.117 +/* If it fails, returns NULL and sets errno. */
  58.118 +static struct node *read_node(struct connection *conn, const char *name)
  58.119  {
  58.120 -	if (streq(node, "/"))
  58.121 -		return talloc_strdup(node, xs_daemon_store());
  58.122 -	return talloc_asprintf(node, "%s%s", xs_daemon_store(), node);
  58.123 +	TDB_DATA key, data;
  58.124 +	u32 *p;
  58.125 +	struct node *node;
  58.126 +
  58.127 +	key.dptr = (void *)name;
  58.128 +	key.dsize = strlen(name);
  58.129 +	data = tdb_fetch(tdb_context(conn), key);
  58.130 +
  58.131 +	if (data.dptr == NULL) {
  58.132 +		if (tdb_error(tdb_context(conn)) == TDB_ERR_NOEXIST)
  58.133 +			errno = ENOENT;
  58.134 +		else
  58.135 +			errno = EIO;
  58.136 +		return NULL;
  58.137 +	}
  58.138 +
  58.139 +	node = talloc(name, struct node);
  58.140 +	node->name = talloc_strdup(node, name);
  58.141 +	node->parent = NULL;
  58.142 +	node->tdb = tdb_context(conn);
  58.143 +	talloc_steal(node, data.dptr);
  58.144 +
  58.145 +	/* Number of permissions, datalen, childlen */
  58.146 +	p = (u32 *)data.dptr;
  58.147 +	node->num_perms = p[0];
  58.148 +	node->datalen = p[1];
  58.149 +	node->childlen = p[2];
  58.150 +
  58.151 +	/* Permissions are struct xs_permissions. */
  58.152 +	node->perms = (void *)&p[3];
  58.153 +	/* Data is binary blob (usually ascii, no nul). */
  58.154 +	node->data = node->perms + node->num_perms;
  58.155 +	/* Children are nul-terminated strings, stored back to back. */
  58.156 +	node->children = node->data + node->datalen;
  58.157 +
  58.158 +	return node;
  58.159  }
  58.160  
  58.161 -static char *node_dir(struct transaction *trans, const char *node)
  58.162 +static bool write_node(struct connection *conn, const struct node *node)
  58.163  {
  58.164 -	if (!trans || !within_transaction(trans, node))
  58.165 -		return node_dir_outside_transaction(node);
  58.166 -	return node_dir_inside_transaction(trans, node);
  58.167 +	TDB_DATA key, data;
  58.168 +	void *p;
  58.169 +
  58.170 +	key.dptr = (void *)node->name;
  58.171 +	key.dsize = strlen(node->name);
  58.172 +
  58.173 +	data.dsize = 3*sizeof(u32)
  58.174 +		+ node->num_perms*sizeof(node->perms[0])
  58.175 +		+ node->datalen + node->childlen;
  58.176 +	data.dptr = talloc_size(node, data.dsize);
  58.177 +	((u32 *)data.dptr)[0] = node->num_perms;
  58.178 +	((u32 *)data.dptr)[1] = node->datalen;
  58.179 +	((u32 *)data.dptr)[2] = node->childlen;
  58.180 +	p = data.dptr + 3 * sizeof(u32);
  58.181 +
  58.182 +	memcpy(p, node->perms, node->num_perms*sizeof(node->perms[0]));
  58.183 +	p += node->num_perms*sizeof(node->perms[0]);
  58.184 +	memcpy(p, node->data, node->datalen);
  58.185 +	p += node->datalen;
  58.186 +	memcpy(p, node->children, node->childlen);
  58.187 +
  58.188 +	/* TDB should set errno, but doesn't even set ecode AFAICT. */
  58.189 +	if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
  58.190 +		errno = ENOSPC;
  58.191 +		return false;
  58.192 +	}
  58.193 +	return true;
  58.194  }
  58.195  
  58.196 -static char *datafile(const char *dir)
  58.197 +static enum xs_perm_type perm_for_conn(struct connection *conn,
  58.198 +				       struct xs_permissions *perms,
  58.199 +				       unsigned int num)
  58.200  {
  58.201 -	return talloc_asprintf(dir, "%s/.data", dir);
  58.202 +	unsigned int i;
  58.203 +	enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  58.204 +
  58.205 +	if (!conn->can_write)
  58.206 +		mask &= ~XS_PERM_WRITE;
  58.207 +
  58.208 +	/* Owners and tools get it all... */
  58.209 +	if (!conn->id || perms[0].id == conn->id)
  58.210 +		return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
  58.211 +
  58.212 +	for (i = 1; i < num; i++)
  58.213 +		if (perms[i].id == conn->id)
  58.214 +			return perms[i].perms & mask;
  58.215 +
  58.216 +	return perms[0].perms & mask;
  58.217  }
  58.218  
  58.219 -static char *node_datafile(struct transaction *trans, const char *node)
  58.220 +static char *get_parent(const char *node)
  58.221  {
  58.222 -	return datafile(node_dir(trans, node));
  58.223 +	char *slash = strrchr(node + 1, '/');
  58.224 +	if (!slash)
  58.225 +		return talloc_strdup(node, "/");
  58.226 +	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  58.227  }
  58.228  
  58.229 -static char *permfile(const char *dir)
  58.230 +/* What do parents say? */
  58.231 +static enum xs_perm_type ask_parents(struct connection *conn, const char *name)
  58.232  {
  58.233 -	return talloc_asprintf(dir, "%s/.perms", dir);
  58.234 +	struct node *node;
  58.235 +
  58.236 +	do {
  58.237 +		name = get_parent(name);
  58.238 +		node = read_node(conn, name);
  58.239 +		if (node)
  58.240 +			break;
  58.241 +	} while (!streq(name, "/"));
  58.242 +
  58.243 +	/* No permission at root?  We're in trouble. */
  58.244 +	if (!node)
  58.245 +		corrupt(conn, "No permissions file at root");
  58.246 +
  58.247 +	return perm_for_conn(conn, node->perms, node->num_perms);
  58.248  }
  58.249  
  58.250 -static char *node_permfile(struct transaction *trans, const char *node)
  58.251 +/* We have a weird permissions system.  You can allow someone into a
  58.252 + * specific node without allowing it in the parents.  If it's going to
  58.253 + * fail, however, we don't want the errno to indicate any information
  58.254 + * about the node. */
  58.255 +static int errno_from_parents(struct connection *conn, const char *node,
  58.256 +			      int errnum, enum xs_perm_type perm)
  58.257  {
  58.258 -	return permfile(node_dir(trans, node));
  58.259 +	/* We always tell them about memory failures. */
  58.260 +	if (errnum == ENOMEM)
  58.261 +		return errnum;
  58.262 +
  58.263 +	if (ask_parents(conn, node) & perm)
  58.264 +		return errnum;
  58.265 +	return EACCES;
  58.266  }
  58.267  
  58.268 -struct buffered_data *new_buffer(void *ctx)
  58.269 +/* If it fails, returns NULL and sets errno. */
  58.270 +struct node *get_node(struct connection *conn,
  58.271 +		      const char *name,
  58.272 +		      enum xs_perm_type perm)
  58.273 +{
  58.274 +	struct node *node;
  58.275 +
  58.276 +	if (!name || !is_valid_nodename(name)) {
  58.277 +		errno = EINVAL;
  58.278 +		return NULL;
  58.279 +	}
  58.280 +	node = read_node(conn, name);
  58.281 +	/* If we don't have permission, we don't have node. */
  58.282 +	if (node) {
  58.283 +		if ((perm_for_conn(conn, node->perms, node->num_perms) & perm)
  58.284 +		    != perm)
  58.285 +			node = NULL;
  58.286 +	}
  58.287 +	/* Clean up errno if they weren't supposed to know. */
  58.288 +	if (!node) 
  58.289 +		errno = errno_from_parents(conn, name, errno, perm);
  58.290 +	return node;
  58.291 +}
  58.292 +
  58.293 +static struct buffered_data *new_buffer(void *ctx)
  58.294  {
  58.295  	struct buffered_data *data;
  58.296  
  58.297 @@ -457,7 +551,8 @@ struct buffered_data *new_buffer(void *c
  58.298  }
  58.299  
  58.300  /* Return length of string (including nul) at this offset. */
  58.301 -unsigned int get_string(const struct buffered_data *data, unsigned int offset)
  58.302 +static unsigned int get_string(const struct buffered_data *data,
  58.303 +			       unsigned int offset)
  58.304  {
  58.305  	const char *nul;
  58.306  
  58.307 @@ -508,7 +603,6 @@ void send_reply(struct connection *conn,
  58.308  		conn->waiting_reply = bdata;
  58.309  	} else
  58.310  		conn->out = bdata;
  58.311 -	assert(conn->state != BLOCKED);
  58.312  	conn->state = BUSY;
  58.313  }
  58.314  
  58.315 @@ -567,29 +661,6 @@ static const char *onearg(struct buffere
  58.316  	return in->buffer;
  58.317  }
  58.318  
  58.319 -/* If it fails, returns NULL and sets errno. */
  58.320 -static struct xs_permissions *get_perms(const char *dir, unsigned int *num)
  58.321 -{
  58.322 -	unsigned int size;
  58.323 -	char *strings;
  58.324 -	struct xs_permissions *ret;
  58.325 -	int *fd;
  58.326 -
  58.327 -	fd = talloc_open(permfile(dir), O_RDONLY, 0);
  58.328 -	if (!fd)
  58.329 -		return NULL;
  58.330 -	strings = read_all(fd, &size);
  58.331 -	if (!strings)
  58.332 -		return NULL;
  58.333 -
  58.334 -	*num = xs_count_strings(strings, size);
  58.335 -	ret = talloc_array(dir, struct xs_permissions, *num);
  58.336 -	if (!xs_strings_to_perms(ret, *num, strings))
  58.337 -		corrupt(NULL, "Permissions corrupt for %s", dir);
  58.338 -
  58.339 -	return ret;
  58.340 -}
  58.341 -
  58.342  static char *perms_to_strings(const void *ctx,
  58.343  			      struct xs_permissions *perms, unsigned int num,
  58.344  			      unsigned int *len)
  58.345 @@ -610,173 +681,6 @@ static char *perms_to_strings(const void
  58.346  	return strings;
  58.347  }
  58.348  
  58.349 -/* Destroy this, and its children, and its children's children. */
  58.350 -int destroy_path(void *path)
  58.351 -{
  58.352 -	DIR *dir;
  58.353 -	struct dirent *dirent;
  58.354 -
  58.355 -	dir = opendir(path);
  58.356 -	if (!dir) {
  58.357 -		if (unlink(path) == 0 || errno == ENOENT)
  58.358 -			return 0;
  58.359 -		corrupt(NULL, "Destroying path %s", path);
  58.360 -	}
  58.361 -
  58.362 -	while ((dirent = readdir(dir)) != NULL) {
  58.363 -		char fullpath[strlen(path) + 1 + strlen(dirent->d_name) + 1];
  58.364 -		sprintf(fullpath, "%s/%s", (char *)path, dirent->d_name);
  58.365 -		if (!streq(dirent->d_name,".") && !streq(dirent->d_name,".."))
  58.366 -			destroy_path(fullpath);
  58.367 -	}
  58.368 -	closedir(dir);
  58.369 -	if (rmdir(path) != 0)
  58.370 -		corrupt(NULL, "Destroying directory %s", path);
  58.371 -	return 0;
  58.372 -}
  58.373 -
  58.374 -/* Create a self-destructing temporary path */
  58.375 -static char *temppath(const char *path)
  58.376 -{
  58.377 -	char *tmppath = talloc_asprintf(path, "%s.tmp", path);
  58.378 -	talloc_set_destructor(tmppath, destroy_path);
  58.379 -	return tmppath;
  58.380 -}
  58.381 -
  58.382 -/* Create a self-destructing temporary file */
  58.383 -static char *tempfile(const char *path, void *contents, unsigned int len)
  58.384 -{
  58.385 -	int *fd;
  58.386 -	char *tmppath = temppath(path);
  58.387 -
  58.388 -	fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  58.389 -	if (!fd)
  58.390 -		return NULL;
  58.391 -	if (!xs_write_all(*fd, contents, len))
  58.392 -		return NULL;
  58.393 -
  58.394 -	return tmppath;
  58.395 -}
  58.396 -
  58.397 -static int destroy_opendir(void *_dir)
  58.398 -{
  58.399 -	DIR **dir = _dir;
  58.400 -	closedir(*dir);
  58.401 -	return 0;
  58.402 -}
  58.403 -
  58.404 -/* Return a pointer to a DIR*, self-closing and attached to this pathname. */
  58.405 -DIR **talloc_opendir(const char *pathname)
  58.406 -{
  58.407 -	DIR **dir;
  58.408 -
  58.409 -	dir = talloc(pathname, DIR *);
  58.410 -	*dir = opendir(pathname);
  58.411 -	if (!*dir) {
  58.412 -		int saved_errno = errno;
  58.413 -		talloc_free(dir);
  58.414 -		errno = saved_errno;
  58.415 -		return NULL;
  58.416 -	}
  58.417 -	talloc_set_destructor(dir, destroy_opendir);
  58.418 -	return dir;
  58.419 -}
  58.420 -
  58.421 -/* We assume rename() doesn't fail on moves in same dir. */
  58.422 -static void commit_tempfile(const char *path)
  58.423 -{
  58.424 -	char realname[strlen(path) + 1];
  58.425 -	unsigned int len = strrchr(path, '.') - path;
  58.426 -
  58.427 -	memcpy(realname, path, len);
  58.428 -	realname[len] = '\0';
  58.429 -	if (rename(path, realname) != 0)
  58.430 -		corrupt(NULL, "Committing %s", realname);
  58.431 -	talloc_set_destructor(path, NULL);
  58.432 -}
  58.433 -
  58.434 -static bool set_perms(struct transaction *transaction,
  58.435 -		      const char *node,
  58.436 -		      struct xs_permissions *perms, unsigned int num)
  58.437 -{
  58.438 -	unsigned int len;
  58.439 -	char *permpath, *strings;
  58.440 -
  58.441 -	strings = perms_to_strings(node, perms, num, &len);
  58.442 -	if (!strings)
  58.443 -		return false;
  58.444 -
  58.445 -	/* Create then move. */
  58.446 -	permpath = tempfile(node_permfile(transaction, node), strings, len);
  58.447 -	if (!permpath)
  58.448 -		return false;
  58.449 -
  58.450 -	commit_tempfile(permpath);
  58.451 -	return true;
  58.452 -}
  58.453 -
  58.454 -static char *get_parent(const char *node)
  58.455 -{
  58.456 -	char *slash = strrchr(node + 1, '/');
  58.457 -	if (!slash)
  58.458 -		return talloc_strdup(node, "/");
  58.459 -	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  58.460 -}
  58.461 -
  58.462 -static enum xs_perm_type perm_for_id(domid_t id,
  58.463 -				     struct xs_permissions *perms,
  58.464 -				     unsigned int num)
  58.465 -{
  58.466 -	unsigned int i;
  58.467 -
  58.468 -	/* Owners and tools get it all... */
  58.469 -	if (!id || perms[0].id == id)
  58.470 -		return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  58.471 -
  58.472 -	for (i = 1; i < num; i++)
  58.473 -		if (perms[i].id == id)
  58.474 -			return perms[i].perms;
  58.475 -
  58.476 -	return perms[0].perms;
  58.477 -}
  58.478 -
  58.479 -/* What do parents say? */
  58.480 -static enum xs_perm_type ask_parents(struct connection *conn,
  58.481 -				     const char *node)
  58.482 -{
  58.483 -	struct xs_permissions *perms;
  58.484 -	unsigned int num;
  58.485 -
  58.486 -	do {
  58.487 -		node = get_parent(node);
  58.488 -		perms = get_perms(node_dir(conn->transaction, node), &num);
  58.489 -		if (perms)
  58.490 -			break;
  58.491 -	} while (!streq(node, "/"));
  58.492 -
  58.493 -	/* No permission at root?  We're in trouble. */
  58.494 -	if (!perms)
  58.495 -		corrupt(conn, "No permissions file at root");
  58.496 -
  58.497 -	return perm_for_id(conn->id, perms, num);
  58.498 -}
  58.499 -
  58.500 -/* We have a weird permissions system.  You can allow someone into a
  58.501 - * specific node without allowing it in the parents.  If it's going to
  58.502 - * fail, however, we don't want the errno to indicate any information
  58.503 - * about the node. */
  58.504 -static int errno_from_parents(struct connection *conn, const char *node,
  58.505 -			      int errnum)
  58.506 -{
  58.507 -	/* We always tell them about memory failures. */
  58.508 -	if (errnum == ENOMEM)
  58.509 -		return errnum;
  58.510 -
  58.511 -	if (ask_parents(conn, node) & XS_PERM_READ)
  58.512 -		return errnum;
  58.513 -	return EACCES;
  58.514 -}
  58.515 -
  58.516  char *canonicalize(struct connection *conn, const char *node)
  58.517  {
  58.518  	const char *prefix;
  58.519 @@ -789,46 +693,6 @@ char *canonicalize(struct connection *co
  58.520  	return (char *)node;
  58.521  }
  58.522  
  58.523 -bool check_node_perms(struct connection *conn, const char *node,
  58.524 -		      enum xs_perm_type perm)
  58.525 -{
  58.526 -	struct xs_permissions *perms;
  58.527 -	unsigned int num;
  58.528 -
  58.529 -	if (!node || !is_valid_nodename(node)) {
  58.530 -		errno = EINVAL;
  58.531 -		return false;
  58.532 -	}
  58.533 -
  58.534 -	if (!conn->can_write && (perm & XS_PERM_WRITE)) {
  58.535 -		errno = EROFS;
  58.536 -		return false;
  58.537 -	}
  58.538 -
  58.539 -	perms = get_perms(node_dir(conn->transaction, node), &num);
  58.540 -
  58.541 -	if (perms) {
  58.542 -		if (perm_for_id(conn->id, perms, num) & perm)
  58.543 -			return true;
  58.544 -		errno = EACCES;
  58.545 -		return false;
  58.546 -	}
  58.547 -
  58.548 -	/* If it's OK not to exist, we consult parents. */
  58.549 -	if (errno == ENOENT && (perm & XS_PERM_ENOENT_OK)) {
  58.550 -		if (ask_parents(conn, node) & perm)
  58.551 -			return true;
  58.552 -		/* Parents say they should not know. */
  58.553 -		errno = EACCES;
  58.554 -		return false;
  58.555 -	}
  58.556 -
  58.557 -	/* They might not have permission to even *see* this node, in
  58.558 -	 * which case we return EACCES even if it's ENOENT or EIO. */
  58.559 -	errno = errno_from_parents(conn, node, errno);
  58.560 -	return false;
  58.561 -}
  58.562 -
  58.563  bool check_event_node(const char *node)
  58.564  {
  58.565  	if (!node || !strstarts(node, "@")) {
  58.566 @@ -838,142 +702,144 @@ bool check_event_node(const char *node)
  58.567  	return true;
  58.568  }
  58.569  
  58.570 -static void send_directory(struct connection *conn, const char *node)
  58.571 +static void send_directory(struct connection *conn, const char *name)
  58.572  {
  58.573 -	char *path, *reply;
  58.574 -	unsigned int reply_len = 0;
  58.575 -	DIR **dir;
  58.576 -	struct dirent *dirent;
  58.577 +	struct node *node;
  58.578  
  58.579 -	node = canonicalize(conn, node);
  58.580 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  58.581 -		send_error(conn, errno);
  58.582 -		return;
  58.583 -	}
  58.584 -
  58.585 -	path = node_dir(conn->transaction, node);
  58.586 -	dir = talloc_opendir(path);
  58.587 -	if (!dir) {
  58.588 +	name = canonicalize(conn, name);
  58.589 +	node = get_node(conn, name, XS_PERM_READ);
  58.590 +	if (!node) {
  58.591  		send_error(conn, errno);
  58.592  		return;
  58.593  	}
  58.594  
  58.595 -	reply = talloc_strdup(node, "");
  58.596 -	while ((dirent = readdir(*dir)) != NULL) {
  58.597 -		int len = strlen(dirent->d_name) + 1;
  58.598 -
  58.599 -		if (!valid_chars(dirent->d_name))
  58.600 -			continue;
  58.601 -
  58.602 -		reply = talloc_realloc(path, reply, char, reply_len + len);
  58.603 -		strcpy(reply + reply_len, dirent->d_name);
  58.604 -		reply_len += len;
  58.605 -	}
  58.606 -
  58.607 -	send_reply(conn, XS_DIRECTORY, reply, reply_len);
  58.608 +	send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
  58.609  }
  58.610  
  58.611 -static void do_read(struct connection *conn, const char *node)
  58.612 +static void do_read(struct connection *conn, const char *name)
  58.613  {
  58.614 -	char *value;
  58.615 -	unsigned int size;
  58.616 -	int *fd;
  58.617 +	struct node *node;
  58.618  
  58.619 -	node = canonicalize(conn, node);
  58.620 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  58.621 -		send_error(conn, errno);
  58.622 -		return;
  58.623 -	}
  58.624 -
  58.625 -	fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
  58.626 -	if (!fd) {
  58.627 -		/* Data file doesn't exist?  We call that a directory */
  58.628 -		if (errno == ENOENT)
  58.629 -			errno = EISDIR;
  58.630 +	name = canonicalize(conn, name);
  58.631 +	node = get_node(conn, name, XS_PERM_READ);
  58.632 +	if (!node) {
  58.633  		send_error(conn, errno);
  58.634  		return;
  58.635  	}
  58.636  
  58.637 -	value = read_all(fd, &size);
  58.638 -	if (!value)
  58.639 -		send_error(conn, errno);
  58.640 -	else
  58.641 -		send_reply(conn, XS_READ, value, size);
  58.642 +	send_reply(conn, XS_READ, node->data, node->datalen);
  58.643  }
  58.644  
  58.645 -/* Commit this directory, eg. comitting a/b.tmp/c causes a/b.tmp -> a.b */
  58.646 -static bool commit_dir(char *dir)
  58.647 +static void delete_node_single(struct connection *conn, struct node *node)
  58.648  {
  58.649 -	char *dot, *slash, *dest;
  58.650 +	TDB_DATA key;
  58.651 +
  58.652 +	key.dptr = (void *)node->name;
  58.653 +	key.dsize = strlen(node->name);
  58.654  
  58.655 -	dot = strrchr(dir, '.');
  58.656 -	slash = strchr(dot, '/');
  58.657 -	if (slash)
  58.658 -		*slash = '\0';
  58.659 +	if (tdb_delete(tdb_context(conn), key) != 0)
  58.660 +		corrupt(conn, "Could not delete '%s'", node->name);
  58.661 +}
  58.662  
  58.663 -	dest = talloc_asprintf(dir, "%.*s", (int)(dot - dir), dir);
  58.664 -	return rename(dir, dest) == 0;
  58.665 +/* Must not be / */
  58.666 +static char *basename(const char *name)
  58.667 +{
  58.668 +	return strrchr(name, '/') + 1;
  58.669  }
  58.670  
  58.671 -/* Create a temporary directory.  Put data in it (if data != NULL) */
  58.672 -static char *tempdir(struct connection *conn,
  58.673 -		     const char *node, void *data, unsigned int datalen)
  58.674 +static struct node *construct_node(struct connection *conn, const char *name)
  58.675  {
  58.676 -	struct xs_permissions *perms;
  58.677 -	char *permstr;
  58.678 -	unsigned int num, len;
  58.679 -	int *fd;
  58.680 -	char *dir;
  58.681 +	const char *base;
  58.682 +	unsigned int baselen;
  58.683 +	struct node *parent, *node;
  58.684 +	char *children, *parentname = get_parent(name);
  58.685 +
  58.686 +	/* If parent doesn't exist, create it. */
  58.687 +	parent = read_node(conn, parentname);
  58.688 +	if (!parent)
  58.689 +		parent = construct_node(conn, parentname);
  58.690 +	if (!parent)
  58.691 +		return NULL;
  58.692 +	
  58.693 +	/* Add child to parent. */
  58.694 +	base = basename(name);
  58.695 +	baselen = strlen(base) + 1;
  58.696 +	children = talloc_array(name, char, parent->childlen + baselen);
  58.697 +	memcpy(children, parent->children, parent->childlen);
  58.698 +	memcpy(children + parent->childlen, base, baselen);
  58.699 +	parent->children = children;
  58.700 +	parent->childlen += baselen;
  58.701 +
  58.702 +	/* Allocate node */
  58.703 +	node = talloc(name, struct node);
  58.704 +	node->tdb = tdb_context(conn);
  58.705 +	node->name = talloc_strdup(node, name);
  58.706 +
  58.707 +	/* Inherit permissions, except domains own what they create */
  58.708 +	node->num_perms = parent->num_perms;
  58.709 +	node->perms = talloc_memdup(node, parent->perms,
  58.710 +				    node->num_perms * sizeof(node->perms[0]));
  58.711 +	if (conn->id)
  58.712 +		node->perms[0].id = conn->id;
  58.713  
  58.714 -	dir = temppath(node_dir(conn->transaction, node));
  58.715 -	if (mkdir(dir, 0750) != 0) {
  58.716 -		if (errno != ENOENT)
  58.717 -			return NULL;
  58.718 +	/* No children, no data */
  58.719 +	node->children = node->data = NULL;
  58.720 +	node->childlen = node->datalen = 0;
  58.721 +	node->parent = parent;
  58.722 +	return node;
  58.723 +}
  58.724 +
  58.725 +static int destroy_node(void *_node)
  58.726 +{
  58.727 +	struct node *node = _node;
  58.728 +	TDB_DATA key;
  58.729 +
  58.730 +	if (streq(node->name, "/"))
  58.731 +		corrupt(NULL, "Destroying root node!");
  58.732 +
  58.733 +	key.dptr = (void *)node->name;
  58.734 +	key.dsize = strlen(node->name);
  58.735 +
  58.736 +	tdb_delete(node->tdb, key);
  58.737 +	return 0;
  58.738 +}
  58.739  
  58.740 -		dir = tempdir(conn, get_parent(node), NULL, 0);
  58.741 -		if (!dir)
  58.742 -			return NULL;
  58.743 +/* Be careful: create hierarchy, put entry in existing parent *last*.
  58.744 + * This helps fsck if we die during this. */
  58.745 +static struct node *create_node(struct connection *conn, 
  58.746 +				const char *name,
  58.747 +				void *data, unsigned int datalen)
  58.748 +{
  58.749 +	struct node *node, *i;
  58.750  
  58.751 -		dir = talloc_asprintf(dir, "%s%s", dir, strrchr(node, '/'));
  58.752 -		if (mkdir(dir, 0750) != 0)
  58.753 +	node = construct_node(conn, name);
  58.754 +	if (!node)
  58.755 +		return NULL;
  58.756 +
  58.757 +	node->data = data;
  58.758 +	node->datalen = datalen;
  58.759 +
  58.760 +	/* We write out the nodes down, setting destructor in case
  58.761 +	 * something goes wrong. */
  58.762 +	for (i = node; i; i = i->parent) {
  58.763 +		if (!write_node(conn, i))
  58.764  			return NULL;
  58.765 -		talloc_set_destructor(dir, destroy_path);
  58.766 +		talloc_set_destructor(i, destroy_node);
  58.767  	}
  58.768  
  58.769 -	perms = get_perms(get_parent(dir), &num);
  58.770 -	assert(perms);
  58.771 -	/* Domains own what they create. */
  58.772 -	if (conn->id)
  58.773 -		perms->id = conn->id;
  58.774 -
  58.775 -	permstr = perms_to_strings(dir, perms, num, &len);
  58.776 -	fd = talloc_open(permfile(dir), O_WRONLY|O_CREAT|O_EXCL, 0640);
  58.777 -	if (!fd || !xs_write_all(*fd, permstr, len))
  58.778 -		return NULL;
  58.779 -
  58.780 -	if (data) {
  58.781 -		char *datapath = datafile(dir);
  58.782 -
  58.783 -		fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  58.784 -		if (!fd || !xs_write_all(*fd, data, datalen))
  58.785 -			return NULL;
  58.786 -	}
  58.787 -	return dir;
  58.788 -}
  58.789 -
  58.790 -static bool node_exists(struct connection *conn, const char *node)
  58.791 -{
  58.792 -	struct stat st;
  58.793 -
  58.794 -	return lstat(node_dir(conn->transaction, node), &st) == 0;
  58.795 +	/* OK, now remove destructors so they stay around */
  58.796 +	for (i = node; i; i = i->parent)
  58.797 +		talloc_set_destructor(i, NULL);
  58.798 +	return node;
  58.799  }
  58.800  
  58.801  /* path, data... */
  58.802  static void do_write(struct connection *conn, struct buffered_data *in)
  58.803  {
  58.804  	unsigned int offset, datalen;
  58.805 +	struct node *node;
  58.806  	char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
  58.807 -	char *node, *tmppath;
  58.808 +	char *name;
  58.809  
  58.810  	/* Extra "strings" can be created by binary data. */
  58.811  	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) {
  58.812 @@ -981,99 +847,115 @@ static void do_write(struct connection *
  58.813  		return;
  58.814  	}
  58.815  
  58.816 -	node = canonicalize(conn, vec[0]);
  58.817 -	if (!within_transaction(conn->transaction, node)) {
  58.818 -		send_error(conn, EROFS);
  58.819 -		return;
  58.820 -	}
  58.821 -
  58.822 -	if (transaction_block(conn, node))
  58.823 -		return;
  58.824 -
  58.825  	offset = strlen(vec[0]) + 1;
  58.826  	datalen = in->used - offset;
  58.827  
  58.828 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  58.829 -		send_error(conn, errno);
  58.830 -		return;
  58.831 +	name = canonicalize(conn, vec[0]);
  58.832 +	node = get_node(conn, name, XS_PERM_WRITE);
  58.833 +	if (!node) {
  58.834 +		/* No permissions, invalid input? */
  58.835 +		if (errno != ENOENT) {
  58.836 +			send_error(conn, errno);
  58.837 +			return;
  58.838 +		}
  58.839 +		node = create_node(conn, name, in->buffer + offset, datalen);
  58.840 +		if (!node) {
  58.841 +			send_error(conn, errno);
  58.842 +			return;
  58.843 +		}
  58.844 +	} else {
  58.845 +		node->data = in->buffer + offset;
  58.846 +		node->datalen = datalen;
  58.847 +		if (!write_node(conn, node)){
  58.848 +			send_error(conn, errno);
  58.849 +			return;
  58.850 +		}
  58.851  	}
  58.852  
  58.853 -	if (!node_exists(conn, node)) {
  58.854 -		char *dir;
  58.855 +	add_change_node(conn->transaction, name, false);
  58.856 +	fire_watches(conn, name, false);
  58.857 +	send_ack(conn, XS_WRITE);
  58.858 +}
  58.859  
  58.860 -		/* Does not exist... */
  58.861 +static void do_mkdir(struct connection *conn, const char *name)
  58.862 +{
  58.863 +	struct node *node;
  58.864 +
  58.865 +	name = canonicalize(conn, name);
  58.866 +	node = get_node(conn, name, XS_PERM_WRITE);
  58.867 +
  58.868 +	/* If it already exists, fine. */
  58.869 +	if (!node) {
  58.870 +		/* No permissions? */
  58.871  		if (errno != ENOENT) {
  58.872  			send_error(conn, errno);
  58.873  			return;
  58.874  		}
  58.875 -
  58.876 -		dir = tempdir(conn, node, in->buffer + offset, datalen);
  58.877 -		if (!dir || !commit_dir(dir)) {
  58.878 -			send_error(conn, errno);
  58.879 -			return;
  58.880 -		}
  58.881 -		
  58.882 -	} else {
  58.883 -		/* Exists... */
  58.884 -		tmppath = tempfile(node_datafile(conn->transaction, node),
  58.885 -				   in->buffer + offset, datalen);
  58.886 -		if (!tmppath) {
  58.887 +		node = create_node(conn, name, NULL, 0);
  58.888 +		if (!node) {
  58.889  			send_error(conn, errno);
  58.890  			return;
  58.891  		}
  58.892 -
  58.893 -		commit_tempfile(tmppath);
  58.894 -	}
  58.895 -
  58.896 -	add_change_node(conn->transaction, node, false);
  58.897 -	fire_watches(conn, node, false);
  58.898 -	send_ack(conn, XS_WRITE);
  58.899 -}
  58.900 -
  58.901 -static void do_mkdir(struct connection *conn, const char *node)
  58.902 -{
  58.903 -	char *dir;
  58.904 -
  58.905 -	node = canonicalize(conn, node);
  58.906 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  58.907 -		send_error(conn, errno);
  58.908 -		return;
  58.909 +		add_change_node(conn->transaction, name, false);
  58.910 +		fire_watches(conn, name, false);
  58.911  	}
  58.912 -
  58.913 -	if (!within_transaction(conn->transaction, node)) {
  58.914 -		send_error(conn, EROFS);
  58.915 -		return;
  58.916 -	}
  58.917 -
  58.918 -	if (transaction_block(conn, node))
  58.919 -		return;
  58.920 -
  58.921 -	/* If it already exists, fine. */
  58.922 -	if (node_exists(conn, node)) {
  58.923 -		send_ack(conn, XS_MKDIR);
  58.924 -		return;
  58.925 -	}
  58.926 -
  58.927 -	dir = tempdir(conn, node, NULL, 0);
  58.928 -	if (!dir || !commit_dir(dir)) {
  58.929 -		send_error(conn, errno);
  58.930 -		return;
  58.931 -	}
  58.932 -
  58.933 -	add_change_node(conn->transaction, node, false);
  58.934 -	fire_watches(conn, node, false);
  58.935  	send_ack(conn, XS_MKDIR);
  58.936  }
  58.937  
  58.938 -static void do_rm(struct connection *conn, const char *node)
  58.939 +static void delete_node(struct connection *conn, struct node *node)
  58.940 +{
  58.941 +	unsigned int i;
  58.942 +
  58.943 +	/* Delete self, then delete children.  If something goes wrong,
  58.944 +	 * consistency check will clean up this way. */
  58.945 +	delete_node_single(conn, node);
  58.946 +
  58.947 +	/* Delete children, too. */
  58.948 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  58.949 +		struct node *child;
  58.950 +
  58.951 +		child = read_node(conn, 
  58.952 +				  talloc_asprintf(node, "%s/%s", node->name,
  58.953 +						  node->children + i));
  58.954 +		if (!child)
  58.955 +			corrupt(conn, "No child '%s' found", child);
  58.956 +		delete_node(conn, child);
  58.957 +	}
  58.958 +}
  58.959 +
  58.960 +/* Delete memory using memmove. */
  58.961 +static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
  58.962  {
  58.963 -	char *tmppath, *path;
  58.964 +	memmove(mem + off, mem + off + len, total - off - len);
  58.965 +}
  58.966 +
  58.967 +static bool delete_child(struct connection *conn,
  58.968 +			 struct node *node, const char *childname)
  58.969 +{
  58.970 +	unsigned int i;
  58.971  
  58.972 -	node = canonicalize(conn, node);
  58.973 -	if (!check_node_perms(conn, node, XS_PERM_WRITE)) {
  58.974 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  58.975 +		if (streq(node->children+i, childname)) {
  58.976 +			memdel(node->children, i, strlen(childname) + 1,
  58.977 +			       node->childlen);
  58.978 +			node->childlen -= strlen(childname) + 1;
  58.979 +			return write_node(conn, node);
  58.980 +		}
  58.981 +	}
  58.982 +	corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
  58.983 +}
  58.984 +
  58.985 +static void do_rm(struct connection *conn, const char *name)
  58.986 +{
  58.987 +	struct node *node, *parent;
  58.988 +
  58.989 +	name = canonicalize(conn, name);
  58.990 +	node = get_node(conn, name, XS_PERM_WRITE);
  58.991 +	if (!node) {
  58.992  		/* Didn't exist already?  Fine, if parent exists. */
  58.993  		if (errno == ENOENT) {
  58.994 -			if (node_exists(conn, get_parent(node))) {
  58.995 +			node = read_node(conn, get_parent(name));
  58.996 +			if (node) {
  58.997  				send_ack(conn, XS_RM);
  58.998  				return;
  58.999  			}
 58.1000 @@ -1084,53 +966,43 @@ static void do_rm(struct connection *con
 58.1001  		return;
 58.1002  	}
 58.1003  
 58.1004 -	if (!within_transaction(conn->transaction, node)) {
 58.1005 -		send_error(conn, EROFS);
 58.1006 +	if (streq(name, "/")) {
 58.1007 +		send_error(conn, EINVAL);
 58.1008  		return;
 58.1009  	}
 58.1010  
 58.1011 -	if (transaction_block(conn, node))
 58.1012 -		return;
 58.1013 -
 58.1014 -	if (streq(node, "/")) {
 58.1015 +	/* Delete from parent first, then if something explodes fsck cleans. */
 58.1016 +	parent = read_node(conn, get_parent(name));
 58.1017 +	if (!parent) {
 58.1018  		send_error(conn, EINVAL);
 58.1019  		return;
 58.1020  	}
 58.1021  
 58.1022 -	/* We move the directory to temporary name, destructor cleans up. */
 58.1023 -	path = node_dir(conn->transaction, node);
 58.1024 -	tmppath = talloc_asprintf(node, "%s.tmp", path);
 58.1025 -	talloc_set_destructor(tmppath, destroy_path);
 58.1026 +	if (!delete_child(conn, parent, basename(name))) {
 58.1027 +		send_error(conn, EINVAL);
 58.1028 +		return;
 58.1029 +	}
 58.1030  
 58.1031 -	if (rename(path, tmppath) != 0) {
 58.1032 +	delete_node(conn, node);
 58.1033 +	add_change_node(conn->transaction, name, true);
 58.1034 +	fire_watches(conn, name, true);
 58.1035 +	send_ack(conn, XS_RM);
 58.1036 +}
 58.1037 +
 58.1038 +static void do_get_perms(struct connection *conn, const char *name)
 58.1039 +{
 58.1040 +	struct node *node;
 58.1041 +	char *strings;
 58.1042 +	unsigned int len;
 58.1043 +
 58.1044 +	name = canonicalize(conn, name);
 58.1045 +	node = get_node(conn, name, XS_PERM_READ);
 58.1046 +	if (!node) {
 58.1047  		send_error(conn, errno);
 58.1048  		return;
 58.1049  	}
 58.1050  
 58.1051 -	add_change_node(conn->transaction, node, true);
 58.1052 -	fire_watches(conn, node, true);
 58.1053 -	send_ack(conn, XS_RM);
 58.1054 -}
 58.1055 -
 58.1056 -static void do_get_perms(struct connection *conn, const char *node)
 58.1057 -{
 58.1058 -	struct xs_permissions *perms;
 58.1059 -	char *strings;
 58.1060 -	unsigned int len, num;
 58.1061 -
 58.1062 -	node = canonicalize(conn, node);
 58.1063 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
 58.1064 -		send_error(conn, errno);
 58.1065 -		return;
 58.1066 -	}
 58.1067 -
 58.1068 -	perms = get_perms(node_dir(conn->transaction, node), &num);
 58.1069 -	if (!perms) {
 58.1070 -		send_error(conn, errno);
 58.1071 -		return;
 58.1072 -	}
 58.1073 -
 58.1074 -	strings = perms_to_strings(node, perms, num, &len);
 58.1075 +	strings = perms_to_strings(node, node->perms, node->num_perms, &len);
 58.1076  	if (!strings)
 58.1077  		send_error(conn, errno);
 58.1078  	else
 58.1079 @@ -1140,8 +1012,8 @@ static void do_get_perms(struct connecti
 58.1080  static void do_set_perms(struct connection *conn, struct buffered_data *in)
 58.1081  {
 58.1082  	unsigned int num;
 58.1083 -	char *node, *permstr;
 58.1084 -	struct xs_permissions *perms;
 58.1085 +	char *name, *permstr;
 58.1086 +	struct node *node;
 58.1087  
 58.1088  	num = xs_count_strings(in->buffer, in->used);
 58.1089  	if (num < 2) {
 58.1090 @@ -1150,37 +1022,30 @@ static void do_set_perms(struct connecti
 58.1091  	}
 58.1092  
 58.1093  	/* First arg is node name. */
 58.1094 -	node = canonicalize(conn, in->buffer);
 58.1095 +	name = canonicalize(conn, in->buffer);
 58.1096  	permstr = in->buffer + strlen(in->buffer) + 1;
 58.1097  	num--;
 58.1098  
 58.1099 -	if (!within_transaction(conn->transaction, node)) {
 58.1100 -		send_error(conn, EROFS);
 58.1101 -		return;
 58.1102 -	}
 58.1103 -
 58.1104 -	if (transaction_block(conn, node))
 58.1105 -		return;
 58.1106 -
 58.1107  	/* We must own node to do this (tools can do this too). */
 58.1108 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER)) {
 58.1109 +	node = get_node(conn, name, XS_PERM_WRITE|XS_PERM_OWNER);
 58.1110 +	if (!node) {
 58.1111  		send_error(conn, errno);
 58.1112  		return;
 58.1113  	}
 58.1114  
 58.1115 -	perms = talloc_array(node, struct xs_permissions, num);
 58.1116 -	if (!xs_strings_to_perms(perms, num, permstr)) {
 58.1117 +	node->perms = talloc_array(node, struct xs_permissions, num);
 58.1118 +	node->num_perms = num;
 58.1119 +	if (!xs_strings_to_perms(node->perms, num, permstr)) {
 58.1120 +		send_error(conn, errno);
 58.1121 +		return;
 58.1122 +	}
 58.1123 +	if (!write_node(conn, node)) {
 58.1124  		send_error(conn, errno);
 58.1125  		return;
 58.1126  	}
 58.1127  
 58.1128 -	if (!set_perms(conn->transaction, node, perms, num)) {
 58.1129 -		send_error(conn, errno);
 58.1130 -		return;
 58.1131 -	}
 58.1132 -
 58.1133 -	add_change_node(conn->transaction, node, false);
 58.1134 -	fire_watches(conn, node, false);
 58.1135 +	add_change_node(conn->transaction, name, false);
 58.1136 +	fire_watches(conn, name, false);
 58.1137  	send_ack(conn, XS_SET_PERMS);
 58.1138  }
 58.1139  
 58.1140 @@ -1221,14 +1086,10 @@ static void process_message(struct conne
 58.1141  	case XS_SHUTDOWN:
 58.1142  		/* FIXME: Implement gentle shutdown too. */
 58.1143  		/* Only tools can do this. */
 58.1144 -		if (conn->id != 0) {
 58.1145 +		if (conn->id != 0 || !conn->can_write) {
 58.1146  			send_error(conn, EACCES);
 58.1147  			break;
 58.1148  		}
 58.1149 -		if (!conn->can_write) {
 58.1150 -			send_error(conn, EROFS);
 58.1151 -			break;
 58.1152 -		}
 58.1153  		send_ack(conn, XS_SHUTDOWN);
 58.1154  		/* Everything hangs off auto-free context, freed at exit. */
 58.1155  		exit(0);
 58.1156 @@ -1263,7 +1124,7 @@ static void process_message(struct conne
 58.1157  		break;
 58.1158  
 58.1159  	case XS_TRANSACTION_START:
 58.1160 -		do_transaction_start(conn, onearg(in));
 58.1161 +		do_transaction_start(conn, in);
 58.1162  		break;
 58.1163  
 58.1164  	case XS_TRANSACTION_END:
 58.1165 @@ -1309,6 +1170,8 @@ static void consider_message(struct conn
 58.1166  	/* For simplicity, we kill the connection on OOM. */
 58.1167  	talloc_set_fail_handler(out_of_mem, &talloc_fail);
 58.1168  	if (setjmp(talloc_fail)) {
 58.1169 +		/* Free in before conn, in case it needs something. */
 58.1170 +		talloc_free(in);
 58.1171  		talloc_free(conn);
 58.1172  		goto end;
 58.1173  	}
 58.1174 @@ -1330,16 +1193,8 @@ static void consider_message(struct conn
 58.1175  	conn->in = new_buffer(conn);
 58.1176  	process_message(conn, in);
 58.1177  
 58.1178 -	if (conn->state == BLOCKED) {
 58.1179 -		/* Blocked by transaction: queue for re-xmit. */
 58.1180 -		talloc_free(conn->in);
 58.1181 -		conn->in = in;
 58.1182 -		in = NULL;
 58.1183 -		trace_blocked(conn, conn->in);
 58.1184 -	}
 58.1185 -
 58.1186 +	talloc_free(in);
 58.1187  end:
 58.1188 -	talloc_free(in);
 58.1189  	talloc_set_fail_handler(NULL, NULL);
 58.1190  	if (talloc_total_blocks(NULL)
 58.1191  	    != talloc_total_blocks(talloc_autofree_context()) + 1) {
 58.1192 @@ -1350,7 +1205,7 @@ end:
 58.1193  
 58.1194  /* Errors in reading or allocating here mean we get out of sync, so we
 58.1195   * drop the whole client connection. */
 58.1196 -void handle_input(struct connection *conn)
 58.1197 +static void handle_input(struct connection *conn)
 58.1198  {
 58.1199  	int bytes;
 58.1200  	struct buffered_data *in;
 58.1201 @@ -1402,41 +1257,12 @@ bad_client:
 58.1202  	talloc_free(conn);
 58.1203  }
 58.1204  
 58.1205 -void handle_output(struct connection *conn)
 58.1206 +static void handle_output(struct connection *conn)
 58.1207  {
 58.1208  	if (!write_message(conn))
 58.1209  		talloc_free(conn);
 58.1210  }
 58.1211  
 58.1212 -/* If a transaction has ended, see if we can unblock any connections. */
 58.1213 -static void unblock_connections(void)
 58.1214 -{
 58.1215 -	struct connection *i, *tmp;
 58.1216 -
 58.1217 -	list_for_each_entry_safe(i, tmp, &connections, list) {
 58.1218 -		switch (i->state) {
 58.1219 -		case BLOCKED:
 58.1220 -			if (!transaction_covering_node(i->blocked_by)) {
 58.1221 -				talloc_free(i->blocked_by);
 58.1222 -				i->blocked_by = NULL;
 58.1223 -				i->state = OK;
 58.1224 -				consider_message(i);
 58.1225 -			}
 58.1226 -			break;
 58.1227 -		case BUSY:
 58.1228 -		case OK:
 58.1229 -			break;
 58.1230 -		}
 58.1231 -	}
 58.1232 -
 58.1233 -	/* To balance bias, move first entry to end. */
 58.1234 -	if (!list_empty(&connections)) {
 58.1235 -		i = list_top(&connections, struct connection, list);
 58.1236 -		list_del(&i->list);
 58.1237 -		list_add_tail(&i->list, &connections);
 58.1238 -	}
 58.1239 -}
 58.1240 -
 58.1241  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
 58.1242  {
 58.1243  	/*
 58.1244 @@ -1451,7 +1277,6 @@ struct connection *new_connection(connwr
 58.1245  		return NULL;
 58.1246  
 58.1247  	new->state = OK;
 58.1248 -	new->blocked_by = NULL;
 58.1249  	new->out = new->waiting_reply = NULL;
 58.1250  	new->waiting_for_ack = NULL;
 58.1251  	new->fd = -1;
 58.1252 @@ -1504,25 +1329,9 @@ static void accept_connection(int sock, 
 58.1253  		close(fd);
 58.1254  }
 58.1255  
 58.1256 -/* Calc timespan from now to absolute time. */
 58.1257 -static void time_relative_to_now(struct timeval *tv)
 58.1258 -{
 58.1259 -	struct timeval now;
 58.1260 -
 58.1261 -	gettimeofday(&now, NULL);
 58.1262 -	if (timercmp(&now, tv, >))
 58.1263 -		timerclear(tv);
 58.1264 -	else {
 58.1265 -		tv->tv_sec -= now.tv_sec;
 58.1266 -		if (now.tv_usec > tv->tv_usec) {
 58.1267 -			tv->tv_sec--;
 58.1268 -			tv->tv_usec += 1000000;
 58.1269 -		}
 58.1270 -		tv->tv_usec -= now.tv_usec;
 58.1271 -	}
 58.1272 -}
 58.1273 -
 58.1274  #ifdef TESTING
 58.1275 +/* Valgrind can check our writes better if we don't use mmap */
 58.1276 +#define TDB_FLAGS TDB_NOMMAP
 58.1277  /* Useful for running under debugger. */
 58.1278  void dump_connection(void)
 58.1279  {
 58.1280 @@ -1532,13 +1341,10 @@ void dump_connection(void)
 58.1281  		printf("Connection %p:\n", i);
 58.1282  		printf("    state = %s\n",
 58.1283  		       i->state == OK ? "OK"
 58.1284 -		       : i->state == BLOCKED ? "BLOCKED"
 58.1285  		       : i->state == BUSY ? "BUSY"
 58.1286  		       : "INVALID");
 58.1287  		if (i->id)
 58.1288  			printf("    id = %i\n", i->id);
 58.1289 -		if (i->blocked_by)
 58.1290 -			printf("    blocked on = %s\n", i->blocked_by);
 58.1291  		if (!i->in->inhdr || i->in->used)
 58.1292  			printf("    got %i bytes of %s\n",
 58.1293  			       i->in->used, i->in->inhdr ? "header" : "data");
 58.1294 @@ -1559,44 +1365,53 @@ void dump_connection(void)
 58.1295  		dump_watches(i);
 58.1296  	}
 58.1297  }
 58.1298 +#else
 58.1299 +#define TDB_FLAGS 0
 58.1300  #endif
 58.1301  
 58.1302 +/* We create initial nodes manually. */
 58.1303 +static void manual_node(const char *name, const char *child)
 58.1304 +{
 58.1305 +	struct node *node;
 58.1306 +	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 58.1307 +
 58.1308 +	node = talloc(NULL, struct node);
 58.1309 +	node->name = name;
 58.1310 +	node->perms = &perms;
 58.1311 +	node->num_perms = 1;
 58.1312 +	node->data = NULL;
 58.1313 +	node->datalen = 0;
 58.1314 +	node->children = (char *)child;
 58.1315 +	if (child)
 58.1316 +		node->childlen = strlen(child) + 1;
 58.1317 +	else
 58.1318 +		node->childlen = 0;
 58.1319 +
 58.1320 +	if (!write_node(NULL, node))
 58.1321 +		barf_perror("Could not create initial node %s", name);
 58.1322 +	talloc_free(node);
 58.1323 +}
 58.1324 +
 58.1325 +#
 58.1326 +
 58.1327  static void setup_structure(void)
 58.1328  {
 58.1329 -	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 58.1330 -	char *root, *dir, *permfile;
 58.1331 -
 58.1332 -	/* Create root directory, with permissions. */
 58.1333 -	if (mkdir(xs_daemon_store(), 0750) != 0) {
 58.1334 -		if (errno != EEXIST)
 58.1335 -			barf_perror("Could not create root %s",
 58.1336 -				    xs_daemon_store());
 58.1337 -		return;
 58.1338 -	}
 58.1339 -	root = talloc_strdup(talloc_autofree_context(), "/");
 58.1340 -	if (!set_perms(NULL, root, &perms, 1))
 58.1341 -		barf_perror("Could not create permissions in root");
 58.1342 +	char *tdbname;
 58.1343 +	tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
 58.1344 +	tdb_ctx = tdb_open(tdbname, 0, TDB_FLAGS, O_RDWR, 0);
 58.1345  
 58.1346 -	/* Create tool directory, with xenstored subdir. */
 58.1347 -	dir = talloc_asprintf(root, "%s/%s", xs_daemon_store(), "tool");
 58.1348 -	if (mkdir(dir, 0750) != 0)
 58.1349 -		barf_perror("Making dir %s", dir);
 58.1350 -	
 58.1351 -	permfile = talloc_strdup(root, "/tool");
 58.1352 -	if (!set_perms(NULL, permfile, &perms, 1))
 58.1353 -		barf_perror("Could not create permissions on %s", permfile);
 58.1354 +	if (!tdb_ctx) {
 58.1355 +		tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
 58.1356 +				   0640);
 58.1357 +		if (!tdb_ctx)
 58.1358 +			barf_perror("Could not create tdb file %s", tdbname);
 58.1359  
 58.1360 -	dir = talloc_asprintf(root, "%s/%s", dir, "xenstored");
 58.1361 -	if (mkdir(dir, 0750) != 0)
 58.1362 -		barf_perror("Making dir %s", dir);
 58.1363 -	
 58.1364 -	permfile = talloc_strdup(root, "/tool/xenstored");
 58.1365 -	if (!set_perms(NULL, permfile, &perms, 1))
 58.1366 -		barf_perror("Could not create permissions on %s", permfile);
 58.1367 -	talloc_free(root);
 58.1368 -	if (mkdir(xs_daemon_transactions(), 0750) != 0)
 58.1369 -		barf_perror("Could not create transaction dir %s",
 58.1370 -			    xs_daemon_transactions());
 58.1371 +		manual_node("/", "tool");
 58.1372 +		manual_node("/tool", "xenstored");
 58.1373 +		manual_node("/tool/xenstored", NULL);
 58.1374 +	}
 58.1375 +
 58.1376 +	/* FIXME: Fsck */
 58.1377  }
 58.1378  
 58.1379  static void write_pidfile(const char *pidfile)
 58.1380 @@ -1759,17 +1574,8 @@ int main(int argc, char *argv[])
 58.1381  	/* FIXME: Rewrite so noone can starve. */
 58.1382  	for (;;) {
 58.1383  		struct connection *i;
 58.1384 -		struct timeval *tvp = NULL, tv;
 58.1385  
 58.1386 -		timerclear(&tv);
 58.1387 -		shortest_transaction_timeout(&tv);
 58.1388 -		shortest_watch_ack_timeout(&tv);
 58.1389 -		if (timerisset(&tv)) {
 58.1390 -			time_relative_to_now(&tv);
 58.1391 -			tvp = &tv;
 58.1392 -		}
 58.1393 -
 58.1394 -		if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
 58.1395 +		if (select(max+1, &inset, &outset, NULL, NULL) < 0) {
 58.1396  			if (errno == EINTR)
 58.1397  				continue;
 58.1398  			barf_perror("Select failed");
 58.1399 @@ -1818,14 +1624,6 @@ int main(int argc, char *argv[])
 58.1400  			}
 58.1401  		}
 58.1402  
 58.1403 -		if (tvp) {
 58.1404 -			check_transaction_timeout();
 58.1405 -			check_watch_ack_timeout();
 58.1406 -		}
 58.1407 -
 58.1408 -		/* If transactions ended, we might be able to do more work. */
 58.1409 -		unblock_connections();
 58.1410 -
 58.1411  		max = initialize_set(&inset, &outset, *sock, *ro_sock,
 58.1412  				     event_fd);
 58.1413  	}
    59.1 --- a/tools/xenstore/xenstored_core.h	Thu Sep 29 13:35:13 2005 -0600
    59.2 +++ b/tools/xenstore/xenstored_core.h	Thu Sep 29 16:22:02 2005 -0600
    59.3 @@ -28,6 +28,7 @@
    59.4  #include "xs_lib.h"
    59.5  #include "xenstored.h"
    59.6  #include "list.h"
    59.7 +#include "tdb.h"
    59.8  
    59.9  struct buffered_data
   59.10  {
   59.11 @@ -49,8 +50,6 @@ typedef int connreadfn_t(struct connecti
   59.12  
   59.13  enum state
   59.14  {
   59.15 -	/* Blocked by transaction. */
   59.16 -	BLOCKED,
   59.17  	/* Doing action, not listening */
   59.18  	BUSY,
   59.19  	/* Completed */
   59.20 @@ -70,9 +69,6 @@ struct connection
   59.21  	/* Blocked on transaction?  Busy? */
   59.22  	enum state state;
   59.23  
   59.24 -	/* Node we are waiting for (if state == BLOCKED) */
   59.25 -	char *blocked_by;
   59.26 -
   59.27  	/* Is this a read-only connection? */
   59.28  	bool can_write;
   59.29  
   59.30 @@ -103,9 +99,27 @@ struct connection
   59.31  };
   59.32  extern struct list_head connections;
   59.33  
   59.34 -/* Return length of string (including nul) at this offset. */
   59.35 -unsigned int get_string(const struct buffered_data *data,
   59.36 -			unsigned int offset);
   59.37 +struct node {
   59.38 +	const char *name;
   59.39 +
   59.40 +	/* Database I came from */
   59.41 +	TDB_CONTEXT *tdb;
   59.42 +
   59.43 +	/* Parent (optional) */
   59.44 +	struct node *parent;
   59.45 +
   59.46 +	/* Permissions. */
   59.47 +	unsigned int num_perms;
   59.48 +	struct xs_permissions *perms;
   59.49 +
   59.50 +	/* Contents. */
   59.51 +	unsigned int datalen;
   59.52 +	void *data;
   59.53 +
   59.54 +	/* Children, each nul-terminated. */
   59.55 +	unsigned int childlen;
   59.56 +	char *children;
   59.57 +};
   59.58  
   59.59  /* Break input into vectors, return the number, fill in up to num of them. */
   59.60  unsigned int get_strings(struct buffered_data *data,
   59.61 @@ -114,9 +128,6 @@ unsigned int get_strings(struct buffered
   59.62  /* Is child node a child or equal to parent node? */
   59.63  bool is_child(const char *child, const char *parent);
   59.64  
   59.65 -/* Create a new buffer with lifetime of context. */
   59.66 -struct buffered_data *new_buffer(void *ctx);
   59.67 -
   59.68  void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
   59.69  		const void *data, unsigned int len);
   59.70  
   59.71 @@ -129,15 +140,22 @@ void send_error(struct connection *conn,
   59.72  /* Canonicalize this path if possible. */
   59.73  char *canonicalize(struct connection *conn, const char *node);
   59.74  
   59.75 -/* Check permissions on this node. */
   59.76 -bool check_node_perms(struct connection *conn, const char *node,
   59.77 -		      enum xs_perm_type perm);
   59.78 -
   59.79  /* Check if node is an event node. */
   59.80  bool check_event_node(const char *node);
   59.81  
   59.82 -/* Path to this node outside transaction. */
   59.83 -char *node_dir_outside_transaction(const char *node);
   59.84 +/* Get this node, checking we have permissions. */
   59.85 +struct node *get_node(struct connection *conn,
   59.86 +		      const char *name,
   59.87 +		      enum xs_perm_type perm);
   59.88 +
   59.89 +/* Get TDB context for this connection */
   59.90 +TDB_CONTEXT *tdb_context(struct connection *conn);
   59.91 +
   59.92 +/* Destructor for tdbs: required for transaction code */
   59.93 +int destroy_tdb(void *_tdb);
   59.94 +
   59.95 +/* Replace the tdb: required for transaction code */
   59.96 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
   59.97  
   59.98  /* Fail due to excessive corruption, capitalist pigdogs! */
   59.99  void __attribute__((noreturn)) corrupt(struct connection *conn,
  59.100 @@ -145,23 +163,9 @@ void __attribute__((noreturn)) corrupt(s
  59.101  
  59.102  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
  59.103  
  59.104 -void handle_input(struct connection *conn);
  59.105 -void handle_output(struct connection *conn);
  59.106 -
  59.107  /* Is this a valid node name? */
  59.108  bool is_valid_nodename(const char *node);
  59.109  
  59.110 -/* Return a pointer to an open dir, self-closig and attached to pathname. */
  59.111 -DIR **talloc_opendir(const char *pathname);
  59.112 -
  59.113 -/* Return a pointer to an fd, self-closing and attached to this pathname. */
  59.114 -int *talloc_open(const char *pathname, int flags, int mode);
  59.115 -
  59.116 -/* Convenient talloc-style destructor for paths. */
  59.117 -int destroy_path(void *path);
  59.118 -
  59.119 -/* Read entire contents of a talloced fd. */
  59.120 -void *read_all(int *fd, unsigned int *size);
  59.121  
  59.122  /* Tracing infrastructure. */
  59.123  void trace_create(const void *data, const char *type);
    60.1 --- a/tools/xenstore/xenstored_domain.c	Thu Sep 29 13:35:13 2005 -0600
    60.2 +++ b/tools/xenstore/xenstored_domain.c	Thu Sep 29 16:22:02 2005 -0600
    60.3 @@ -309,16 +309,11 @@ void do_introduce(struct connection *con
    60.4  		return;
    60.5  	}
    60.6  
    60.7 -	if (conn->id != 0) {
    60.8 +	if (conn->id != 0 || !conn->can_write) {
    60.9  		send_error(conn, EACCES);
   60.10  		return;
   60.11  	}
   60.12  
   60.13 -	if (!conn->can_write) {
   60.14 -		send_error(conn, EROFS);
   60.15 -		return;
   60.16 -	}
   60.17 -
   60.18  	/* Sanity check args. */
   60.19  	if ((atoi(vec[2]) <= 0) || !is_valid_nodename(vec[3])) {
   60.20  		send_error(conn, EINVAL);
   60.21 @@ -386,7 +381,7 @@ void do_release(struct connection *conn,
   60.22  
   60.23  	talloc_free(domain->conn);
   60.24  
   60.25 -	fire_watches(NULL, "@releaseDomain", false);
   60.26 +	fire_watches(conn, "@releaseDomain", false);
   60.27  
   60.28  	send_ack(conn, XS_RELEASE);
   60.29  }
    61.1 --- a/tools/xenstore/xenstored_transaction.c	Thu Sep 29 13:35:13 2005 -0600
    61.2 +++ b/tools/xenstore/xenstored_transaction.c	Thu Sep 29 16:22:02 2005 -0600
    61.3 @@ -26,6 +26,7 @@
    61.4  #include <stdarg.h>
    61.5  #include <stdlib.h>
    61.6  #include <fcntl.h>
    61.7 +#include <unistd.h>
    61.8  #include "talloc.h"
    61.9  #include "list.h"
   61.10  #include "xenstored_transaction.h"
   61.11 @@ -51,74 +52,26 @@ struct transaction
   61.12  	/* Global list of transactions. */
   61.13  	struct list_head list;
   61.14  
   61.15 +	/* Generation when transaction started. */
   61.16 +	unsigned int generation;
   61.17 +
   61.18  	/* My owner (conn->transaction == me). */
   61.19  	struct connection *conn;
   61.20  
   61.21 -	/* Subtree this transaction covers */
   61.22 -	char *node;
   61.23 -
   61.24 -	/* Base for this transaction. */
   61.25 -	char *divert;
   61.26 +	/* TDB to work on, and filename */
   61.27 +	TDB_CONTEXT *tdb;
   61.28 +	char *tdb_name;
   61.29  
   61.30  	/* List of changed nodes. */
   61.31  	struct list_head changes;
   61.32 -
   61.33 -	/* Someone's waiting: time limit. */
   61.34 -	struct timeval timeout;
   61.35 -
   61.36 -	/* We've timed out. */
   61.37 -	bool destined_to_fail;
   61.38  };
   61.39  static LIST_HEAD(transactions);
   61.40 -
   61.41 -bool within_transaction(struct transaction *trans, const char *node)
   61.42 -{
   61.43 -	if (!trans)
   61.44 -		return true;
   61.45 -	return is_child(node, trans->node);
   61.46 -}
   61.47 -
   61.48 -/* You are on notice: this transaction is blocking someone. */
   61.49 -static void start_transaction_timeout(struct transaction *trans)
   61.50 -{
   61.51 -	if (timerisset(&trans->timeout))
   61.52 -		return;
   61.53 -
   61.54 -	/* One second timeout. */
   61.55 -	gettimeofday(&trans->timeout, NULL);
   61.56 -	trans->timeout.tv_sec += 1;
   61.57 -}
   61.58 -
   61.59 -struct transaction *transaction_covering_node(const char *node)
   61.60 -{
   61.61 -	struct transaction *i;
   61.62 +static unsigned int generation;
   61.63  
   61.64 -	list_for_each_entry(i, &transactions, list) {
   61.65 -		if (i->destined_to_fail)
   61.66 -			continue;
   61.67 -		if (is_child(i->node, node) || is_child(node, i->node))
   61.68 -			return i;
   61.69 -	}
   61.70 -	return NULL;
   61.71 -}
   61.72 -
   61.73 -bool transaction_block(struct connection *conn, const char *node)
   61.74 +/* Return tdb context to use for this connection. */
   61.75 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans)
   61.76  {
   61.77 -	struct transaction *trans;
   61.78 -
   61.79 -	/* Transactions don't overlap, so we can't be blocked by
   61.80 -	 * others if we're in one. */
   61.81 -	if (conn->transaction)
   61.82 -		return false;
   61.83 -
   61.84 -	trans = transaction_covering_node(node);
   61.85 -	if (trans) {
   61.86 -		start_transaction_timeout(trans);
   61.87 -		conn->state = BLOCKED;
   61.88 -		conn->blocked_by = talloc_strdup(conn, node);
   61.89 -		return true;
   61.90 -	}
   61.91 -	return false;
   61.92 +	return trans->tdb;
   61.93  }
   61.94  
   61.95  /* Callers get a change node (which can fail) and only commit after they've
   61.96 @@ -127,8 +80,11 @@ void add_change_node(struct transaction 
   61.97  {
   61.98  	struct changed_node *i;
   61.99  
  61.100 -	if (!trans)
  61.101 +	if (!trans) {
  61.102 +		/* They're changing the global database. */
  61.103 +		generation++;
  61.104  		return;
  61.105 +	}
  61.106  
  61.107  	list_for_each_entry(i, &trans->changes, list)
  61.108  		if (streq(i->node, node))
  61.109 @@ -140,167 +96,47 @@ void add_change_node(struct transaction 
  61.110  	list_add_tail(&i->list, &trans->changes);
  61.111  }
  61.112  
  61.113 -char *node_dir_inside_transaction(struct transaction *trans, const char *node)
  61.114 -{
  61.115 -	return talloc_asprintf(node, "%s/%s", trans->divert,
  61.116 -			       node + strlen(trans->node));
  61.117 -}
  61.118 -
  61.119 -void shortest_transaction_timeout(struct timeval *tv)
  61.120 -{
  61.121 -	struct transaction *i;
  61.122 -
  61.123 -	list_for_each_entry(i, &transactions, list) {
  61.124 -		if (!timerisset(&i->timeout))
  61.125 -			continue;
  61.126 -
  61.127 -		if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
  61.128 -			*tv = i->timeout;
  61.129 -	}
  61.130 -}	
  61.131 -
  61.132 -void check_transaction_timeout(void)
  61.133 -{
  61.134 -	struct transaction *i;
  61.135 -	struct timeval now;
  61.136 -
  61.137 -	gettimeofday(&now, NULL);
  61.138 -
  61.139 -	list_for_each_entry(i, &transactions, list) {
  61.140 -		if (!timerisset(&i->timeout))
  61.141 -			continue;
  61.142 -
  61.143 -		if (timercmp(&i->timeout, &now, <))
  61.144 -			i->destined_to_fail = true;
  61.145 -	}
  61.146 -}
  61.147 -
  61.148  static int destroy_transaction(void *_transaction)
  61.149  {
  61.150  	struct transaction *trans = _transaction;
  61.151  
  61.152  	list_del(&trans->list);
  61.153  	trace_destroy(trans, "transaction");
  61.154 -	return destroy_path(trans->divert);
  61.155 -}
  61.156 -
  61.157 -static bool copy_file(const char *src, const char *dst)
  61.158 -{
  61.159 -	int *infd, *outfd;
  61.160 -	void *data;
  61.161 -	unsigned int size;
  61.162 -
  61.163 -	infd = talloc_open(src, O_RDONLY, 0);
  61.164 -	if (!infd)
  61.165 -		return false;
  61.166 -	outfd = talloc_open(dst, O_WRONLY|O_CREAT|O_EXCL, 0640);
  61.167 -	if (!outfd)
  61.168 -		return false;
  61.169 -	data = read_all(infd, &size);
  61.170 -	if (!data)
  61.171 -		return false;
  61.172 -	return xs_write_all(*outfd, data, size);
  61.173 +	if (trans->tdb)
  61.174 +		tdb_close(trans->tdb);
  61.175 +	unlink(trans->tdb_name);
  61.176 +	return 0;
  61.177  }
  61.178  
  61.179 -static bool copy_dir(const char *src, const char *dst)
  61.180 +void do_transaction_start(struct connection *conn, struct buffered_data *in)
  61.181  {
  61.182 -	DIR **dir;
  61.183 -	struct dirent *dirent;
  61.184 -
  61.185 -	if (mkdir(dst, 0750) != 0)
  61.186 -		return false;
  61.187 -
  61.188 -	dir = talloc_opendir(src);
  61.189 -	if (!dir)
  61.190 -		return false;
  61.191 -
  61.192 -	while ((dirent = readdir(*dir)) != NULL) {
  61.193 -		struct stat st;
  61.194 -		char *newsrc, *newdst;
  61.195 -
  61.196 -		if (streq(dirent->d_name, ".") || streq(dirent->d_name, ".."))
  61.197 -			continue;
  61.198 -
  61.199 -		newsrc = talloc_asprintf(src, "%s/%s", src, dirent->d_name);
  61.200 -		newdst = talloc_asprintf(src, "%s/%s", dst, dirent->d_name);
  61.201 -		if (stat(newsrc, &st) != 0)
  61.202 -			return false;
  61.203 -		
  61.204 -		if (S_ISDIR(st.st_mode)) {
  61.205 -			if (!copy_dir(newsrc, newdst))
  61.206 -				return false;
  61.207 -		} else {
  61.208 -			if (!copy_file(newsrc, newdst))
  61.209 -				return false;
  61.210 -		}
  61.211 -		/* Free now so we don't run out of file descriptors */
  61.212 -		talloc_free(newsrc);
  61.213 -		talloc_free(newdst);
  61.214 -	}
  61.215 -	return true;
  61.216 -}
  61.217 -
  61.218 -void do_transaction_start(struct connection *conn, const char *node)
  61.219 -{
  61.220 -	struct transaction *transaction;
  61.221 -	char *dir;
  61.222 +	struct transaction *trans;
  61.223  
  61.224  	if (conn->transaction) {
  61.225  		send_error(conn, EBUSY);
  61.226  		return;
  61.227  	}
  61.228  
  61.229 -	node = canonicalize(conn, node);
  61.230 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  61.231 +	/* Attach transaction to input for autofree until it's complete */
  61.232 +	trans = talloc(in, struct transaction);
  61.233 +	INIT_LIST_HEAD(&trans->changes);
  61.234 +	trans->conn = conn;
  61.235 +	trans->generation = generation;
  61.236 +	trans->tdb_name = talloc_asprintf(trans, "%s.%p",
  61.237 +					  xs_daemon_tdb(), trans);
  61.238 +	trans->tdb = tdb_copy(tdb_context(conn), trans->tdb_name);
  61.239 +	if (!trans->tdb) {
  61.240  		send_error(conn, errno);
  61.241  		return;
  61.242  	}
  61.243 -
  61.244 -	if (transaction_block(conn, node))
  61.245 -		return;
  61.246 -
  61.247 -	dir = node_dir_outside_transaction(node);
  61.248 -
  61.249 -	/* Attach transaction to node for autofree until it's complete */
  61.250 -	transaction = talloc(node, struct transaction);
  61.251 -	transaction->node = talloc_strdup(transaction, node);
  61.252 -	transaction->divert = talloc_asprintf(transaction, "%s/%p", 
  61.253 -					      xs_daemon_transactions(),
  61.254 -					      transaction);
  61.255 -	INIT_LIST_HEAD(&transaction->changes);
  61.256 -	transaction->conn = conn;
  61.257 -	timerclear(&transaction->timeout);
  61.258 -	transaction->destined_to_fail = false;
  61.259 -	list_add_tail(&transaction->list, &transactions);
  61.260 -	talloc_set_destructor(transaction, destroy_transaction);
  61.261 -	trace_create(transaction, "transaction");
  61.262 +	/* Make it close if we go away. */
  61.263 +	talloc_steal(trans, trans->tdb);
  61.264  
  61.265 -	if (!copy_dir(dir, transaction->divert)) {
  61.266 -		send_error(conn, errno);
  61.267 -		return;
  61.268 -	}
  61.269 -
  61.270 -	talloc_steal(conn, transaction);
  61.271 -	conn->transaction = transaction;
  61.272 -	send_ack(transaction->conn, XS_TRANSACTION_START);
  61.273 -}
  61.274 -
  61.275 -static bool commit_transaction(struct transaction *trans)
  61.276 -{
  61.277 -	char *tmp, *dir;
  61.278 -
  61.279 -	/* Move: orig -> .old, repl -> orig.  Cleanup deletes .old. */
  61.280 -	dir = node_dir_outside_transaction(trans->node);
  61.281 -	tmp = talloc_asprintf(trans, "%s.old", dir);
  61.282 -
  61.283 -	if (rename(dir, tmp) != 0)
  61.284 -		return false;
  61.285 -	if (rename(trans->divert, dir) != 0)
  61.286 -		corrupt(trans->conn, "Failed rename %s to %s",
  61.287 -			trans->divert, dir);
  61.288 -
  61.289 -	trans->divert = tmp;
  61.290 -	return true;
  61.291 +	/* Now we own it. */
  61.292 +	conn->transaction = talloc_steal(conn, trans);
  61.293 +	list_add_tail(&trans->list, &transactions);
  61.294 +	talloc_set_destructor(trans, destroy_transaction);
  61.295 +	send_ack(conn, XS_TRANSACTION_START);
  61.296  }
  61.297  
  61.298  void do_transaction_end(struct connection *conn, const char *arg)
  61.299 @@ -318,25 +154,29 @@ void do_transaction_end(struct connectio
  61.300  		return;
  61.301  	}
  61.302  
  61.303 -	/* Set to NULL so fire_watches sends events. */
  61.304 +	/* Set to NULL so fire_watches sends events, tdb_context works. */
  61.305  	trans = conn->transaction;
  61.306  	conn->transaction = NULL;
  61.307  	/* Attach transaction to arg for auto-cleanup */
  61.308  	talloc_steal(arg, trans);
  61.309  
  61.310  	if (streq(arg, "T")) {
  61.311 -		if (trans->destined_to_fail) {
  61.312 -			send_error(conn, ETIMEDOUT);
  61.313 +		/* FIXME: Merge, rather failing on any change. */
  61.314 +		if (trans->generation != generation) {
  61.315 +			send_error(conn, EAGAIN);
  61.316  			return;
  61.317  		}
  61.318 -		if (!commit_transaction(trans)) {
  61.319 +		if (!replace_tdb(trans->tdb_name, trans->tdb)) {
  61.320  			send_error(conn, errno);
  61.321  			return;
  61.322  		}
  61.323 +		/* Don't close this: we won! */
  61.324 +		trans->tdb = NULL;
  61.325  
  61.326  		/* Fire off the watches for everything that changed. */
  61.327  		list_for_each_entry(i, &trans->changes, list)
  61.328  			fire_watches(conn, i->node, i->recurse);
  61.329 +		generation++;
  61.330  	}
  61.331  	send_ack(conn, XS_TRANSACTION_END);
  61.332  }
    62.1 --- a/tools/xenstore/xenstored_transaction.h	Thu Sep 29 13:35:13 2005 -0600
    62.2 +++ b/tools/xenstore/xenstored_transaction.h	Thu Sep 29 16:22:02 2005 -0600
    62.3 @@ -22,29 +22,14 @@
    62.4  
    62.5  struct transaction;
    62.6  
    62.7 -void do_transaction_start(struct connection *conn, const char *node);
    62.8 +void do_transaction_start(struct connection *conn, struct buffered_data *node);
    62.9  void do_transaction_end(struct connection *conn, const char *arg);
   62.10  
   62.11 -/* Is node covered by this transaction? */
   62.12 -bool within_transaction(struct transaction *trans, const char *node);
   62.13 -
   62.14 -/* If a write op on this node blocked by another connections' transaction,
   62.15 - * mark conn, setup transaction timeout and return true.
   62.16 - */
   62.17 -bool transaction_block(struct connection *conn, const char *node);
   62.18 -
   62.19 -/* Return transaction which covers this node. */
   62.20 -struct transaction *transaction_covering_node(const char *node);
   62.21 -
   62.22 -/* Return directory of node within transaction t. */
   62.23 -char *node_dir_inside_transaction(struct transaction *t, const char *node);
   62.24 +bool transaction_block(struct connection *conn);
   62.25  
   62.26  /* This node was changed: can fail and longjmp. */
   62.27  void add_change_node(struct transaction *trans, const char *node, bool recurse);
   62.28  
   62.29 -/* Get shortest timeout: leave tv unset if none. */
   62.30 -void shortest_transaction_timeout(struct timeval *tv);
   62.31 -
   62.32 -/* Have any transactions timed out yet? */
   62.33 -void check_transaction_timeout(void);
   62.34 +/* Return tdb context to use for this connection. */
   62.35 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans);
   62.36  #endif /* _XENSTORED_TRANSACTION_H */
    63.1 --- a/tools/xenstore/xenstored_watch.c	Thu Sep 29 13:35:13 2005 -0600
    63.2 +++ b/tools/xenstore/xenstored_watch.c	Thu Sep 29 16:22:02 2005 -0600
    63.3 @@ -96,36 +96,38 @@ static int destroy_watch_event(void *_ev
    63.4  }
    63.5  
    63.6  static void add_event(struct connection *conn,
    63.7 -		      struct watch *watch, const char *node)
    63.8 +		      struct watch *watch,
    63.9 +		      const char *name)
   63.10  {
   63.11  	struct watch_event *event;
   63.12  
   63.13 -	/* Check read permission: no permission, no watch event.
   63.14 -	 * If it doesn't exist, we need permission to read parent.
   63.15 -	 */
   63.16 -	if (!check_node_perms(conn, node, XS_PERM_READ|XS_PERM_ENOENT_OK) &&
   63.17 -	    !check_event_node(node)) {
   63.18 -		return;
   63.19 +	if (!check_event_node(name)) {
   63.20 +		/* Can this conn load node, or see that it doesn't exist? */
   63.21 +		struct node *node;
   63.22 +
   63.23 +		node = get_node(conn, name, XS_PERM_READ);
   63.24 +		if (!node && errno != ENOENT)
   63.25 +			return;
   63.26  	}
   63.27  
   63.28  	if (watch->relative_path) {
   63.29 -		node += strlen(watch->relative_path);
   63.30 -		if (*node == '/') /* Could be "" */
   63.31 -			node++;
   63.32 +		name += strlen(watch->relative_path);
   63.33 +		if (*name == '/') /* Could be "" */
   63.34 +			name++;
   63.35  	}
   63.36  
   63.37  	event = talloc(watch, struct watch_event);
   63.38 -	event->len = strlen(node) + 1 + strlen(watch->token) + 1;
   63.39 +	event->len = strlen(name) + 1 + strlen(watch->token) + 1;
   63.40  	event->data = talloc_array(event, char, event->len);
   63.41 -	strcpy(event->data, node);
   63.42 -	strcpy(event->data + strlen(node) + 1, watch->token);
   63.43 +	strcpy(event->data, name);
   63.44 +	strcpy(event->data + strlen(name) + 1, watch->token);
   63.45  	talloc_set_destructor(event, destroy_watch_event);
   63.46  	list_add_tail(&event->list, &watch->events);
   63.47  	trace_create(event, "watch_event");
   63.48  }
   63.49  
   63.50  /* FIXME: we fail to fire on out of memory.  Should drop connections. */
   63.51 -void fire_watches(struct connection *conn, const char *node, bool recurse)
   63.52 +void fire_watches(struct connection *conn, const char *name, bool recurse)
   63.53  {
   63.54  	struct connection *i;
   63.55  	struct watch *watch;
   63.56 @@ -137,9 +139,9 @@ void fire_watches(struct connection *con
   63.57  	/* Create an event for each watch. */
   63.58  	list_for_each_entry(i, &connections, list) {
   63.59  		list_for_each_entry(watch, &i->watches, list) {
   63.60 -			if (is_child(node, watch->node))
   63.61 -				add_event(i, watch, node);
   63.62 -			else if (recurse && is_child(watch->node, node))
   63.63 +			if (is_child(name, watch->node))
   63.64 +				add_event(i, watch, name);
   63.65 +			else if (recurse && is_child(watch->node, name))
   63.66  				add_event(i, watch, watch->node);
   63.67  			else
   63.68  				continue;
   63.69 @@ -156,49 +158,6 @@ static int destroy_watch(void *_watch)
   63.70  	return 0;
   63.71  }
   63.72  
   63.73 -void shortest_watch_ack_timeout(struct timeval *tv)
   63.74 -{
   63.75 -	(void)tv;
   63.76 -#if 0 /* FIXME */
   63.77 -	struct watch *watch;
   63.78 -
   63.79 -	list_for_each_entry(watch, &watches, list) {
   63.80 -		struct watch_event *i;
   63.81 -		list_for_each_entry(i, &watch->events, list) {
   63.82 -			if (!timerisset(&i->timeout))
   63.83 -				continue;
   63.84 -			if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
   63.85 -				*tv = i->timeout;
   63.86 -		}
   63.87 -	}
   63.88 -#endif
   63.89 -}	
   63.90 -
   63.91 -void check_watch_ack_timeout(void)
   63.92 -{
   63.93 -#if 0
   63.94 -	struct watch *watch;
   63.95 -	struct timeval now;
   63.96 -
   63.97 -	gettimeofday(&now, NULL);
   63.98 -	list_for_each_entry(watch, &watches, list) {
   63.99 -		struct watch_event *i, *tmp;
  63.100 -		list_for_each_entry_safe(i, tmp, &watch->events, list) {
  63.101 -			if (!timerisset(&i->timeout))
  63.102 -				continue;
  63.103 -			if (timercmp(&i->timeout, &now, <)) {
  63.104 -				xprintf("Warning: timeout on watch event %s"
  63.105 -					" token %s\n",
  63.106 -					i->node, watch->token);
  63.107 -				trace_watch_timeout(watch->conn, i->node,
  63.108 -						    watch->token);
  63.109 -				timerclear(&i->timeout);
  63.110 -			}
  63.111 -		}
  63.112 -	}
  63.113 -#endif
  63.114 -}
  63.115 -
  63.116  void do_watch(struct connection *conn, struct buffered_data *in)
  63.117  {
  63.118  	struct watch *watch;
    64.1 --- a/tools/xenstore/xenstored_watch.h	Thu Sep 29 13:35:13 2005 -0600
    64.2 +++ b/tools/xenstore/xenstored_watch.h	Thu Sep 29 16:22:02 2005 -0600
    64.3 @@ -32,15 +32,9 @@ bool is_watch_event(struct connection *c
    64.4  /* Look through our watches: if any of them have an event, queue it. */
    64.5  void queue_next_event(struct connection *conn);
    64.6  
    64.7 -/* Fire all watches: recurse means all the children are effected (ie. rm).
    64.8 +/* Fire all watches: recurse means all the children are affected (ie. rm).
    64.9   */
   64.10 -void fire_watches(struct connection *conn, const char *node, bool recurse);
   64.11 -
   64.12 -/* Find shortest timeout: if any, reduce tv (may already be set). */
   64.13 -void shortest_watch_ack_timeout(struct timeval *tv);
   64.14 -
   64.15 -/* Check for watches which may have timed out. */
   64.16 -void check_watch_ack_timeout(void);
   64.17 +void fire_watches(struct connection *conn, const char *name, bool recurse);
   64.18  
   64.19  void dump_watches(struct connection *conn);
   64.20  
    65.1 --- a/tools/xenstore/xs.c	Thu Sep 29 13:35:13 2005 -0600
    65.2 +++ b/tools/xenstore/xs.c	Thu Sep 29 16:22:02 2005 -0600
    65.3 @@ -497,13 +497,12 @@ bool xs_unwatch(struct xs_handle *h, con
    65.4  
    65.5  /* Start a transaction: changes by others will not be seen during this
    65.6   * transaction, and changes will not be visible to others until end.
    65.7 - * Transaction only applies to the given subtree.
    65.8   * You can only have one transaction at any time.
    65.9   * Returns false on failure.
   65.10   */
   65.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree)
   65.12 +bool xs_transaction_start(struct xs_handle *h)
   65.13  {
   65.14 -	return xs_bool(xs_single(h, XS_TRANSACTION_START, subtree, NULL));
   65.15 +	return xs_bool(xs_single(h, XS_TRANSACTION_START, "", NULL));
   65.16  }
   65.17  
   65.18  /* End a transaction.
    66.1 --- a/tools/xenstore/xs.h	Thu Sep 29 13:35:13 2005 -0600
    66.2 +++ b/tools/xenstore/xs.h	Thu Sep 29 16:22:02 2005 -0600
    66.3 @@ -109,16 +109,15 @@ bool xs_unwatch(struct xs_handle *h, con
    66.4  
    66.5  /* Start a transaction: changes by others will not be seen during this
    66.6   * transaction, and changes will not be visible to others until end.
    66.7 - * Transaction only applies to the given subtree.
    66.8   * You can only have one transaction at any time.
    66.9   * Returns false on failure.
   66.10   */
   66.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree);
   66.12 +bool xs_transaction_start(struct xs_handle *h);
   66.13  
   66.14  /* End a transaction.
   66.15   * If abandon is true, transaction is discarded instead of committed.
   66.16 - * Returns false on failure, which indicates an error: transactions will
   66.17 - * not fail spuriously.
   66.18 + * Returns false on failure: if errno == EAGAIN, you have to restart
   66.19 + * the transaction.
   66.20   */
   66.21  bool xs_transaction_end(struct xs_handle *h, bool abort);
   66.22  
    67.1 --- a/tools/xenstore/xs_lib.c	Thu Sep 29 13:35:13 2005 -0600
    67.2 +++ b/tools/xenstore/xs_lib.c	Thu Sep 29 16:22:02 2005 -0600
    67.3 @@ -50,6 +50,13 @@ static const char *xs_daemon_path(void)
    67.4  	return buf;
    67.5  }
    67.6  
    67.7 +const char *xs_daemon_tdb(void)
    67.8 +{
    67.9 +	static char buf[PATH_MAX];
   67.10 +	sprintf(buf, "%s/tdb", xs_daemon_rootdir());
   67.11 +	return buf;
   67.12 +}
   67.13 +
   67.14  const char *xs_daemon_socket(void)
   67.15  {
   67.16  	return xs_daemon_path();
   67.17 @@ -66,24 +73,6 @@ const char *xs_daemon_socket_ro(void)
   67.18  	return buf;
   67.19  }
   67.20  
   67.21 -const char *xs_daemon_store(void)
   67.22 -{
   67.23 -	static char buf[PATH_MAX];
   67.24 -	if (snprintf(buf, PATH_MAX, "%s/store",
   67.25 -		     xs_daemon_rootdir()) >= PATH_MAX)
   67.26 -		return NULL;
   67.27 -	return buf;
   67.28 -}
   67.29 -
   67.30 -const char *xs_daemon_transactions(void)
   67.31 -{
   67.32 -	static char buf[PATH_MAX];
   67.33 -	if (snprintf(buf, PATH_MAX, "%s/transactions",
   67.34 -		     xs_daemon_rootdir()) >= PATH_MAX)
   67.35 -		return NULL;
   67.36 -	return buf;
   67.37 -}
   67.38 -
   67.39  const char *xs_domain_dev(void)
   67.40  {
   67.41  	char *s = getenv("XENSTORED_PATH");
    68.1 --- a/tools/xenstore/xs_lib.h	Thu Sep 29 13:35:13 2005 -0600
    68.2 +++ b/tools/xenstore/xs_lib.h	Thu Sep 29 16:22:02 2005 -0600
    68.3 @@ -36,7 +36,7 @@ enum xs_perm_type {
    68.4  
    68.5  struct xs_permissions
    68.6  {
    68.7 -	domid_t id;
    68.8 +	unsigned int id;
    68.9  	enum xs_perm_type perms;
   68.10  };
   68.11  
   68.12 @@ -46,9 +46,8 @@ struct xs_permissions
   68.13  /* Path for various daemon things: env vars can override. */
   68.14  const char *xs_daemon_socket(void);
   68.15  const char *xs_daemon_socket_ro(void);
   68.16 -const char *xs_daemon_store(void);
   68.17 -const char *xs_daemon_transactions(void);
   68.18  const char *xs_domain_dev(void);
   68.19 +const char *xs_daemon_tdb(void);
   68.20  
   68.21  /* Simple write function: loops for you. */
   68.22  bool xs_write_all(int fd, const void *data, unsigned int len);
    69.1 --- a/tools/xenstore/xs_random.c	Thu Sep 29 13:35:13 2005 -0600
    69.2 +++ b/tools/xenstore/xs_random.c	Thu Sep 29 16:22:02 2005 -0600
    69.3 @@ -41,7 +41,7 @@ struct ops
    69.4  			  struct xs_permissions *perms,
    69.5  			  unsigned int num);
    69.6  
    69.7 -	bool (*transaction_start)(void *h, const char *subtree);
    69.8 +	bool (*transaction_start)(void *h);
    69.9  	bool (*transaction_end)(void *h, bool abort);
   69.10  
   69.11  	/* Create and destroy a new handle. */
   69.12 @@ -53,7 +53,6 @@ struct file_ops_info
   69.13  {
   69.14  	const char *base;
   69.15  	char *transact_base;
   69.16 -	char *transact;
   69.17  };
   69.18  
   69.19  static void convert_to_dir(const char *dirname)
   69.20 @@ -96,31 +95,6 @@ static char *path_to_name(struct file_op
   69.21  	return filename;
   69.22  }
   69.23  
   69.24 -/* Is child a subnode of parent, or equal? */
   69.25 -static bool is_child(const char *child, const char *parent)
   69.26 -{
   69.27 -	unsigned int len = strlen(parent);
   69.28 -
   69.29 -	/* / should really be "" for this algorithm to work, but that's a
   69.30 -	 * usability nightmare. */
   69.31 -	if (streq(parent, "/"))
   69.32 -		return true;
   69.33 -
   69.34 -	if (strncmp(child, parent, len) != 0)
   69.35 -		return false;
   69.36 -
   69.37 -	return child[len] == '/' || child[len] == '\0';
   69.38 -}
   69.39 -
   69.40 -static bool write_ok(struct file_ops_info *info, const char *path)
   69.41 -{
   69.42 -	if (info->transact && !is_child(path, info->transact)) {
   69.43 -		errno = EROFS;
   69.44 -		return false;
   69.45 -	}
   69.46 -	return true;
   69.47 -}	
   69.48 -
   69.49  static char **file_directory(struct file_ops_info *info,
   69.50  			     const char *path, unsigned int *num)
   69.51  {
   69.52 @@ -184,8 +158,10 @@ static void *file_read(struct file_ops_i
   69.53  
   69.54  	ret = grab_file(filename, &size);
   69.55  	/* Directory exists, .DATA doesn't. */
   69.56 -	if (!ret && errno == ENOENT && strends(filename, ".DATA"))
   69.57 -		errno = EISDIR;
   69.58 +	if (!ret && errno == ENOENT && strends(filename, ".DATA")) {
   69.59 +		ret = strdup("");
   69.60 +		size = 0;
   69.61 +	}
   69.62  	*len = size;
   69.63  	return ret;
   69.64  }
   69.65 @@ -270,9 +246,6 @@ static bool file_set_perms(struct file_o
   69.66  		return false;
   69.67  	}
   69.68  
   69.69 -	if (!write_ok(info, path))
   69.70 -		return false;
   69.71 -
   69.72  	/* Check non-perm file exists/ */
   69.73  	if (lstat(filename, &st) != 0)
   69.74  		return false;
   69.75 @@ -338,9 +311,6 @@ static bool file_write(struct file_ops_i
   69.76  	char *filename = filename_to_data(path_to_name(info, path));
   69.77  	int fd;
   69.78  
   69.79 -	if (!write_ok(info, path))
   69.80 -		return false;
   69.81 -
   69.82  	make_dirs(parent_filename(filename));
   69.83  	fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0600);
   69.84  	if (fd < 0)
   69.85 @@ -358,9 +328,6 @@ static bool file_mkdir(struct file_ops_i
   69.86  {
   69.87  	char *dirname = path_to_name(info, path);
   69.88  
   69.89 -	if (!write_ok(info, path))
   69.90 -		return false;
   69.91 -
   69.92  	make_dirs(parent_filename(dirname));
   69.93  	if (mkdir(dirname, 0700) != 0)
   69.94  		return (errno == EEXIST);
   69.95 @@ -374,20 +341,12 @@ static bool file_rm(struct file_ops_info
   69.96  	char *filename = path_to_name(info, path);
   69.97  	struct stat st;
   69.98  
   69.99 -	if (info->transact && streq(info->transact, path)) {
  69.100 -		errno = EINVAL;
  69.101 -		return false;
  69.102 -	}
  69.103 -
  69.104  	if (lstat(filename, &st) != 0) {
  69.105  		if (lstat(parent_filename(filename), &st) != 0)
  69.106  			return false;
  69.107  		return true;
  69.108  	}
  69.109  
  69.110 -	if (!write_ok(info, path))
  69.111 -		return false;
  69.112 -
  69.113  	if (streq(path, "/")) {
  69.114  		errno = EINVAL;
  69.115  		return false;
  69.116 @@ -398,28 +357,20 @@ static bool file_rm(struct file_ops_info
  69.117  	return true;
  69.118  }
  69.119  
  69.120 -static bool file_transaction_start(struct file_ops_info *info,
  69.121 -				   const char *subtree)
  69.122 +static bool file_transaction_start(struct file_ops_info *info)
  69.123  {
  69.124  	char *cmd;
  69.125 -	char *filename = path_to_name(info, subtree);
  69.126 -	struct stat st;
  69.127  
  69.128 -	if (info->transact) {
  69.129 +	if (info->transact_base) {
  69.130  		errno = EBUSY;
  69.131  		return false;
  69.132  	}
  69.133  
  69.134 -	if (lstat(filename, &st) != 0)
  69.135 -		return false;
  69.136 -
  69.137 -	cmd = talloc_asprintf(NULL, "cp -r %s %s.transact",
  69.138 -			      info->base, info->base);
  69.139 +	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  69.140 +	cmd = talloc_asprintf(NULL, "cp -r %s %s",
  69.141 +			      info->base, info->transact_base);
  69.142  	do_command(cmd);
  69.143  	talloc_free(cmd);
  69.144 -
  69.145 -	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  69.146 -	info->transact = talloc_strdup(NULL, subtree);
  69.147  	return true;
  69.148  }
  69.149  
  69.150 @@ -427,7 +378,7 @@ static bool file_transaction_end(struct 
  69.151  {
  69.152  	char *old, *cmd;
  69.153  
  69.154 -	if (!info->transact) {
  69.155 +	if (!info->transact_base) {
  69.156  		errno = ENOENT;
  69.157  		return false;
  69.158  	}
  69.159 @@ -448,9 +399,7 @@ static bool file_transaction_end(struct 
  69.160  
  69.161  success:
  69.162  	talloc_free(cmd);
  69.163 -	talloc_free(info->transact);
  69.164  	talloc_free(info->transact_base);
  69.165 -	info->transact = NULL;
  69.166  	info->transact_base = NULL;
  69.167  	return true;
  69.168  }
  69.169 @@ -461,7 +410,6 @@ static struct file_ops_info *file_handle
  69.170  
  69.171  	info->base = dir;
  69.172  	info->transact_base = NULL;
  69.173 -	info->transact = NULL;
  69.174  	return info;
  69.175  }
  69.176  
  69.177 @@ -898,11 +846,10 @@ static char *do_next_op(struct ops *ops,
  69.178  	case 7: {
  69.179  		if (verbose)
  69.180  			printf("START %s\n", name);
  69.181 -		ret = bool_to_errstring(ops->transaction_start(h, name));
  69.182 +		ret = bool_to_errstring(ops->transaction_start(h));
  69.183  		if (streq(ret, "OK")) {
  69.184  			talloc_free(ret);
  69.185 -			ret = talloc_asprintf(NULL, "OK:START-TRANSACT:%s",
  69.186 -					      name);
  69.187 +			ret = talloc_asprintf(NULL, "OK:START-TRANSACT");
  69.188  		}
  69.189  
  69.190  		break;
  69.191 @@ -978,6 +925,8 @@ static void setup_file_ops(const char *d
  69.192  		barf_perror("Creating directory %s/tool", dir);
  69.193  	if (!file_set_perms(h, talloc_strdup(h, "/"), &perm, 1))
  69.194  		barf_perror("Setting root perms in %s", dir);
  69.195 +	if (!file_set_perms(h, talloc_strdup(h, "/tool"), &perm, 1))
  69.196 +		barf_perror("Setting root perms in %s/tool", dir);
  69.197  	file_close(h);
  69.198  }
  69.199  
  69.200 @@ -1071,7 +1020,7 @@ static unsigned int try_simple(const boo
  69.201  			goto out;
  69.202  
  69.203  		if (!data->fast) {
  69.204 -			if (strstarts(ret, "OK:START-TRANSACT:")) {
  69.205 +			if (streq(ret, "OK:START-TRANSACT")) {
  69.206  				void *pre = data->ops->handle(data->dir);
  69.207  
  69.208  				snapshot = dump(data->ops, pre);
  69.209 @@ -1303,7 +1252,7 @@ static unsigned int try_diff(const bool 
  69.210  			     void *_data)
  69.211  {
  69.212  	void *fileh, *xsh;
  69.213 -	char *transact = NULL;
  69.214 +	bool transact = false;
  69.215  	struct ops *fail;
  69.216  	struct diff_data *data = _data;
  69.217  	unsigned int i, print;
  69.218 @@ -1348,13 +1297,9 @@ static unsigned int try_diff(const bool 
  69.219  			goto out;
  69.220  
  69.221  		if (strstarts(file, "OK:START-TRANSACT:"))
  69.222 -			transact = talloc_strdup(NULL,
  69.223 -						 file +
  69.224 -						 strlen("OK:START-TRANSACT:"));
  69.225 -		else if (streq(file, "OK:STOP-TRANSACT")) {
  69.226 -			talloc_free(transact);
  69.227 -			transact = NULL;
  69.228 -		}
  69.229 +			transact = true;
  69.230 +		else if (streq(file, "OK:STOP-TRANSACT"))
  69.231 +			transact = false;
  69.232  
  69.233  		talloc_free(file);
  69.234  		talloc_free(xs);
  69.235 @@ -1379,7 +1324,7 @@ static unsigned int try_diff(const bool 
  69.236  
  69.237  			fail = NULL;
  69.238  			if (!ops_equal(&xs_ops, xsh_pre, &file_ops, fileh_pre,
  69.239 -				       transact, &fail)) {
  69.240 +				       "/", &fail)) {
  69.241  				if (fail)
  69.242  					barf("%s failed during transact\n",
  69.243  					     fail->name);
  69.244 @@ -1456,9 +1401,6 @@ static unsigned int try_fail(const bool 
  69.245  	fileh = file_handle(data->dir);
  69.246  	xsh = xs_handle(data->dir);
  69.247  
  69.248 -	sprintf(seed, "%i", data->seed);
  69.249 -	free(xs_debug_command(xsh, "failtest", seed, strlen(seed)+1));
  69.250 -
  69.251  	print = number / 76;
  69.252  	if (!print)
  69.253  		print = 1;
  69.254 @@ -1491,8 +1433,12 @@ static unsigned int try_fail(const bool 
  69.255  		if (trymap && !trymap[i])
  69.256  			continue;
  69.257  
  69.258 +		/* Turn on failure. */
  69.259 +		sprintf(seed, "%i", data->seed + i);
  69.260 +		free(xs_debug_command(xsh, "failtest",seed,strlen(seed)+1));
  69.261 +
  69.262  		if (verbose)
  69.263 -			printf("(%i) ", i);
  69.264 +			printf("(%i) seed %s ", i, seed);
  69.265  		ret = do_next_op(&xs_ops, xsh, i + data->seed, verbose);
  69.266  		if (streq(ret, "FAILED:Connection reset by peer")
  69.267  		    || streq(ret, "FAILED:Bad file descriptor")
  69.268 @@ -1549,8 +1495,6 @@ static unsigned int try_fail(const bool 
  69.269  		fail = NULL;
  69.270  		if (!ops_equal(&xs_ops, tmpxsh, &file_ops, tmpfileh, "/",
  69.271  			       &fail)) {
  69.272 -			xs_close(tmpxsh);
  69.273 -			file_close(tmpfileh);
  69.274  			if (fail) {
  69.275  				if (verbose)
  69.276  					printf("%s failed\n", fail->name);
  69.277 @@ -1561,10 +1505,16 @@ static unsigned int try_fail(const bool 
  69.278  				failed = 0;
  69.279  				if (verbose)
  69.280  					printf("(Looks like it succeeded)\n");
  69.281 +				xs_close(tmpxsh);
  69.282 +				file_close(tmpfileh);
  69.283  				goto try_applying;
  69.284  			}
  69.285  			if (verbose)
  69.286 -				printf("Two backends not equal\n");
  69.287 +				printf("Trees differ:\nXS:%s\nFILE:%s\n",
  69.288 +				       dump(&xs_ops, tmpxsh),
  69.289 +				       dump(&file_ops, tmpfileh));
  69.290 +			xs_close(tmpxsh);
  69.291 +			file_close(tmpfileh);
  69.292  			goto out;
  69.293  		}
  69.294  
  69.295 @@ -1572,8 +1522,6 @@ static unsigned int try_fail(const bool 
  69.296  		if (!xsh)
  69.297  			file_transaction_end(fileh, true);
  69.298  
  69.299 -		/* Turn failures back on. */
  69.300 -		free(xs_debug_command(tmpxsh, "failtest",  NULL, 0));
  69.301  		xs_close(tmpxsh);
  69.302  		file_close(tmpfileh);
  69.303  	}
    70.1 --- a/tools/xenstore/xs_stress.c	Thu Sep 29 13:35:13 2005 -0600
    70.2 +++ b/tools/xenstore/xs_stress.c	Thu Sep 29 16:22:02 2005 -0600
    70.3 @@ -8,6 +8,7 @@
    70.4  #include <sys/stat.h>
    70.5  #include <fcntl.h>
    70.6  #include <unistd.h>
    70.7 +#include <errno.h>
    70.8  
    70.9  #define NUM_HANDLES 2
   70.10  #define DIR_FANOUT 3
   70.11 @@ -36,24 +37,18 @@ static void work(unsigned int cycles, un
   70.12  
   70.13  	srandom(childnum);
   70.14  	for (i = 0; i < cycles; i++) {
   70.15 -		unsigned int lockdepth, j, len;
   70.16 -		char file[100] = "", lockdir[100];
   70.17 +		unsigned int j, len;
   70.18 +		char file[100] = "";
   70.19  		char *contents, tmp[100];
   70.20  		struct xs_handle *h = handles[random() % NUM_HANDLES];
   70.21  
   70.22 -		lockdepth = random() % DIR_DEPTH;
   70.23 -		for (j = 0; j < DIR_DEPTH; j++) {
   70.24 -			if (j == lockdepth)
   70.25 -				strcpy(lockdir, file);
   70.26 +		for (j = 0; j < DIR_DEPTH; j++)
   70.27  			sprintf(file + strlen(file), "/%li",
   70.28  				random()%DIR_FANOUT);
   70.29 -		}
   70.30 -		if (streq(lockdir, ""))
   70.31 -			strcpy(lockdir, "/");
   70.32  
   70.33 -		if (!xs_transaction_start(h, lockdir))
   70.34 -			barf_perror("%i: starting transaction %i on %s",
   70.35 -				    childnum, i, lockdir);
   70.36 +		if (!xs_transaction_start(h))
   70.37 +			barf_perror("%i: starting transaction %i",
   70.38 +				    childnum, i);
   70.39  
   70.40  		sprintf(file + strlen(file), "/count");
   70.41  		contents = xs_read(h, file, &len);
   70.42 @@ -68,18 +63,23 @@ static void work(unsigned int cycles, un
   70.43  		/* Abandon 1 in 10 */
   70.44  		if (random() % 10 == 0) {
   70.45  			if (!xs_transaction_end(h, true))
   70.46 -				barf_perror("%i: can't abort transact %s",
   70.47 -					    childnum, lockdir);
   70.48 +				barf_perror("%i: can't abort transact",
   70.49 +					    childnum);
   70.50  			i--;
   70.51  		} else {
   70.52 -			if (!xs_transaction_end(h, false))
   70.53 -				barf_perror("%i: can't commit transact %s",
   70.54 -					    childnum, lockdir);
   70.55 -
   70.56 -			/* Offset when we print . so kids don't all
   70.57 -			 * print at once. */
   70.58 -			if ((i + print/(childnum+1)) % print == 0)
   70.59 -				write(STDOUT_FILENO, &id, 1);
   70.60 +			if (!xs_transaction_end(h, false)) {
   70.61 +				if (errno == EAGAIN) {
   70.62 +					write(STDOUT_FILENO, "!", 1);
   70.63 +					i--;
   70.64 +				} else
   70.65 +					barf_perror("%i: can't commit trans",
   70.66 +						    childnum);
   70.67 +			} else {
   70.68 +				/* Offset when we print . so kids don't all
   70.69 +				 * print at once. */
   70.70 +				if ((i + print/(childnum+1)) % print == 0)
   70.71 +					write(STDOUT_FILENO, &id, 1);
   70.72 +			}
   70.73  		}
   70.74  	}
   70.75  }
   70.76 @@ -201,7 +201,7 @@ int main(int argc, char *argv[])
   70.77  	printf("\nCounting results...\n");
   70.78  	i = tally_counts();
   70.79  	if (i != (unsigned)atoi(argv[1]))
   70.80 -		barf("Total counts %i not %s", i, atoi(argv[1]));
   70.81 +		barf("Total counts %i not %s", i, argv[1]);
   70.82  	printf("Success!\n");
   70.83  	exit(0);
   70.84  }
    71.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    71.2 +++ b/tools/xenstore/xs_tdb_dump.c	Thu Sep 29 16:22:02 2005 -0600
    71.3 @@ -0,0 +1,82 @@
    71.4 +/* Simple program to dump out all records of TDB */
    71.5 +#include <stdint.h>
    71.6 +#include <stdlib.h>
    71.7 +#include <fcntl.h>
    71.8 +#include <stdio.h>
    71.9 +#include <stdarg.h>
   71.10 +
   71.11 +#include "xs_lib.h"
   71.12 +#include "tdb.h"
   71.13 +#include "talloc.h"
   71.14 +#include "utils.h"
   71.15 +
   71.16 +struct record_hdr {
   71.17 +	u32 num_perms;
   71.18 +	u32 datalen;
   71.19 +	u32 childlen;
   71.20 +	struct xs_permissions perms[0];
   71.21 +};
   71.22 +
   71.23 +static u32 total_size(struct record_hdr *hdr)
   71.24 +{
   71.25 +	return sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) 
   71.26 +		+ hdr->datalen + hdr->childlen;
   71.27 +}
   71.28 +
   71.29 +static char perm_to_char(enum xs_perm_type perm)
   71.30 +{
   71.31 +	return perm == XS_PERM_READ ? 'r' :
   71.32 +		perm == XS_PERM_WRITE ? 'w' :
   71.33 +		perm == XS_PERM_NONE ? '-' :
   71.34 +		perm == (XS_PERM_READ|XS_PERM_WRITE) ? 'b' :
   71.35 +		'?';
   71.36 +}
   71.37 +
   71.38 +int main(int argc, char *argv[])
   71.39 +{
   71.40 +	TDB_DATA key;
   71.41 +	TDB_CONTEXT *tdb;
   71.42 +
   71.43 +	if (argc != 2)
   71.44 +		barf("Usage: xs_tdb_dump <tdbfile>");
   71.45 +
   71.46 +	tdb = tdb_open(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0);
   71.47 +	if (!tdb)
   71.48 +		barf_perror("Could not open %s", argv[1]);
   71.49 +
   71.50 +	key = tdb_firstkey(tdb);
   71.51 +	while (key.dptr) {
   71.52 +		TDB_DATA data;
   71.53 +		struct record_hdr *hdr;
   71.54 +
   71.55 +		data = tdb_fetch(tdb, key);
   71.56 +		hdr = (void *)data.dptr;
   71.57 +		if (data.dsize < sizeof(*hdr))
   71.58 +			fprintf(stderr, "%.*s: BAD truncated\n",
   71.59 +				key.dsize, key.dptr);
   71.60 +		else if (data.dsize != total_size(hdr))
   71.61 +			fprintf(stderr, "%.*s: BAD length %i for %i/%i/%i (%i)\n",
   71.62 +				key.dsize, key.dptr, data.dsize,
   71.63 +				hdr->num_perms, hdr->datalen,
   71.64 +				hdr->childlen, total_size(hdr));
   71.65 +		else {
   71.66 +			unsigned int i;
   71.67 +			char *p;
   71.68 +
   71.69 +			printf("%.*s: ", key.dsize, key.dptr);
   71.70 +			for (i = 0; i < hdr->num_perms; i++)
   71.71 +				printf("%s%c%i",
   71.72 +				       i == 0 ? "" : ",",
   71.73 +				       perm_to_char(hdr->perms[i].perms),
   71.74 +				       hdr->perms[i].id);
   71.75 +			p = (void *)&hdr->perms[hdr->num_perms];
   71.76 +			printf(" %.*s\n", hdr->datalen, p);
   71.77 +			p += hdr->datalen;
   71.78 +			for (i = 0; i < hdr->childlen; i += strlen(p+i)+1)
   71.79 +				printf("\t-> %s\n", p+i);
   71.80 +		}
   71.81 +		key = tdb_nextkey(tdb, key);
   71.82 +	}
   71.83 +	return 0;
   71.84 +}
   71.85 +
    72.1 --- a/tools/xenstore/xs_test.c	Thu Sep 29 13:35:13 2005 -0600
    72.2 +++ b/tools/xenstore/xs_test.c	Thu Sep 29 16:22:02 2005 -0600
    72.3 @@ -562,9 +562,9 @@ static void do_unwatch(unsigned int hand
    72.4  		failed(handle);
    72.5  }
    72.6  
    72.7 -static void do_start(unsigned int handle, const char *node)
    72.8 +static void do_start(unsigned int handle)
    72.9  {
   72.10 -	if (!xs_transaction_start(handles[handle], node))
   72.11 +	if (!xs_transaction_start(handles[handle]))
   72.12  		failed(handle);
   72.13  }
   72.14  
   72.15 @@ -791,7 +791,7 @@ static void do_command(unsigned int defa
   72.16  		xs_daemon_close(handles[handle]);
   72.17  		handles[handle] = NULL;
   72.18  	} else if (streq(command, "start"))
   72.19 -		do_start(handle, arg(line, 1));
   72.20 +		do_start(handle);
   72.21  	else if (streq(command, "commit"))
   72.22  		do_end(handle, false);
   72.23  	else if (streq(command, "abort"))
    77.1 --- a/xen/arch/x86/mm.c	Thu Sep 29 13:35:13 2005 -0600
    77.2 +++ b/xen/arch/x86/mm.c	Thu Sep 29 16:22:02 2005 -0600
    77.3 @@ -2273,8 +2273,7 @@ int do_mmu_update(
    77.4  
    77.5  
    77.6  int update_grant_pte_mapping(
    77.7 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
    77.8 -    struct domain *d, struct vcpu *v)
    77.9 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
   77.10  {
   77.11      int rc = GNTST_okay;
   77.12      void *va;
   77.13 @@ -2282,6 +2281,7 @@ int update_grant_pte_mapping(
   77.14      struct pfn_info *page;
   77.15      u32 type_info;
   77.16      l1_pgentry_t ol1e;
   77.17 +    struct domain *d = v->domain;
   77.18  
   77.19      ASSERT(spin_is_locked(&d->big_lock));
   77.20      ASSERT(!shadow_mode_refcounts(d));
   77.21 @@ -2319,8 +2319,6 @@ int update_grant_pte_mapping(
   77.22  
   77.23      put_page_from_l1e(ol1e, d);
   77.24  
   77.25 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_all : GNTST_okay;
   77.26 -
   77.27      if ( unlikely(shadow_mode_enabled(d)) )
   77.28      {
   77.29          struct domain_mmap_cache sh_mapcache;
   77.30 @@ -2415,10 +2413,10 @@ int clear_grant_pte_mapping(
   77.31  
   77.32  
   77.33  int update_grant_va_mapping(
   77.34 -    unsigned long va, l1_pgentry_t _nl1e, struct domain *d, struct vcpu *v)
   77.35 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
   77.36  {
   77.37 -    int rc = GNTST_okay;
   77.38      l1_pgentry_t *pl1e, ol1e;
   77.39 +    struct domain *d = v->domain;
   77.40      
   77.41      ASSERT(spin_is_locked(&d->big_lock));
   77.42      ASSERT(!shadow_mode_refcounts(d));
   77.43 @@ -2439,12 +2437,10 @@ int update_grant_va_mapping(
   77.44  
   77.45      put_page_from_l1e(ol1e, d);
   77.46  
   77.47 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_one : GNTST_okay;
   77.48 -
   77.49      if ( unlikely(shadow_mode_enabled(d)) )
   77.50          shadow_do_update_va_mapping(va, _nl1e, v);
   77.51  
   77.52 -    return rc;
   77.53 +    return GNTST_okay;
   77.54  }
   77.55  
   77.56  int clear_grant_va_mapping(unsigned long addr, unsigned long frame)
    78.1 --- a/xen/arch/x86/vmx_vmcs.c	Thu Sep 29 13:35:13 2005 -0600
    78.2 +++ b/xen/arch/x86/vmx_vmcs.c	Thu Sep 29 16:22:02 2005 -0600
    78.3 @@ -37,19 +37,19 @@
    78.4  #endif
    78.5  #ifdef CONFIG_VMX
    78.6  
    78.7 -struct vmcs_struct *alloc_vmcs(void) 
    78.8 +struct vmcs_struct *alloc_vmcs(void)
    78.9  {
   78.10      struct vmcs_struct *vmcs;
   78.11      u32 vmx_msr_low, vmx_msr_high;
   78.12  
   78.13      rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
   78.14      vmcs_size = vmx_msr_high & 0x1fff;
   78.15 -    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size)); 
   78.16 +    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size));
   78.17      memset((char *)vmcs, 0, vmcs_size); /* don't remove this */
   78.18  
   78.19      vmcs->vmcs_revision_id = vmx_msr_low;
   78.20      return vmcs;
   78.21 -} 
   78.22 +}
   78.23  
   78.24  void free_vmcs(struct vmcs_struct *vmcs)
   78.25  {
   78.26 @@ -65,7 +65,7 @@ static inline int construct_vmcs_control
   78.27      void *io_bitmap_a;
   78.28      void *io_bitmap_b;
   78.29  
   78.30 -    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, 
   78.31 +    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
   78.32                         MONITOR_PIN_BASED_EXEC_CONTROLS);
   78.33  
   78.34      error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
   78.35 @@ -73,8 +73,8 @@ static inline int construct_vmcs_control
   78.36      error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
   78.37  
   78.38      /* need to use 0x1000 instead of PAGE_SIZE */
   78.39 -    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   78.40 -    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   78.41 +    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   78.42 +    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   78.43      memset(io_bitmap_a, 0xff, 0x1000);
   78.44      /* don't bother debug port access */
   78.45      clear_bit(PC_DEBUG_PORT, io_bitmap_a);
   78.46 @@ -89,8 +89,10 @@ static inline int construct_vmcs_control
   78.47      return error;
   78.48  }
   78.49  
   78.50 -#define GUEST_SEGMENT_LIMIT     0xffffffff      
   78.51 -#define HOST_SEGMENT_LIMIT      0xffffffff      
   78.52 +#define GUEST_LAUNCH_DS         0x08
   78.53 +#define GUEST_LAUNCH_CS         0x10
   78.54 +#define GUEST_SEGMENT_LIMIT     0xffffffff
   78.55 +#define HOST_SEGMENT_LIMIT      0xffffffff
   78.56  
   78.57  struct host_execution_env {
   78.58      /* selectors */
   78.59 @@ -110,72 +112,76 @@ struct host_execution_env {
   78.60      unsigned long tr_base;
   78.61      unsigned long ds_base;
   78.62      unsigned long cs_base;
   78.63 -#ifdef __x86_64__ 
   78.64 -    unsigned long fs_base; 
   78.65 -    unsigned long gs_base; 
   78.66 -#endif 
   78.67 +#ifdef __x86_64__
   78.68 +    unsigned long fs_base;
   78.69 +    unsigned long gs_base;
   78.70 +#endif
   78.71  };
   78.72  
   78.73 -#define round_pgdown(_p) ((_p)&PAGE_MASK) /* coped from domain.c */
   78.74 -
   78.75 -int vmx_setup_platform(struct vcpu *d, struct cpu_user_regs *regs)
   78.76 +static void vmx_setup_platform(struct vcpu *v, struct cpu_user_regs *regs)
   78.77  {
   78.78      int i;
   78.79 -    unsigned int n;
   78.80 -    unsigned long *p, mpfn, offset, addr;
   78.81 -    struct e820entry *e820p;
   78.82 +    unsigned char e820_map_nr;
   78.83 +    struct e820entry *e820entry;
   78.84 +    unsigned char *p;
   78.85 +    unsigned long mpfn;
   78.86      unsigned long gpfn = 0;
   78.87  
   78.88      local_flush_tlb_pge();
   78.89 -    regs->ebx = 0;   /* Linux expects ebx to be 0 for boot proc */
   78.90  
   78.91 -    n = regs->ecx;
   78.92 -    if (n > 32) {
   78.93 -        VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d", n);
   78.94 -        return -1;
   78.95 +    mpfn = get_mfn_from_pfn(E820_MAP_PAGE >> PAGE_SHIFT);
   78.96 +    if (mpfn == INVALID_MFN) {
   78.97 +        printk("Can not find E820 memory map page for VMX domain.\n");
   78.98 +        domain_crash();
   78.99      }
  78.100  
  78.101 -    addr = regs->edi;
  78.102 -    offset = (addr & ~PAGE_MASK);
  78.103 -    addr = round_pgdown(addr);
  78.104 -
  78.105 -    mpfn = get_mfn_from_pfn(addr >> PAGE_SHIFT);
  78.106      p = map_domain_page(mpfn);
  78.107 -
  78.108 -    e820p = (struct e820entry *) ((unsigned long) p + offset); 
  78.109 +    if (p == NULL) {
  78.110 +        printk("Can not map E820 memory map page for VMX domain.\n");
  78.111 +        domain_crash();
  78.112 +    }
  78.113  
  78.114 -#ifndef NDEBUG
  78.115 -    print_e820_memory_map(e820p, n);
  78.116 -#endif
  78.117 +    e820_map_nr = *(p + E820_MAP_NR_OFFSET);
  78.118 +    e820entry = (struct e820entry *)(p + E820_MAP_OFFSET);
  78.119  
  78.120 -    for ( i = 0; i < n; i++ )
  78.121 +    for ( i = 0; i < e820_map_nr; i++ )
  78.122      {
  78.123 -        if ( e820p[i].type == E820_SHARED_PAGE )
  78.124 +        if (e820entry[i].type == E820_SHARED_PAGE)
  78.125          {
  78.126 -            gpfn = (e820p[i].addr >> PAGE_SHIFT);
  78.127 +            gpfn = (e820entry[i].addr >> PAGE_SHIFT);
  78.128              break;
  78.129          }
  78.130      }
  78.131  
  78.132 -    if ( gpfn == 0 )
  78.133 -    {
  78.134 -        unmap_domain_page(p);        
  78.135 -        return -1;
  78.136 -    }   
  78.137 +    if ( gpfn == 0 ) {
  78.138 +        printk("Can not get io request shared page"
  78.139 +               " from E820 memory map for VMX domain.\n");
  78.140 +        unmap_domain_page(p);
  78.141 +        domain_crash();
  78.142 +    }
  78.143 +    unmap_domain_page(p);
  78.144  
  78.145 -    unmap_domain_page(p);        
  78.146 +    if (v->vcpu_id)
  78.147 +        return;
  78.148  
  78.149      /* Initialise shared page */
  78.150      mpfn = get_mfn_from_pfn(gpfn);
  78.151 -    p = map_domain_page(mpfn);
  78.152 -    d->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  78.153 -
  78.154 -    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(d->domain));
  78.155 +    if (mpfn == INVALID_MFN) {
  78.156 +        printk("Can not find io request shared page for VMX domain.\n");
  78.157 +        domain_crash();
  78.158 +    }
  78.159  
  78.160 -    clear_bit(iopacket_port(d->domain), 
  78.161 -              &d->domain->shared_info->evtchn_mask[0]);
  78.162 +    p = map_domain_page(mpfn);
  78.163 +    if (p == NULL) {
  78.164 +        printk("Can not map io request shared page for VMX domain.\n");
  78.165 +        domain_crash();
  78.166 +    }
  78.167 +    v->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  78.168  
  78.169 -    return 0;
  78.170 +    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(v->domain));
  78.171 +
  78.172 +    clear_bit(iopacket_port(v->domain),
  78.173 +              &v->domain->shared_info->evtchn_mask[0]);
  78.174  }
  78.175  
  78.176  void vmx_set_host_env(struct vcpu *v)
  78.177 @@ -203,7 +209,7 @@ void vmx_set_host_env(struct vcpu *v)
  78.178      error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
  78.179  }
  78.180  
  78.181 -void vmx_do_launch(struct vcpu *v) 
  78.182 +void vmx_do_launch(struct vcpu *v)
  78.183  {
  78.184  /* Update CR3, GDT, LDT, TR */
  78.185      unsigned int  error = 0;
  78.186 @@ -217,7 +223,7 @@ void vmx_do_launch(struct vcpu *v)
  78.187      error |= __vmwrite(GUEST_CR0, cr0);
  78.188      cr0 &= ~X86_CR0_PG;
  78.189      error |= __vmwrite(CR0_READ_SHADOW, cr0);
  78.190 -    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
  78.191 +    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
  78.192                         MONITOR_CPU_BASED_EXEC_CONTROLS);
  78.193  
  78.194      __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (cr4) : );
  78.195 @@ -247,7 +253,7 @@ void vmx_do_launch(struct vcpu *v)
  78.196      error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
  78.197      error |= __vmwrite(GUEST_LDTR_BASE, 0);
  78.198      error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
  78.199 -        
  78.200 +
  78.201      error |= __vmwrite(GUEST_TR_BASE, 0);
  78.202      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
  78.203  
  78.204 @@ -261,10 +267,8 @@ void vmx_do_launch(struct vcpu *v)
  78.205  /*
  78.206   * Initially set the same environment as host.
  78.207   */
  78.208 -static inline int 
  78.209 -construct_init_vmcs_guest(struct cpu_user_regs *regs, 
  78.210 -                          struct vcpu_guest_context *ctxt,
  78.211 -                          struct host_execution_env *host_env)
  78.212 +static inline int
  78.213 +construct_init_vmcs_guest(struct cpu_user_regs *regs)
  78.214  {
  78.215      int error = 0;
  78.216      union vmcs_arbytes arbytes;
  78.217 @@ -292,31 +296,37 @@ construct_init_vmcs_guest(struct cpu_use
  78.218      error |= __vmwrite(CR3_TARGET_COUNT, 0);
  78.219  
  78.220      /* Guest Selectors */
  78.221 -    error |= __vmwrite(GUEST_CS_SELECTOR, regs->cs);
  78.222 -    error |= __vmwrite(GUEST_ES_SELECTOR, regs->es);
  78.223 -    error |= __vmwrite(GUEST_SS_SELECTOR, regs->ss);
  78.224 -    error |= __vmwrite(GUEST_DS_SELECTOR, regs->ds);
  78.225 -    error |= __vmwrite(GUEST_FS_SELECTOR, regs->fs);
  78.226 -    error |= __vmwrite(GUEST_GS_SELECTOR, regs->gs);
  78.227 +    error |= __vmwrite(GUEST_ES_SELECTOR, GUEST_LAUNCH_DS);
  78.228 +    error |= __vmwrite(GUEST_SS_SELECTOR, GUEST_LAUNCH_DS);
  78.229 +    error |= __vmwrite(GUEST_DS_SELECTOR, GUEST_LAUNCH_DS);
  78.230 +    error |= __vmwrite(GUEST_FS_SELECTOR, GUEST_LAUNCH_DS);
  78.231 +    error |= __vmwrite(GUEST_GS_SELECTOR, GUEST_LAUNCH_DS);
  78.232 +    error |= __vmwrite(GUEST_CS_SELECTOR, GUEST_LAUNCH_CS);
  78.233 +
  78.234 +    /* Guest segment bases */
  78.235 +    error |= __vmwrite(GUEST_ES_BASE, 0);
  78.236 +    error |= __vmwrite(GUEST_SS_BASE, 0);
  78.237 +    error |= __vmwrite(GUEST_DS_BASE, 0);
  78.238 +    error |= __vmwrite(GUEST_FS_BASE, 0);
  78.239 +    error |= __vmwrite(GUEST_GS_BASE, 0);
  78.240 +    error |= __vmwrite(GUEST_CS_BASE, 0);
  78.241  
  78.242      /* Guest segment Limits */
  78.243 -    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.244      error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT);
  78.245      error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.246      error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.247      error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.248      error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.249 +    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  78.250  
  78.251 -    error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit);
  78.252 -
  78.253 -    /* AR bytes */
  78.254 +    /* Guest segment AR bytes */
  78.255      arbytes.bytes = 0;
  78.256      arbytes.fields.seg_type = 0x3;          /* type = 3 */
  78.257      arbytes.fields.s = 1;                   /* code or data, i.e. not system */
  78.258      arbytes.fields.dpl = 0;                 /* DPL = 0 */
  78.259      arbytes.fields.p = 1;                   /* segment present */
  78.260      arbytes.fields.default_ops_size = 1;    /* 32-bit */
  78.261 -    arbytes.fields.g = 1;   
  78.262 +    arbytes.fields.g = 1;
  78.263      arbytes.fields.null_bit = 0;            /* not null */
  78.264  
  78.265      error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
  78.266 @@ -328,35 +338,31 @@ construct_init_vmcs_guest(struct cpu_use
  78.267      arbytes.fields.seg_type = 0xb;          /* type = 0xb */
  78.268      error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
  78.269  
  78.270 -    error |= __vmwrite(GUEST_GDTR_BASE, regs->edx);
  78.271 -    regs->edx = 0;
  78.272 -    error |= __vmwrite(GUEST_GDTR_LIMIT, regs->eax);
  78.273 -    regs->eax = 0;
  78.274 +    /* Guest GDT */
  78.275 +    error |= __vmwrite(GUEST_GDTR_BASE, 0);
  78.276 +    error |= __vmwrite(GUEST_GDTR_LIMIT, 0);
  78.277  
  78.278 +    /* Guest IDT */
  78.279 +    error |= __vmwrite(GUEST_IDTR_BASE, 0);
  78.280 +    error |= __vmwrite(GUEST_IDTR_LIMIT, 0);
  78.281 +
  78.282 +    /* Guest LDT & TSS */
  78.283      arbytes.fields.s = 0;                   /* not code or data segment */
  78.284      arbytes.fields.seg_type = 0x2;          /* LDT */
  78.285      arbytes.fields.default_ops_size = 0;    /* 16-bit */
  78.286 -    arbytes.fields.g = 0;   
  78.287 +    arbytes.fields.g = 0;
  78.288      error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
  78.289  
  78.290      arbytes.fields.seg_type = 0xb;          /* 32-bit TSS (busy) */
  78.291      error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
  78.292      /* CR3 is set in vmx_final_setup_guest */
  78.293  
  78.294 -    error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
  78.295 -    error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
  78.296 -    error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
  78.297 -    error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
  78.298 -    error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
  78.299 -    error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
  78.300 -    error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
  78.301 -
  78.302 -    error |= __vmwrite(GUEST_RSP, regs->esp);
  78.303 +    error |= __vmwrite(GUEST_RSP, 0);
  78.304      error |= __vmwrite(GUEST_RIP, regs->eip);
  78.305  
  78.306 +    /* Guest EFLAGS */
  78.307      eflags = regs->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */
  78.308      eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */
  78.309 -
  78.310      error |= __vmwrite(GUEST_RFLAGS, eflags);
  78.311  
  78.312      error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
  78.313 @@ -381,14 +387,14 @@ static inline int construct_vmcs_host(st
  78.314  #if defined (__i386__)
  78.315      error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector);
  78.316      error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector);
  78.317 -    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base); 
  78.318 -    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base); 
  78.319 +    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base);
  78.320 +    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base);
  78.321  
  78.322  #else
  78.323 -    rdmsrl(MSR_FS_BASE, host_env->fs_base); 
  78.324 -    rdmsrl(MSR_GS_BASE, host_env->gs_base); 
  78.325 -    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base); 
  78.326 -    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base); 
  78.327 +    rdmsrl(MSR_FS_BASE, host_env->fs_base);
  78.328 +    rdmsrl(MSR_GS_BASE, host_env->gs_base);
  78.329 +    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base);
  78.330 +    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base);
  78.331  
  78.332  #endif
  78.333      host_env->cs_selector = __HYPERVISOR_CS;
  78.334 @@ -401,16 +407,16 @@ static inline int construct_vmcs_host(st
  78.335      error |= __vmwrite(HOST_CR0, crn); /* same CR0 */
  78.336  
  78.337      /* CR3 is set in vmx_final_setup_hostos */
  78.338 -    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : ); 
  78.339 +    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : );
  78.340      error |= __vmwrite(HOST_CR4, crn);
  78.341  
  78.342      error |= __vmwrite(HOST_RIP, (unsigned long) vmx_asm_vmexit_handler);
  78.343 -#ifdef __x86_64__ 
  78.344 -    /* TBD: support cr8 for 64-bit guest */ 
  78.345 -    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0); 
  78.346 -    __vmwrite(TPR_THRESHOLD, 0); 
  78.347 -    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0); 
  78.348 -#endif 
  78.349 +#ifdef __x86_64__
  78.350 +    /* TBD: support cr8 for 64-bit guest */
  78.351 +    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
  78.352 +    __vmwrite(TPR_THRESHOLD, 0);
  78.353 +    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
  78.354 +#endif
  78.355  
  78.356      return error;
  78.357  }
  78.358 @@ -440,37 +446,37 @@ int construct_vmcs(struct arch_vmx_struc
  78.359  
  78.360      if ((error = __vmpclear (vmcs_phys_ptr))) {
  78.361          printk("construct_vmcs: VMCLEAR failed\n");
  78.362 -        return -EINVAL;         
  78.363 +        return -EINVAL;
  78.364      }
  78.365      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  78.366          printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n",
  78.367                 (unsigned long) vmcs_phys_ptr);
  78.368 -        return -EINVAL; 
  78.369 +        return -EINVAL;
  78.370      }
  78.371      if ((error = construct_vmcs_controls(arch_vmx))) {
  78.372          printk("construct_vmcs: construct_vmcs_controls failed\n");
  78.373 -        return -EINVAL;         
  78.374 +        return -EINVAL;
  78.375      }
  78.376      /* host selectors */
  78.377      if ((error = construct_vmcs_host(&host_env))) {
  78.378          printk("construct_vmcs: construct_vmcs_host failed\n");
  78.379 -        return -EINVAL;         
  78.380 +        return -EINVAL;
  78.381      }
  78.382      /* guest selectors */
  78.383 -    if ((error = construct_init_vmcs_guest(regs, ctxt, &host_env))) {
  78.384 +    if ((error = construct_init_vmcs_guest(regs))) {
  78.385          printk("construct_vmcs: construct_vmcs_guest failed\n");
  78.386 -        return -EINVAL;         
  78.387 -    }       
  78.388 +        return -EINVAL;
  78.389 +    }
  78.390  
  78.391 -    if ((error |= __vmwrite(EXCEPTION_BITMAP, 
  78.392 +    if ((error |= __vmwrite(EXCEPTION_BITMAP,
  78.393                              MONITOR_DEFAULT_EXCEPTION_BITMAP))) {
  78.394          printk("construct_vmcs: setting Exception bitmap failed\n");
  78.395 -        return -EINVAL;         
  78.396 +        return -EINVAL;
  78.397      }
  78.398  
  78.399      if (regs->eflags & EF_TF)
  78.400          __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  78.401 -    else 
  78.402 +    else
  78.403          __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  78.404  
  78.405      return 0;
  78.406 @@ -491,7 +497,7 @@ int modify_vmcs(struct arch_vmx_struct *
  78.407      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  78.408          printk("modify_vmcs: load_vmcs failed: VMCS = %lx\n",
  78.409                 (unsigned long) vmcs_phys_ptr);
  78.410 -        return -EINVAL; 
  78.411 +        return -EINVAL;
  78.412      }
  78.413      load_cpu_user_regs(regs);
  78.414  
  78.415 @@ -500,23 +506,23 @@ int modify_vmcs(struct arch_vmx_struct *
  78.416      return 0;
  78.417  }
  78.418  
  78.419 -int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  78.420 +int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  78.421  {
  78.422      int error;
  78.423  
  78.424      if ((error = __vmptrld(phys_ptr))) {
  78.425 -        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  78.426 +        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  78.427          return error;
  78.428      }
  78.429 -    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  78.430 +    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  78.431      return 0;
  78.432  }
  78.433  
  78.434 -int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  78.435 +int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  78.436  {
  78.437      /* take the current VMCS */
  78.438      __vmptrst(phys_ptr);
  78.439 -    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  78.440 +    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  78.441      return 0;
  78.442  }
  78.443  
  78.444 @@ -536,7 +542,7 @@ void vm_resume_fail(unsigned long eflags
  78.445      __vmx_bug(guest_cpu_user_regs());
  78.446  }
  78.447  
  78.448 -void arch_vmx_do_resume(struct vcpu *v) 
  78.449 +void arch_vmx_do_resume(struct vcpu *v)
  78.450  {
  78.451      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  78.452  
  78.453 @@ -545,7 +551,7 @@ void arch_vmx_do_resume(struct vcpu *v)
  78.454      reset_stack_and_jump(vmx_asm_do_resume);
  78.455  }
  78.456  
  78.457 -void arch_vmx_do_launch(struct vcpu *v) 
  78.458 +void arch_vmx_do_launch(struct vcpu *v)
  78.459  {
  78.460      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  78.461  
    79.1 --- a/xen/common/grant_table.c	Thu Sep 29 13:35:13 2005 -0600
    79.2 +++ b/xen/common/grant_table.c	Thu Sep 29 16:22:02 2005 -0600
    79.3 @@ -24,10 +24,6 @@
    79.4   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    79.5   */
    79.6  
    79.7 -#define GRANT_DEBUG 0
    79.8 -#define GRANT_DEBUG_VERBOSE 0
    79.9 -
   79.10 -#include <xen/config.h>
   79.11  #include <xen/lib.h>
   79.12  #include <xen/sched.h>
   79.13  #include <xen/shadow.h>
   79.14 @@ -68,39 +64,32 @@ put_maptrack_handle(
   79.15      t->map_count--;
   79.16  }
   79.17  
   79.18 +/*
   79.19 + * Returns 0 if TLB flush / invalidate required by caller.
   79.20 + * va will indicate the address to be invalidated.
   79.21 + * 
   79.22 + * addr is _either_ a host virtual address, or the address of the pte to
   79.23 + * update, as indicated by the GNTMAP_contains_pte flag.
   79.24 + */
   79.25  static int
   79.26 -__gnttab_activate_grant_ref(
   79.27 -    struct domain   *mapping_d,          /* IN */
   79.28 -    struct vcpu     *mapping_ed,
   79.29 -    struct domain   *granting_d,
   79.30 -    grant_ref_t      ref,
   79.31 -    u16              dev_hst_ro_flags,
   79.32 -    u64              addr,
   79.33 -    unsigned long   *pframe )            /* OUT */
   79.34 +__gnttab_map_grant_ref(
   79.35 +    gnttab_map_grant_ref_t *uop)
   79.36  {
   79.37 -    domid_t               sdom;
   79.38 -    u16                   sflags;
   79.39 +    domid_t        dom;
   79.40 +    grant_ref_t    ref;
   79.41 +    struct domain *ld, *rd;
   79.42 +    struct vcpu   *led;
   79.43 +    u16            dev_hst_ro_flags;
   79.44 +    int            handle;
   79.45 +    u64            addr;
   79.46 +    unsigned long  frame = 0;
   79.47 +    int            rc = GNTST_okay;
   79.48      active_grant_entry_t *act;
   79.49 -    grant_entry_t        *sha;
   79.50 -    s16                   rc = 1;
   79.51 -    unsigned long         frame = 0;
   79.52 -    int                   retries = 0;
   79.53  
   79.54 -    /*
   79.55 -     * Objectives of this function:
   79.56 -     * . Make the record ( granting_d, ref ) active, if not already.
   79.57 -     * . Update shared grant entry of owner, indicating frame is mapped.
   79.58 -     * . Increment the owner act->pin reference counts.
   79.59 -     * . get_page on shared frame if new mapping.
   79.60 -     * . get_page_type if this is first RW mapping of frame.
   79.61 -     * . Add PTE to virtual address space of mapping_d, if necessary.
   79.62 -     * Returns:
   79.63 -     * .  -ve: error
   79.64 -     * .    1: ok
   79.65 -     * .    0: ok and TLB invalidate of host_addr needed.
   79.66 -     *
   79.67 -     * On success, *pframe contains mfn.
   79.68 -     */
   79.69 +    /* Entry details from @rd's shared grant table. */
   79.70 +    grant_entry_t *sha;
   79.71 +    domid_t        sdom;
   79.72 +    u16            sflags;
   79.73  
   79.74      /*
   79.75       * We bound the number of times we retry CMPXCHG on memory locations that
   79.76 @@ -110,11 +99,88 @@ static int
   79.77       * the guest to race our updates (e.g., to change the GTF_readonly flag),
   79.78       * so we allow a few retries before failing.
   79.79       */
   79.80 +    int retries = 0;
   79.81  
   79.82 -    act = &granting_d->grant_table->active[ref];
   79.83 -    sha = &granting_d->grant_table->shared[ref];
   79.84 +    led = current;
   79.85 +    ld = led->domain;
   79.86 +
   79.87 +    /* Bitwise-OR avoids short-circuiting which screws control flow. */
   79.88 +    if ( unlikely(__get_user(dom, &uop->dom) |
   79.89 +                  __get_user(ref, &uop->ref) |
   79.90 +                  __get_user(addr, &uop->host_addr) |
   79.91 +                  __get_user(dev_hst_ro_flags, &uop->flags)) )
   79.92 +    {
   79.93 +        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
   79.94 +        return -EFAULT; /* don't set status */
   79.95 +    }
   79.96 +
   79.97 +    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
   79.98 +         unlikely((dev_hst_ro_flags &
   79.99 +                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  79.100 +    {
  79.101 +        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  79.102 +        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  79.103 +        return GNTST_bad_gntref;
  79.104 +    }
  79.105 +
  79.106 +    if ( acm_pre_grant_map_ref(dom) )
  79.107 +    {
  79.108 +        (void)__put_user(GNTST_permission_denied, &uop->handle);
  79.109 +        return GNTST_permission_denied;
  79.110 +    }
  79.111 +
  79.112 +    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  79.113 +         unlikely(ld == rd) )
  79.114 +    {
  79.115 +        if ( rd != NULL )
  79.116 +            put_domain(rd);
  79.117 +        DPRINTK("Could not find domain %d\n", dom);
  79.118 +        (void)__put_user(GNTST_bad_domain, &uop->handle);
  79.119 +        return GNTST_bad_domain;
  79.120 +    }
  79.121  
  79.122 -    spin_lock(&granting_d->grant_table->lock);
  79.123 +    /* Get a maptrack handle. */
  79.124 +    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  79.125 +    {
  79.126 +        int              i;
  79.127 +        grant_mapping_t *new_mt;
  79.128 +        grant_table_t   *lgt = ld->grant_table;
  79.129 +
  79.130 +        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  79.131 +        {
  79.132 +            put_domain(rd);
  79.133 +            DPRINTK("Maptrack table is at maximum size.\n");
  79.134 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  79.135 +            return GNTST_no_device_space;
  79.136 +        }
  79.137 +
  79.138 +        /* Grow the maptrack table. */
  79.139 +        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  79.140 +        if ( new_mt == NULL )
  79.141 +        {
  79.142 +            put_domain(rd);
  79.143 +            DPRINTK("No more map handles available.\n");
  79.144 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  79.145 +            return GNTST_no_device_space;
  79.146 +        }
  79.147 +
  79.148 +        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  79.149 +        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  79.150 +            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  79.151 +
  79.152 +        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  79.153 +        lgt->maptrack          = new_mt;
  79.154 +        lgt->maptrack_order   += 1;
  79.155 +        lgt->maptrack_limit  <<= 1;
  79.156 +
  79.157 +        DPRINTK("Doubled maptrack size\n");
  79.158 +        handle = get_maptrack_handle(ld->grant_table);
  79.159 +    }
  79.160 +
  79.161 +    act = &rd->grant_table->active[ref];
  79.162 +    sha = &rd->grant_table->shared[ref];
  79.163 +
  79.164 +    spin_lock(&rd->grant_table->lock);
  79.165  
  79.166      if ( act->pin == 0 )
  79.167      {
  79.168 @@ -132,10 +198,10 @@ static int
  79.169              u32 scombo, prev_scombo, new_scombo;
  79.170  
  79.171              if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) ||
  79.172 -                 unlikely(sdom != mapping_d->domain_id) )
  79.173 +                 unlikely(sdom != led->domain->domain_id) )
  79.174                  PIN_FAIL(unlock_out, GNTST_general_error,
  79.175                           "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
  79.176 -                        sflags, sdom, mapping_d->domain_id);
  79.177 +                        sflags, sdom, led->domain->domain_id);
  79.178  
  79.179              /* Merge two 16-bit values into a 32-bit combined update. */
  79.180              /* NB. Endianness! */
  79.181 @@ -173,12 +239,12 @@ static int
  79.182  
  79.183          /* rmb(); */ /* not on x86 */
  79.184  
  79.185 -        frame = __gpfn_to_mfn_foreign(granting_d, sha->frame);
  79.186 +        frame = __gpfn_to_mfn_foreign(rd, sha->frame);
  79.187  
  79.188          if ( unlikely(!pfn_valid(frame)) ||
  79.189               unlikely(!((dev_hst_ro_flags & GNTMAP_readonly) ?
  79.190 -                        get_page(&frame_table[frame], granting_d) :
  79.191 -                        get_page_and_type(&frame_table[frame], granting_d,
  79.192 +                        get_page(&frame_table[frame], rd) :
  79.193 +                        get_page_and_type(&frame_table[frame], rd,
  79.194                                            PGT_writable_page))) )
  79.195          {
  79.196              clear_bit(_GTF_writing, &sha->flags);
  79.197 @@ -208,10 +274,11 @@ static int
  79.198              PIN_FAIL(unlock_out, ENOSPC,
  79.199                       "Risk of counter overflow %08x\n", act->pin);
  79.200  
  79.201 -        frame = act->frame;
  79.202 +        sflags = sha->flags;
  79.203 +        frame  = act->frame;
  79.204  
  79.205 -        if ( !(dev_hst_ro_flags & GNTMAP_readonly) && 
  79.206 -             !((sflags = sha->flags) & GTF_writing) )
  79.207 +        if ( !(dev_hst_ro_flags & GNTMAP_readonly) &&
  79.208 +             !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) )
  79.209          {
  79.210              for ( ; ; )
  79.211              {
  79.212 @@ -264,9 +331,9 @@ static int
  79.213       * frame contains the mfn.
  79.214       */
  79.215  
  79.216 -    spin_unlock(&granting_d->grant_table->lock);
  79.217 +    spin_unlock(&rd->grant_table->lock);
  79.218  
  79.219 -    if ( (addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) )
  79.220 +    if ( dev_hst_ro_flags & GNTMAP_host_map )
  79.221      {
  79.222          /* Write update into the pagetable. */
  79.223          l1_pgentry_t pte;
  79.224 @@ -278,18 +345,15 @@ static int
  79.225              l1e_add_flags(pte,_PAGE_RW);
  79.226  
  79.227          if ( dev_hst_ro_flags & GNTMAP_contains_pte )
  79.228 -            rc = update_grant_pte_mapping(addr, pte, mapping_d, mapping_ed);
  79.229 +            rc = update_grant_pte_mapping(addr, pte, led);
  79.230          else
  79.231 -            rc = update_grant_va_mapping(addr, pte, mapping_d, mapping_ed);
  79.232 +            rc = update_grant_va_mapping(addr, pte, led);
  79.233  
  79.234 -        /* IMPORTANT: rc indicates the degree of TLB flush that is required.
  79.235 -         * GNTST_flush_one (1) or GNTST_flush_all (2). This is done in the 
  79.236 -         * outer gnttab_map_grant_ref. */
  79.237          if ( rc < 0 )
  79.238          {
  79.239              /* Failure: undo and abort. */
  79.240  
  79.241 -            spin_lock(&granting_d->grant_table->lock);
  79.242 +            spin_lock(&rd->grant_table->lock);
  79.243  
  79.244              if ( dev_hst_ro_flags & GNTMAP_readonly )
  79.245              {
  79.246 @@ -311,186 +375,44 @@ static int
  79.247                  put_page(&frame_table[frame]);
  79.248              }
  79.249  
  79.250 -            spin_unlock(&granting_d->grant_table->lock);
  79.251 +            spin_unlock(&rd->grant_table->lock);
  79.252          }
  79.253 -
  79.254 -    }
  79.255 -
  79.256 -    *pframe = frame;
  79.257 -    return rc;
  79.258 -
  79.259 - unlock_out:
  79.260 -    spin_unlock(&granting_d->grant_table->lock);
  79.261 -    return rc;
  79.262 -}
  79.263 -
  79.264 -/*
  79.265 - * Returns 0 if TLB flush / invalidate required by caller.
  79.266 - * va will indicate the address to be invalidated.
  79.267 - * 
  79.268 - * addr is _either_ a host virtual address, or the address of the pte to
  79.269 - * update, as indicated by the GNTMAP_contains_pte flag.
  79.270 - */
  79.271 -static int
  79.272 -__gnttab_map_grant_ref(
  79.273 -    gnttab_map_grant_ref_t *uop,
  79.274 -    unsigned long *va)
  79.275 -{
  79.276 -    domid_t        dom;
  79.277 -    grant_ref_t    ref;
  79.278 -    struct domain *ld, *rd;
  79.279 -    struct vcpu   *led;
  79.280 -    u16            dev_hst_ro_flags;
  79.281 -    int            handle;
  79.282 -    u64            addr;
  79.283 -    unsigned long  frame = 0;
  79.284 -    int            rc;
  79.285 -
  79.286 -    led = current;
  79.287 -    ld = led->domain;
  79.288 -
  79.289 -    /* Bitwise-OR avoids short-circuiting which screws control flow. */
  79.290 -    if ( unlikely(__get_user(dom, &uop->dom) |
  79.291 -                  __get_user(ref, &uop->ref) |
  79.292 -                  __get_user(addr, &uop->host_addr) |
  79.293 -                  __get_user(dev_hst_ro_flags, &uop->flags)) )
  79.294 -    {
  79.295 -        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
  79.296 -        return -EFAULT; /* don't set status */
  79.297 -    }
  79.298 -
  79.299 -    if ( (dev_hst_ro_flags & GNTMAP_host_map) &&
  79.300 -         ( (addr == 0) ||
  79.301 -           (!(dev_hst_ro_flags & GNTMAP_contains_pte) && 
  79.302 -            unlikely(!__addr_ok(addr))) ) )
  79.303 -    {
  79.304 -        DPRINTK("Bad virtual address (%"PRIx64") or flags (%"PRIx16").\n",
  79.305 -                addr, dev_hst_ro_flags);
  79.306 -        (void)__put_user(GNTST_bad_virt_addr, &uop->handle);
  79.307 -        return GNTST_bad_gntref;
  79.308 -    }
  79.309 -
  79.310 -    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
  79.311 -         unlikely((dev_hst_ro_flags &
  79.312 -                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  79.313 -    {
  79.314 -        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  79.315 -        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  79.316 -        return GNTST_bad_gntref;
  79.317 -    }
  79.318 -
  79.319 -    if (acm_pre_grant_map_ref(dom)) {
  79.320 -        (void)__put_user(GNTST_permission_denied, &uop->handle);
  79.321 -        return GNTST_permission_denied;
  79.322      }
  79.323  
  79.324 -    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  79.325 -         unlikely(ld == rd) )
  79.326 -    {
  79.327 -        if ( rd != NULL )
  79.328 -            put_domain(rd);
  79.329 -        DPRINTK("Could not find domain %d\n", dom);
  79.330 -        (void)__put_user(GNTST_bad_domain, &uop->handle);
  79.331 -        return GNTST_bad_domain;
  79.332 -    }
  79.333 -
  79.334 -    /* Get a maptrack handle. */
  79.335 -    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  79.336 -    {
  79.337 -        int              i;
  79.338 -        grant_mapping_t *new_mt;
  79.339 -        grant_table_t   *lgt = ld->grant_table;
  79.340 -
  79.341 -        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  79.342 -        {
  79.343 -            put_domain(rd);
  79.344 -            DPRINTK("Maptrack table is at maximum size.\n");
  79.345 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  79.346 -            return GNTST_no_device_space;
  79.347 -        }
  79.348 -
  79.349 -        /* Grow the maptrack table. */
  79.350 -        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  79.351 -        if ( new_mt == NULL )
  79.352 -        {
  79.353 -            put_domain(rd);
  79.354 -            DPRINTK("No more map handles available.\n");
  79.355 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  79.356 -            return GNTST_no_device_space;
  79.357 -        }
  79.358 -
  79.359 -        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  79.360 -        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  79.361 -            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  79.362 +    ld->grant_table->maptrack[handle].domid         = dom;
  79.363 +    ld->grant_table->maptrack[handle].ref_and_flags =
  79.364 +        (ref << MAPTRACK_REF_SHIFT) |
  79.365 +        (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  79.366  
  79.367 -        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  79.368 -        lgt->maptrack          = new_mt;
  79.369 -        lgt->maptrack_order   += 1;
  79.370 -        lgt->maptrack_limit  <<= 1;
  79.371 -
  79.372 -        DPRINTK("Doubled maptrack size\n");
  79.373 -        handle = get_maptrack_handle(ld->grant_table);
  79.374 -    }
  79.375 -
  79.376 -#if GRANT_DEBUG_VERBOSE
  79.377 -    DPRINTK("Mapping grant ref (%hu) for domain (%hu) with flags (%x)\n",
  79.378 -            ref, dom, dev_hst_ro_flags);
  79.379 -#endif
  79.380 -
  79.381 -    if ( (rc = __gnttab_activate_grant_ref(ld, led, rd, ref, dev_hst_ro_flags,
  79.382 -                                           addr, &frame)) >= 0 )
  79.383 -    {
  79.384 -        /*
  79.385 -         * Only make the maptrack live _after_ writing the pte, in case we 
  79.386 -         * overwrite the same frame number, causing a maptrack walk to find it
  79.387 -         */
  79.388 -        ld->grant_table->maptrack[handle].domid = dom;
  79.389 -
  79.390 -        ld->grant_table->maptrack[handle].ref_and_flags
  79.391 -            = (ref << MAPTRACK_REF_SHIFT) |
  79.392 -              (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  79.393 -
  79.394 -        (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  79.395 -
  79.396 -        if ( ( dev_hst_ro_flags & GNTMAP_host_map ) &&
  79.397 -             !( dev_hst_ro_flags & GNTMAP_contains_pte) )
  79.398 -            *va = addr;
  79.399 -
  79.400 -        (void)__put_user(handle, &uop->handle);
  79.401 -    }
  79.402 -    else
  79.403 -    {
  79.404 -        (void)__put_user(rc, &uop->handle);
  79.405 -        put_maptrack_handle(ld->grant_table, handle);
  79.406 -    }
  79.407 +    (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  79.408 +    (void)__put_user(handle, &uop->handle);
  79.409  
  79.410      put_domain(rd);
  79.411      return rc;
  79.412 +
  79.413 +
  79.414 + unlock_out:
  79.415 +    spin_unlock(&rd->grant_table->lock);
  79.416 +    (void)__put_user(rc, &uop->handle);
  79.417 +    put_maptrack_handle(ld->grant_table, handle);
  79.418 +    return rc;
  79.419  }
  79.420  
  79.421  static long
  79.422  gnttab_map_grant_ref(
  79.423      gnttab_map_grant_ref_t *uop, unsigned int count)
  79.424  {
  79.425 -    int i, rc, flush = 0;
  79.426 -    unsigned long va = 0;
  79.427 +    int i;
  79.428  
  79.429      for ( i = 0; i < count; i++ )
  79.430 -        if ( (rc =__gnttab_map_grant_ref(&uop[i], &va)) >= 0 )
  79.431 -            flush += rc;
  79.432 -
  79.433 -    if ( flush == 1 )
  79.434 -        flush_tlb_one_mask(current->domain->cpumask, va);
  79.435 -    else if ( flush != 0 ) 
  79.436 -        flush_tlb_mask(current->domain->cpumask);
  79.437 +        (void)__gnttab_map_grant_ref(&uop[i]);
  79.438  
  79.439      return 0;
  79.440  }
  79.441  
  79.442  static int
  79.443  __gnttab_unmap_grant_ref(
  79.444 -    gnttab_unmap_grant_ref_t *uop,
  79.445 -    unsigned long *va)
  79.446 +    gnttab_unmap_grant_ref_t *uop)
  79.447  {
  79.448      domid_t          dom;
  79.449      grant_ref_t      ref;
  79.450 @@ -500,7 +422,7 @@ static int
  79.451      grant_entry_t   *sha;
  79.452      grant_mapping_t *map;
  79.453      u16              flags;
  79.454 -    s16              rc = 1;
  79.455 +    s16              rc = 0;
  79.456      u64              addr, dev_bus_addr;
  79.457      unsigned long    frame;
  79.458  
  79.459 @@ -541,11 +463,6 @@ static int
  79.460          return GNTST_bad_domain;
  79.461      }
  79.462  
  79.463 -#if GRANT_DEBUG_VERBOSE
  79.464 -    DPRINTK("Unmapping grant ref (%hu) for domain (%hu) with handle (%hu)\n",
  79.465 -            ref, dom, handle);
  79.466 -#endif
  79.467 -
  79.468      act = &rd->grant_table->active[ref];
  79.469      sha = &rd->grant_table->shared[ref];
  79.470  
  79.471 @@ -566,8 +483,6 @@ static int
  79.472  
  79.473          map->ref_and_flags &= ~GNTMAP_device_map;
  79.474          (void)__put_user(0, &uop->dev_bus_addr);
  79.475 -
  79.476 -        /* Frame is now unmapped for device access. */
  79.477      }
  79.478  
  79.479      if ( (addr != 0) &&
  79.480 @@ -589,10 +504,6 @@ static int
  79.481  
  79.482          act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc
  79.483                                                : GNTPIN_hstw_inc;
  79.484 -
  79.485 -        rc = 0;
  79.486 -        if ( !( flags & GNTMAP_contains_pte) )
  79.487 -            *va = addr;
  79.488      }
  79.489  
  79.490      if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0)
  79.491 @@ -632,17 +543,12 @@ static long
  79.492  gnttab_unmap_grant_ref(
  79.493      gnttab_unmap_grant_ref_t *uop, unsigned int count)
  79.494  {
  79.495 -    int i, flush = 0;
  79.496 -    unsigned long va = 0;
  79.497 +    int i;
  79.498  
  79.499      for ( i = 0; i < count; i++ )
  79.500 -        if ( __gnttab_unmap_grant_ref(&uop[i], &va) == 0 )
  79.501 -            flush++;
  79.502 +        (void)__gnttab_unmap_grant_ref(&uop[i]);
  79.503  
  79.504 -    if ( flush == 1 )
  79.505 -        flush_tlb_one_mask(current->domain->cpumask, va);
  79.506 -    else if ( flush != 0 ) 
  79.507 -        flush_tlb_mask(current->domain->cpumask);
  79.508 +    flush_tlb_mask(current->domain->cpumask);
  79.509  
  79.510      return 0;
  79.511  }
  79.512 @@ -703,9 +609,9 @@ gnttab_setup_table(
  79.513      return 0;
  79.514  }
  79.515  
  79.516 -#if GRANT_DEBUG
  79.517  static int
  79.518 -gnttab_dump_table(gnttab_dump_table_t *uop)
  79.519 +gnttab_dump_table(
  79.520 +    gnttab_dump_table_t *uop)
  79.521  {
  79.522      grant_table_t        *gt;
  79.523      gnttab_dump_table_t   op;
  79.524 @@ -716,6 +622,8 @@ gnttab_dump_table(gnttab_dump_table_t *u
  79.525      grant_mapping_t      *maptrack;
  79.526      int                   i;
  79.527  
  79.528 +    if ( !IS_PRIV(current->domain) )
  79.529 +        return -EPERM;
  79.530  
  79.531      if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
  79.532      {
  79.533 @@ -724,9 +632,7 @@ gnttab_dump_table(gnttab_dump_table_t *u
  79.534      }
  79.535  
  79.536      if ( op.dom == DOMID_SELF )
  79.537 -    {
  79.538          op.dom = current->domain->domain_id;
  79.539 -    }
  79.540  
  79.541      if ( unlikely((d = find_domain_by_id(op.dom)) == NULL) )
  79.542      {
  79.543 @@ -750,14 +656,11 @@ gnttab_dump_table(gnttab_dump_table_t *u
  79.544  
  79.545      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  79.546      {
  79.547 -        sha_copy =  gt->shared[i];
  79.548 -
  79.549 +        sha_copy = gt->shared[i];
  79.550          if ( sha_copy.flags )
  79.551 -        {
  79.552              DPRINTK("Grant: dom (%hu) SHARED (%d) flags:(%hx) "
  79.553                      "dom:(%hu) frame:(%x)\n",
  79.554                      op.dom, i, sha_copy.flags, sha_copy.domid, sha_copy.frame);
  79.555 -        }
  79.556      }
  79.557  
  79.558      spin_lock(&gt->lock);
  79.559 @@ -765,28 +668,22 @@ gnttab_dump_table(gnttab_dump_table_t *u
  79.560      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  79.561      {
  79.562          act = &gt->active[i];
  79.563 -
  79.564          if ( act->pin )
  79.565 -        {
  79.566              DPRINTK("Grant: dom (%hu) ACTIVE (%d) pin:(%x) "
  79.567                      "dom:(%hu) frame:(%lx)\n",
  79.568                      op.dom, i, act->pin, act->domid, act->frame);
  79.569 -        }
  79.570      }
  79.571  
  79.572      for ( i = 0; i < gt->maptrack_limit; i++ )
  79.573      {
  79.574          maptrack = &gt->maptrack[i];
  79.575 -
  79.576          if ( maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK )
  79.577 -        {
  79.578              DPRINTK("Grant: dom (%hu) MAP (%d) ref:(%hu) flags:(%x) "
  79.579                      "dom:(%hu)\n",
  79.580                      op.dom, i,
  79.581                      maptrack->ref_and_flags >> MAPTRACK_REF_SHIFT,
  79.582                      maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK,
  79.583                      maptrack->domid);
  79.584 -        }
  79.585      }
  79.586  
  79.587      spin_unlock(&gt->lock);
  79.588 @@ -794,10 +691,10 @@ gnttab_dump_table(gnttab_dump_table_t *u
  79.589      put_domain(d);
  79.590      return 0;
  79.591  }
  79.592 -#endif
  79.593  
  79.594  static long
  79.595 -gnttab_transfer(gnttab_transfer_t *uop, unsigned int count)
  79.596 +gnttab_transfer(
  79.597 +    gnttab_transfer_t *uop, unsigned int count)
  79.598  {
  79.599      struct domain *d = current->domain;
  79.600      struct domain *e;
  79.601 @@ -810,10 +707,7 @@ gnttab_transfer(gnttab_transfer_t *uop, 
  79.602      for ( i = 0; i < count; i++ )
  79.603      {
  79.604          gnttab_transfer_t *gop = &uop[i];
  79.605 -#if GRANT_DEBUG
  79.606 -        printk("gnttab_transfer: i=%d mfn=%lx domid=%d gref=%08x\n",
  79.607 -               i, gop->mfn, gop->domid, gop->handle);
  79.608 -#endif
  79.609 +
  79.610          page = &frame_table[gop->mfn];
  79.611          
  79.612          if ( unlikely(IS_XEN_HEAP_FRAME(page)))
  79.613 @@ -956,11 +850,9 @@ do_grant_table_op(
  79.614      case GNTTABOP_setup_table:
  79.615          rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count);
  79.616          break;
  79.617 -#if GRANT_DEBUG
  79.618      case GNTTABOP_dump_table:
  79.619          rc = gnttab_dump_table((gnttab_dump_table_t *)uop);
  79.620          break;
  79.621 -#endif
  79.622      case GNTTABOP_transfer:
  79.623          if (unlikely(!array_access_ok(
  79.624              uop, count, sizeof(gnttab_transfer_t))))
  79.625 @@ -1002,12 +894,6 @@ gnttab_check_unmap(
  79.626      
  79.627      lgt = ld->grant_table;
  79.628      
  79.629 -#if GRANT_DEBUG_VERBOSE
  79.630 -    if ( ld->domain_id != 0 )
  79.631 -        DPRINTK("Foreign unref rd(%d) ld(%d) frm(%lx) flgs(%x).\n",
  79.632 -                rd->domain_id, ld->domain_id, frame, readonly);
  79.633 -#endif
  79.634 -    
  79.635      /* Fast exit if we're not mapping anything using grant tables */
  79.636      if ( lgt->map_count == 0 )
  79.637          return 0;
  79.638 @@ -1098,11 +984,6 @@ gnttab_prepare_for_transfer(
  79.639      int            retries = 0;
  79.640      unsigned long  target_pfn;
  79.641  
  79.642 -#if GRANT_DEBUG_VERBOSE
  79.643 -    DPRINTK("gnttab_prepare_for_transfer rd(%hu) ld(%hu) ref(%hu).\n",
  79.644 -            rd->domain_id, ld->domain_id, ref);
  79.645 -#endif
  79.646 -
  79.647      if ( unlikely((rgt = rd->grant_table) == NULL) ||
  79.648           unlikely(ref >= NR_GRANT_ENTRIES) )
  79.649      {
    81.1 --- a/xen/include/asm-x86/e820.h	Thu Sep 29 13:35:13 2005 -0600
    81.2 +++ b/xen/include/asm-x86/e820.h	Thu Sep 29 16:22:02 2005 -0600
    81.3 @@ -11,6 +11,11 @@
    81.4  #define E820_NVS          4
    81.5  #define E820_IO          16
    81.6  #define E820_SHARED_PAGE 17
    81.7 +#define E820_XENSTORE    18
    81.8 +
    81.9 +#define E820_MAP_PAGE        0x00090000
   81.10 +#define E820_MAP_NR_OFFSET   0x000001E8
   81.11 +#define E820_MAP_OFFSET      0x000002D0
   81.12  
   81.13  #ifndef __ASSEMBLY__
   81.14  struct e820entry {
    82.1 --- a/xen/include/asm-x86/mm.h	Thu Sep 29 13:35:13 2005 -0600
    82.2 +++ b/xen/include/asm-x86/mm.h	Thu Sep 29 16:22:02 2005 -0600
    82.3 @@ -380,11 +380,9 @@ extern int __sync_lazy_execstate(void);
    82.4   * hold a reference to the page.
    82.5   */
    82.6  int update_grant_va_mapping(
    82.7 -    unsigned long va, l1_pgentry_t _nl1e, 
    82.8 -    struct domain *d, struct vcpu *v);
    82.9 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v);
   82.10  int update_grant_pte_mapping(
   82.11 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
   82.12 -    struct domain *d, struct vcpu *v);
   82.13 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v);
   82.14  int clear_grant_va_mapping(unsigned long addr, unsigned long frame);
   82.15  int clear_grant_pte_mapping(
   82.16      unsigned long addr, unsigned long frame, struct domain *d);
    83.1 --- a/xen/include/asm-x86/vmx_platform.h	Thu Sep 29 13:35:13 2005 -0600
    83.2 +++ b/xen/include/asm-x86/vmx_platform.h	Thu Sep 29 16:22:02 2005 -0600
    83.3 @@ -93,7 +93,6 @@ struct virtual_platform_def {
    83.4  
    83.5  extern void handle_mmio(unsigned long, unsigned long);
    83.6  extern void vmx_wait_io(void);
    83.7 -extern int vmx_setup_platform(struct vcpu *, struct cpu_user_regs *);
    83.8  extern void vmx_io_assist(struct vcpu *v);
    83.9  
   83.10  // XXX - think about this -- maybe use bit 30 of the mfn to signify an MMIO frame.
    84.1 --- a/xen/include/xen/grant_table.h	Thu Sep 29 13:35:13 2005 -0600
    84.2 +++ b/xen/include/xen/grant_table.h	Thu Sep 29 16:22:02 2005 -0600
    84.3 @@ -110,8 +110,4 @@ gnttab_prepare_for_transfer(
    84.4  void
    84.5  gnttab_release_dev_mappings(grant_table_t *gt);
    84.6