ia64/xen-unstable

changeset 7063:ef9591d03fdd

Merge latest xen-unstable into xen-ia64-unstable
author djm@kirby.fc.hp.com
date Mon Sep 26 11:07:49 2005 -0600 (2005-09-26)
parents eaedc6b4ec0f 811559fb02ab
children 4e1031ce3bc2
files Makefile docs/src/user/installation.tex linux-2.6-xen-sparse/arch/ia64/Kconfig linux-2.6-xen-sparse/arch/ia64/xen-mkbuildtree-pre linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_ia64 linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/xen/kernel/reboot.c linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypervisor.h linux-2.6-xen-sparse/include/asm-xen/xenbus.h tools/check/check_hotplug tools/examples/Makefile tools/examples/xmexample.vmx tools/firmware/vmxassist/Makefile tools/firmware/vmxassist/vmxloader.c tools/ioemu/hw/cirrus_vga.c tools/ioemu/hw/pc.c tools/ioemu/hw/vga.c tools/ioemu/target-i386-dm/helper2.c tools/ioemu/vl.c tools/libxc/xc_vmx_build.c tools/libxc/xenguest.h tools/libxc/xg_private.h tools/python/xen/lowlevel/xc/xc.c tools/python/xen/lowlevel/xs/xs.c tools/python/xen/xend/PrettyPrint.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/image.py tools/python/xen/xend/server/DevController.py tools/python/xen/xend/xenstore/xsnode.py tools/python/xen/xend/xenstore/xstransact.py tools/python/xen/xm/main.py tools/xenstore/Makefile tools/xenstore/speedtest.c tools/xenstore/tdb.c tools/xenstore/tdb.h tools/xenstore/testsuite/04rm.test tools/xenstore/testsuite/08transaction.slowtest tools/xenstore/testsuite/08transaction.test tools/xenstore/testsuite/12readonly.test tools/xenstore/testsuite/14complexperms.test tools/xenstore/testsuite/16block-watch-crash.test tools/xenstore/xenstore_client.c tools/xenstore/xenstored.h tools/xenstore/xenstored_core.c tools/xenstore/xenstored_core.h tools/xenstore/xenstored_domain.c tools/xenstore/xenstored_transaction.c 
tools/xenstore/xenstored_transaction.h tools/xenstore/xenstored_watch.c tools/xenstore/xenstored_watch.h tools/xenstore/xs.c tools/xenstore/xs.h tools/xenstore/xs_lib.c tools/xenstore/xs_lib.h tools/xenstore/xs_random.c tools/xenstore/xs_stress.c tools/xenstore/xs_tdb_dump.c tools/xenstore/xs_test.c xen/arch/x86/mm.c xen/arch/x86/vmx_vmcs.c xen/common/grant_table.c xen/include/asm-x86/e820.h xen/include/asm-x86/mm.h xen/include/asm-x86/vmx_platform.h xen/include/xen/grant_table.h
line diff
     1.1 --- a/Makefile	Fri Sep 23 15:41:28 2005 -0600
     1.2 +++ b/Makefile	Mon Sep 26 11:07:49 2005 -0600
     1.3 @@ -164,7 +164,7 @@ help:
     1.4  uninstall: DESTDIR=
     1.5  uninstall: D=$(DESTDIR)
     1.6  uninstall:
     1.7 -	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s`
     1.8 +	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s` || true
     1.9  	rm -rf $(D)/etc/init.d/xend*
    1.10  	rm -rf $(D)/etc/hotplug/xen-backend.agent
    1.11  	rm -rf $(D)/var/run/xen* $(D)/var/lib/xen*
     2.1 --- a/docs/src/user/installation.tex	Fri Sep 23 15:41:28 2005 -0600
     2.2 +++ b/docs/src/user/installation.tex	Mon Sep 26 11:07:49 2005 -0600
     2.3 @@ -21,6 +21,9 @@ required if you wish to build from sourc
     2.4  \item [$\dag$] The \path{iproute2} package.
     2.5  \item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt
     2.6        http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl})
     2.7 +\item [$\dag$] The Linux hotplug system\footnote{Available from {\tt
     2.8 +      http://linux-hotplug.sourceforge.net/}} (e.g., \path{/sbin/hotplug}
     2.9 +      and related scripts)
    2.10  \item [$\dag$] An installation of Twisted~v1.3 or
    2.11    above\footnote{Available from {\tt http://www.twistedmatrix.com}}.
    2.12    There may be a binary package available for your distribution;
     3.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Fri Sep 23 15:41:28 2005 -0600
     3.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c	Mon Sep 26 11:07:49 2005 -0600
     3.3 @@ -1394,9 +1394,7 @@ static void handle_vcpu_hotplug_event(st
     3.4  			return;
     3.5  
     3.6  		/* get the state value */
     3.7 -		xenbus_transaction_start("cpu");
     3.8  		err = xenbus_scanf(dir, "availability", "%s", state);
     3.9 -		xenbus_transaction_end(0);
    3.10  
    3.11  		if (err != 1) {
    3.12  			printk(KERN_ERR
     4.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Fri Sep 23 15:41:28 2005 -0600
     4.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Mon Sep 26 11:07:49 2005 -0600
     4.3 @@ -324,7 +324,7 @@ static void shutdown_handler(struct xenb
     4.4      int err;
     4.5  
     4.6   again:
     4.7 -    err = xenbus_transaction_start("control");
     4.8 +    err = xenbus_transaction_start();
     4.9      if (err)
    4.10  	return;
    4.11      str = (char *)xenbus_read("control", "shutdown", NULL);
    4.12 @@ -337,7 +337,7 @@ static void shutdown_handler(struct xenb
    4.13      xenbus_write("control", "shutdown", "");
    4.14  
    4.15      err = xenbus_transaction_end(0);
    4.16 -    if (err == -ETIMEDOUT) {
    4.17 +    if (err == -EAGAIN) {
    4.18  	kfree(str);
    4.19  	goto again;
    4.20      }
    4.21 @@ -366,7 +366,7 @@ static void sysrq_handler(struct xenbus_
    4.22      int err;
    4.23  
    4.24   again:
    4.25 -    err = xenbus_transaction_start("control");
    4.26 +    err = xenbus_transaction_start();
    4.27      if (err)
    4.28  	return;
    4.29      if (!xenbus_scanf("control", "sysrq", "%c", &sysrq_key)) {
    4.30 @@ -379,7 +379,7 @@ static void sysrq_handler(struct xenbus_
    4.31  	xenbus_printf("control", "sysrq", "%c", '\0');
    4.32  
    4.33      err = xenbus_transaction_end(0);
    4.34 -    if (err == -ETIMEDOUT)
    4.35 +    if (err == -EAGAIN)
    4.36  	goto again;
    4.37  
    4.38      if (sysrq_key != '\0') {
     5.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Fri Sep 23 15:41:28 2005 -0600
     5.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Mon Sep 26 11:07:49 2005 -0600
     5.3 @@ -80,8 +80,9 @@ static void frontend_changed(struct xenb
     5.4  		return;
     5.5  	}
     5.6  
     5.7 +again:
     5.8  	/* Supply the information about the device the frontend needs */
     5.9 -	err = xenbus_transaction_start(be->dev->nodename);
    5.10 +	err = xenbus_transaction_start();
    5.11  	if (err) {
    5.12  		xenbus_dev_error(be->dev, err, "starting transaction");
    5.13  		return;
    5.14 @@ -119,7 +120,15 @@ static void frontend_changed(struct xenb
    5.15  		goto abort;
    5.16  	}
    5.17  
    5.18 -	xenbus_transaction_end(0);
    5.19 +	err = xenbus_transaction_end(0);
    5.20 +	if (err == -EAGAIN)
    5.21 +		goto again;
    5.22 +	if (err) {
    5.23 +		xenbus_dev_error(be->dev, err, "ending transaction",
    5.24 +				 ring_ref, evtchn);
    5.25 +		goto abort;
    5.26 +	}
    5.27 +
    5.28  	xenbus_dev_ok(be->dev);
    5.29  
    5.30  	return;
     6.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Fri Sep 23 15:41:28 2005 -0600
     6.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c	Mon Sep 26 11:07:49 2005 -0600
     6.3 @@ -572,7 +572,8 @@ static int talk_to_backend(struct xenbus
     6.4  		goto out;
     6.5  	}
     6.6  
     6.7 -	err = xenbus_transaction_start(dev->nodename);
     6.8 +again:
     6.9 +	err = xenbus_transaction_start();
    6.10  	if (err) {
    6.11  		xenbus_dev_error(dev, err, "starting transaction");
    6.12  		goto destroy_blkring;
    6.13 @@ -603,6 +604,8 @@ static int talk_to_backend(struct xenbus
    6.14  
    6.15  	err = xenbus_transaction_end(0);
    6.16  	if (err) {
    6.17 +		if (err == -EAGAIN)
    6.18 +			goto again;
    6.19  		xenbus_dev_error(dev, err, "completing transaction");
    6.20  		goto destroy_blkring;
    6.21  	}
     7.1 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Fri Sep 23 15:41:28 2005 -0600
     7.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Mon Sep 26 11:07:49 2005 -0600
     7.3 @@ -1122,7 +1122,8 @@ static int talk_to_backend(struct xenbus
     7.4  		goto out;
     7.5  	}
     7.6  
     7.7 -	err = xenbus_transaction_start(dev->nodename);
     7.8 +again:
     7.9 +	err = xenbus_transaction_start();
    7.10  	if (err) {
    7.11  		xenbus_dev_error(dev, err, "starting transaction");
    7.12  		goto destroy_ring;
    7.13 @@ -1160,6 +1161,8 @@ static int talk_to_backend(struct xenbus
    7.14  
    7.15  	err = xenbus_transaction_end(0);
    7.16  	if (err) {
    7.17 +		if (err == -EAGAIN)
    7.18 +			goto again;
    7.19  		xenbus_dev_error(dev, err, "completing transaction");
    7.20  		goto destroy_ring;
    7.21  	}
     8.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Fri Sep 23 15:41:28 2005 -0600
     8.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Mon Sep 26 11:07:49 2005 -0600
     8.3 @@ -93,7 +93,8 @@ static void frontend_changed(struct xenb
     8.4  	 * Tell the front-end that we are ready to go -
     8.5  	 * unless something bad happens
     8.6  	 */
     8.7 -	err = xenbus_transaction_start(be->dev->nodename);
     8.8 +again:
     8.9 +	err = xenbus_transaction_start();
    8.10  	if (err) {
    8.11  		xenbus_dev_error(be->dev, err, "starting transaction");
    8.12  		return;
    8.13 @@ -127,7 +128,14 @@ static void frontend_changed(struct xenb
    8.14  		goto abort;
    8.15  	}
    8.16  
    8.17 -	xenbus_transaction_end(0);
    8.18 +	err = xenbus_transaction_end(0);
    8.19 +	if (err == -EAGAIN)
    8.20 +		goto again;
    8.21 +	if (err) {
    8.22 +		xenbus_dev_error(be->dev, err, "end of transaction");
    8.23 +		goto abort;
    8.24 +	}
    8.25 +
    8.26  	xenbus_dev_ok(be->dev);
    8.27  	return;
    8.28  abort:
     9.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Fri Sep 23 15:41:28 2005 -0600
     9.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Mon Sep 26 11:07:49 2005 -0600
     9.3 @@ -331,7 +331,8 @@ static int talk_to_backend(struct xenbus
     9.4  		goto out;
     9.5  	}
     9.6  
     9.7 -	err = xenbus_transaction_start(dev->nodename);
     9.8 +again:
     9.9 +	err = xenbus_transaction_start();
    9.10  	if (err) {
    9.11  		xenbus_dev_error(dev, err, "starting transaction");
    9.12  		goto destroy_tpmring;
    9.13 @@ -363,6 +364,8 @@ static int talk_to_backend(struct xenbus
    9.14  	}
    9.15  
    9.16  	err = xenbus_transaction_end(0);
    9.17 +	if (err == -EAGAIN)
    9.18 +		goto again;
    9.19  	if (err) {
    9.20  		xenbus_dev_error(dev, err, "completing transaction");
    9.21  		goto destroy_tpmring;
    10.1 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Fri Sep 23 15:41:28 2005 -0600
    10.2 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c	Mon Sep 26 11:07:49 2005 -0600
    10.3 @@ -287,12 +287,11 @@ EXPORT_SYMBOL(xenbus_rm);
    10.4  
    10.5  /* Start a transaction: changes by others will not be seen during this
    10.6   * transaction, and changes will not be visible to others until end.
    10.7 - * Transaction only applies to the given subtree.
    10.8   * You can only have one transaction at any time.
    10.9   */
   10.10 -int xenbus_transaction_start(const char *subtree)
   10.11 +int xenbus_transaction_start(void)
   10.12  {
   10.13 -	return xs_error(xs_single(XS_TRANSACTION_START, subtree, NULL));
   10.14 +	return xs_error(xs_single(XS_TRANSACTION_START, "", NULL));
   10.15  }
   10.16  EXPORT_SYMBOL(xenbus_transaction_start);
   10.17  
    11.1 --- a/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Fri Sep 23 15:41:28 2005 -0600
    11.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/xenbus.h	Mon Sep 26 11:07:49 2005 -0600
    11.3 @@ -87,7 +87,7 @@ int xenbus_write(const char *dir, const 
    11.4  int xenbus_mkdir(const char *dir, const char *node);
    11.5  int xenbus_exists(const char *dir, const char *node);
    11.6  int xenbus_rm(const char *dir, const char *node);
    11.7 -int xenbus_transaction_start(const char *subtree);
    11.8 +int xenbus_transaction_start(void);
    11.9  int xenbus_transaction_end(int abort);
   11.10  
   11.11  /* Single read and scanf: returns -errno or num scanned if > 0. */
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/tools/check/check_hotplug	Mon Sep 26 11:07:49 2005 -0600
    12.3 @@ -0,0 +1,10 @@
    12.4 +#!/bin/bash
    12.5 +# CHECK-INSTALL
    12.6 +
    12.7 +function error {   # print the failure banner and abort the check with exit status 1
    12.8 +   echo
    12.9 +   echo '  *** Check for the hotplug scripts (hotplug) FAILED'
   12.10 +   exit 1
   12.11 +}
   12.12 +
   12.13 +which hotplug 1>/dev/null 2>&1 || error   # passes only if 'which' finds a hotplug executable on PATH; its output is discarded
    13.1 --- a/tools/examples/Makefile	Fri Sep 23 15:41:28 2005 -0600
    13.2 +++ b/tools/examples/Makefile	Mon Sep 26 11:07:49 2005 -0600
    13.3 @@ -25,19 +25,13 @@ XEN_SCRIPTS += block-phy
    13.4  XEN_SCRIPTS += block-file
    13.5  XEN_SCRIPTS += block-enbd
    13.6  
    13.7 -# no 64-bit specifics in mem-map.sxp
    13.8 -# so place in /usr/lib, not /usr/lib64
    13.9 -XEN_BOOT_DIR = /usr/lib/xen/boot
   13.10 -XEN_BOOT = mem-map.sxp
   13.11 -
   13.12  XEN_HOTPLUG_DIR = /etc/hotplug
   13.13  XEN_HOTPLUG_SCRIPTS = xen-backend.agent
   13.14  
   13.15  all:
   13.16  build:
   13.17  
   13.18 -install: all install-initd install-configs install-scripts install-boot \
   13.19 -	 install-hotplug
   13.20 +install: all install-initd install-configs install-scripts install-hotplug
   13.21  
   13.22  install-initd:
   13.23  	[ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
   13.24 @@ -62,14 +56,6 @@ install-scripts:
   13.25  	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
   13.26  	done
   13.27  
   13.28 -install-boot:
   13.29 -	[ -d $(DESTDIR)$(XEN_BOOT_DIR) ] || \
   13.30 -		$(INSTALL_DIR) $(DESTDIR)$(XEN_BOOT_DIR)
   13.31 -	for i in $(XEN_BOOT); \
   13.32 -	    do [ -a $(DESTDIR)$(XEN_BOOT_DIR)/$$i ] || \
   13.33 -	    $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_BOOT_DIR); \
   13.34 -	done
   13.35 -
   13.36  install-hotplug:
   13.37  	[ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
   13.38  		$(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR)
    14.1 --- a/tools/examples/mem-map.sxp	Fri Sep 23 15:41:28 2005 -0600
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,10 +0,0 @@
    14.4 -(memmap
    14.5 - (0000000000000000  000000000009f800 "AddressRangeMemory"   WB)
    14.6 - (000000000009f800  00000000000a0000 "AddressRangeReserved" UC)
    14.7 - (00000000000a0000  00000000000bffff "AddressRangeIO"       UC)
    14.8 - (00000000000f0000  0000000000100000 "AddressRangeReserved" UC)
    14.9 - (0000000000100000  0000000008000000 "AddressRangeMemory"   WB)
   14.10 - (0000000007fff000  0000000008000000 "AddressRangeShared"   WB)
   14.11 - (0000000008000000  0000000008003000 "AddressRangeNVS"      UC)
   14.12 - (0000000008003000  000000000800d000 "AddressRangeACPI"     WB)
   14.13 - (00000000fec00000  0000000100000000 "AddressRangeIO"       UC))
    15.1 --- a/tools/examples/xmexample.vmx	Fri Sep 23 15:41:28 2005 -0600
    15.2 +++ b/tools/examples/xmexample.vmx	Mon Sep 26 11:07:49 2005 -0600
    15.3 @@ -60,9 +60,6 @@ disk = [ 'file:/var/images/min-el3-i386.
    15.4  # New stuff
    15.5  device_model = '/usr/' + arch_libdir + '/xen/bin/qemu-dm'
    15.6  
    15.7 -# Advanced users only. Don't touch if you don't know what you're doing
    15.8 -memmap = '/usr/lib/xen/boot/mem-map.sxp'
    15.9 -
   15.10  #-----------------------------------------------------------------------------
   15.11  # Disk image for 
   15.12  #cdrom=
    16.1 --- a/tools/firmware/vmxassist/Makefile	Fri Sep 23 15:41:28 2005 -0600
    16.2 +++ b/tools/firmware/vmxassist/Makefile	Mon Sep 26 11:07:49 2005 -0600
    16.3 @@ -44,7 +44,7 @@ all: vmxloader
    16.4  vmxloader: roms.h vmxloader.c acpi.h acpi_madt.c
    16.5  	${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c -c acpi_madt.c
    16.6  	$(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o acpi_madt.o
    16.7 -	objcopy --change-addresses=0xC0000000 vmxloader.tmp vmxloader
    16.8 +	objcopy vmxloader.tmp vmxloader
    16.9  	rm -f vmxloader.tmp
   16.10  
   16.11  vmxassist.bin: vmxassist.ld ${OBJECTS}
    17.1 --- a/tools/firmware/vmxassist/vmxloader.c	Fri Sep 23 15:41:28 2005 -0600
    17.2 +++ b/tools/firmware/vmxassist/vmxloader.c	Mon Sep 26 11:07:49 2005 -0600
    17.3 @@ -34,28 +34,39 @@ int acpi_madt_update(unsigned char* acpi
    17.4  /*
    17.5   * C runtime start off
    17.6   */
    17.7 -asm("					\n\
    17.8 -	.text				\n\
    17.9 -	.globl	_start			\n\
   17.10 -_start:					\n\
   17.11 -	cli				\n\
   17.12 -	movl	$stack_top, %esp	\n\
   17.13 -	movl	%esp, %ebp		\n\
   17.14 -	call    main			\n\
   17.15 -	jmp	halt			\n\
   17.16 -					\n\
   17.17 -	.globl	halt			\n\
   17.18 -halt:					\n\
   17.19 -	sti				\n\
   17.20 -	jmp	.			\n\
   17.21 -					\n\
   17.22 -	.bss				\n\
   17.23 -	.align	8			\n\
   17.24 -	.globl	stack, stack_top	\n\
   17.25 -stack:					\n\
   17.26 -	.skip	0x4000			\n\
   17.27 -stack_top:				\n\
   17.28 -");
   17.29 +asm( /* entry stub: load a GDT, set up a private stack, call main(), then spin in halt */
   17.30 +"	.text				\n"
   17.31 +"	.globl	_start			\n"
   17.32 +"_start:				\n"
   17.33 +"	cld				\n" /* clear the direction flag */
   17.34 +"	cli				\n" /* interrupts off while setting up */
   17.35 +"	lgdt	gdt_desr		\n" /* load GDTR from the descriptor below; segment registers are not reloaded here */
   17.36 +"	movl	$stack_top, %esp	\n" /* stack grows down from the end of the .bss area reserved below */
   17.37 +"	movl	%esp, %ebp		\n"
   17.38 +"	call	main			\n"
   17.39 +"	jmp	halt			\n" /* reached only if main() returns */
   17.40 +"					\n"
   17.41 +"gdt_desr:				\n"
   17.42 +"	.word	gdt_end - gdt - 1	\n" /* GDT limit: table size in bytes minus one */
   17.43 +"	.long	gdt			\n" /* GDT base address */
   17.44 +"					\n"
   17.45 +"	.align	8			\n"
   17.46 +"gdt:					\n"
   17.47 +"	.quad	0x0000000000000000	\n" /* selector 0x00: null descriptor */
   17.48 +"	.quad	0x00CF92000000FFFF	\n" /* selector 0x08: base 0, 4GB limit, 32-bit ring-0 read/write data */
   17.49 +"	.quad	0x00CF9A000000FFFF	\n" /* selector 0x10: base 0, 4GB limit, 32-bit ring-0 execute/read code */
   17.50 +"gdt_end:				\n"
   17.51 +"					\n"
   17.52 +"halt:					\n"
   17.53 +"	sti				\n" /* interrupts back on ... */
   17.54 +"	jmp	.			\n" /* ... then loop forever */
   17.55 +"					\n"
   17.56 +"	.bss				\n"
   17.57 +"	.align	8			\n"
   17.58 +"stack:					\n"
   17.59 +"	.skip	0x4000			\n" /* 16KB of stack space */
   17.60 +"stack_top:				\n"
   17.61 +);
   17.62  
   17.63  void *
   17.64  memcpy(void *dest, const void *src, unsigned n)
   17.65 @@ -95,7 +106,7 @@ cirrus_check(void)
   17.66  }
   17.67  
   17.68  int
   17.69 -main()
   17.70 +main(void)
   17.71  {
   17.72  	puts("VMXAssist Loader\n");
   17.73  	puts("Loading ROMBIOS ...\n");
    18.1 --- a/tools/ioemu/hw/cirrus_vga.c	Fri Sep 23 15:41:28 2005 -0600
    18.2 +++ b/tools/ioemu/hw/cirrus_vga.c	Mon Sep 26 11:07:49 2005 -0600
    18.3 @@ -231,6 +231,8 @@ typedef struct CirrusVGAState {
    18.4      int cirrus_linear_io_addr;
    18.5      int cirrus_linear_bitblt_io_addr;
    18.6      int cirrus_mmio_io_addr;
    18.7 +    unsigned long cirrus_lfb_addr;
    18.8 +    unsigned long cirrus_lfb_end;
    18.9      uint32_t cirrus_addr_mask;
   18.10      uint32_t linear_mmio_mask;
   18.11      uint8_t cirrus_shadow_gr0;
   18.12 @@ -2447,6 +2449,10 @@ static void cirrus_update_memory_access(
   18.13  {
   18.14      unsigned mode;
   18.15  
   18.16 +    extern void unset_vram_mapping(unsigned long addr, unsigned long end);
   18.17 +    extern void set_vram_mapping(unsigned long addr, unsigned long end);
   18.18 +    extern int vga_accelerate;
   18.19 +
   18.20      if ((s->sr[0x17] & 0x44) == 0x44) {
   18.21          goto generic_io;
   18.22      } else if (s->cirrus_srcptr != s->cirrus_srcptr_end) {
   18.23 @@ -2454,17 +2460,21 @@ static void cirrus_update_memory_access(
   18.24      } else {
   18.25  	if ((s->gr[0x0B] & 0x14) == 0x14) {
   18.26              goto generic_io;
   18.27 -	} else if (s->gr[0x0B] & 0x02) {
   18.28 -            goto generic_io;
   18.29 -        }
   18.30 -        
   18.31 -	mode = s->gr[0x05] & 0x7;
   18.32 -	if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   18.33 +    } else if (s->gr[0x0B] & 0x02) {
   18.34 +        goto generic_io;
   18.35 +    }
   18.36 +
   18.37 +    mode = s->gr[0x05] & 0x7;
   18.38 +    if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
   18.39 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   18.40 +                set_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   18.41              s->cirrus_linear_write[0] = cirrus_linear_mem_writeb;
   18.42              s->cirrus_linear_write[1] = cirrus_linear_mem_writew;
   18.43              s->cirrus_linear_write[2] = cirrus_linear_mem_writel;
   18.44          } else {
   18.45          generic_io:
   18.46 +            if (vga_accelerate && s->cirrus_lfb_addr && s->cirrus_lfb_end)
   18.47 +                 unset_vram_mapping(s->cirrus_lfb_addr, s->cirrus_lfb_end);
   18.48              s->cirrus_linear_write[0] = cirrus_linear_writeb;
   18.49              s->cirrus_linear_write[1] = cirrus_linear_writew;
   18.50              s->cirrus_linear_write[2] = cirrus_linear_writel;
   18.51 @@ -3058,6 +3068,8 @@ static void cirrus_pci_lfb_map(PCIDevice
   18.52      /* XXX: add byte swapping apertures */
   18.53      cpu_register_physical_memory(addr, s->vram_size,
   18.54  				 s->cirrus_linear_io_addr);
   18.55 +    s->cirrus_lfb_addr = addr;
   18.56 +    s->cirrus_lfb_end = addr + VGA_RAM_SIZE;
   18.57      cpu_register_physical_memory(addr + 0x1000000, 0x400000,
   18.58  				 s->cirrus_linear_bitblt_io_addr);
   18.59  }
    19.1 --- a/tools/ioemu/hw/pc.c	Fri Sep 23 15:41:28 2005 -0600
    19.2 +++ b/tools/ioemu/hw/pc.c	Mon Sep 26 11:07:49 2005 -0600
    19.3 @@ -385,6 +385,7 @@ void pc_init(int ram_size, int vga_ram_s
    19.4      unsigned long bios_offset, vga_bios_offset;
    19.5      int bios_size, isa_bios_size;
    19.6      PCIBus *pci_bus;
    19.7 +    extern void * shared_vram;
    19.8      
    19.9      linux_boot = (kernel_filename != NULL);
   19.10  
   19.11 @@ -511,14 +512,14 @@ void pc_init(int ram_size, int vga_ram_s
   19.12      if (cirrus_vga_enabled) {
   19.13          if (pci_enabled) {
   19.14              pci_cirrus_vga_init(pci_bus, 
   19.15 -                                ds, phys_ram_base + ram_size, ram_size, 
   19.16 +                                ds, shared_vram, ram_size, 
   19.17                                  vga_ram_size);
   19.18          } else {
   19.19 -            isa_cirrus_vga_init(ds, phys_ram_base + ram_size, ram_size, 
   19.20 +            isa_cirrus_vga_init(ds, shared_vram, ram_size, 
   19.21                                  vga_ram_size);
   19.22          }
   19.23      } else {
   19.24 -        vga_initialize(pci_bus, ds, phys_ram_base + ram_size, ram_size, 
   19.25 +        vga_initialize(pci_bus, ds, shared_vram, ram_size, 
   19.26                         vga_ram_size);
   19.27      }
   19.28  
    20.1 --- a/tools/ioemu/hw/vga.c	Fri Sep 23 15:41:28 2005 -0600
    20.2 +++ b/tools/ioemu/hw/vga.c	Mon Sep 26 11:07:49 2005 -0600
    20.3 @@ -1568,6 +1568,8 @@ void vga_update_display(void)
    20.4              s->graphic_mode = graphic_mode;
    20.5              full_update = 1;
    20.6          }
    20.7 +
    20.8 +        full_update = 1;
    20.9          switch(graphic_mode) {
   20.10          case GMODE_TEXT:
   20.11              vga_draw_text(s, full_update);
   20.12 @@ -1848,6 +1850,7 @@ void vga_common_init(VGAState *s, Displa
   20.13                       unsigned long vga_ram_offset, int vga_ram_size)
   20.14  {
   20.15      int i, j, v, b;
   20.16 +    extern void* shared_vram;
   20.17  
   20.18      for(i = 0;i < 256; i++) {
   20.19          v = 0;
   20.20 @@ -1876,7 +1879,7 @@ void vga_common_init(VGAState *s, Displa
   20.21  
   20.22      /* qemu's vga mem is not detached from phys_ram_base and can cause DM abort
   20.23       * when guest write vga mem, so allocate a new one */
   20.24 -    s->vram_ptr = qemu_mallocz(vga_ram_size);
   20.25 +    s->vram_ptr = shared_vram;
   20.26  
   20.27      s->vram_offset = vga_ram_offset;
   20.28      s->vram_size = vga_ram_size;
    21.1 --- a/tools/ioemu/target-i386-dm/helper2.c	Fri Sep 23 15:41:28 2005 -0600
    21.2 +++ b/tools/ioemu/target-i386-dm/helper2.c	Mon Sep 26 11:07:49 2005 -0600
    21.3 @@ -54,6 +54,8 @@
    21.4  #include "exec-all.h"
    21.5  #include "vl.h"
    21.6  
    21.7 +void *shared_vram;
    21.8 +
    21.9  shared_iopage_t *shared_page = NULL;
   21.10  extern int reset_requested;
   21.11  
    22.1 --- a/tools/ioemu/vl.c	Fri Sep 23 15:41:28 2005 -0600
    22.2 +++ b/tools/ioemu/vl.c	Mon Sep 26 11:07:49 2005 -0600
    22.3 @@ -134,6 +134,7 @@ int pci_enabled = 1;
    22.4  int prep_enabled = 0;
    22.5  int rtc_utc = 1;
    22.6  int cirrus_vga_enabled = 1;
    22.7 +int vga_accelerate = 1;
    22.8  int graphic_width = 800;
    22.9  int graphic_height = 600;
   22.10  int graphic_depth = 15;
   22.11 @@ -141,6 +142,12 @@ int full_screen = 0;
   22.12  TextConsole *vga_console;
   22.13  CharDriverState *serial_hds[MAX_SERIAL_PORTS];
   22.14  int xc_handle;
   22.15 +unsigned long *vgapage_array;
   22.16 +unsigned long *freepage_array;
   22.17 +unsigned long free_pages;
   22.18 +void *vtop_table;
   22.19 +unsigned long toptab;
   22.20 +unsigned long vgaram_pages;
   22.21  
   22.22  /***********************************************************/
   22.23  /* x86 ISA bus support */
   22.24 @@ -2162,6 +2169,7 @@ void help(void)
   22.25             "-isa            simulate an ISA-only system (default is PCI system)\n"
   22.26             "-std-vga        simulate a standard VGA card with VESA Bochs Extensions\n"
   22.27             "                (default is CL-GD5446 PCI VGA)\n"
   22.28 +           "-vgaacc [0|1]   1 to accelerate CL-GD5446 speed, default is 1\n"
   22.29  #endif
   22.30             "-loadvm file    start right away with a saved state (loadvm in monitor)\n"
   22.31             "\n"
   22.32 @@ -2251,6 +2259,7 @@ enum {
   22.33      QEMU_OPTION_serial,
   22.34      QEMU_OPTION_loadvm,
   22.35      QEMU_OPTION_full_screen,
   22.36 +    QEMU_OPTION_vgaacc,
   22.37  };
   22.38  
   22.39  typedef struct QEMUOption {
   22.40 @@ -2327,6 +2336,7 @@ const QEMUOption qemu_options[] = {
   22.41      { "pci", 0, QEMU_OPTION_pci },
   22.42      { "nic-pcnet", 0, QEMU_OPTION_nic_pcnet },
   22.43      { "cirrusvga", 0, QEMU_OPTION_cirrusvga },
   22.44 +    { "vgaacc", HAS_ARG, QEMU_OPTION_vgaacc },
   22.45      { NULL },
   22.46  };
   22.47  
   22.48 @@ -2343,6 +2353,177 @@ static uint8_t *signal_stack;
   22.49  #define NET_IF_USER  1
   22.50  #define NET_IF_DUMMY 2
   22.51  
   22.52 +#include <xg_private.h>
   22.53 +
   22.54 +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   22.55 +#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   22.56 +
   22.57 +#ifdef __i386__
   22.58 +#define _LEVEL_3_ 0
   22.59 +#else
   22.60 +#define _LEVEL_3_ 1
   22.61 +#endif
   22.62 +
   22.63 +#if _LEVEL_3_
   22.64 +#define L3_PROT (_PAGE_PRESENT)
   22.65 +#define L1_PAGETABLE_ENTRIES    512
   22.66 +#else
   22.67 +#define L1_PAGETABLE_ENTRIES    1024
   22.68 +#endif
   22.69 +
   22.70 +/* L2 table index (0..3) for the page `count` pages past `start`; always 0 when there is no L3.  static: a plain C99 `inline` definition provides no out-of-line copy to link against. */
   22.71 +static inline int get_vl2_table(unsigned long count, unsigned long start)
   22.72 +{
   22.73 +#if _LEVEL_3_
   22.74 +    return ((start + (count << PAGE_SHIFT)) >> L3_PAGETABLE_SHIFT) & 0x3;
   22.75 +#else
   22.76 +    return 0;
   22.77 +#endif
   22.78 +}
   22.79 +
   22.80 +/* Map mem_page_array[] pages at [v_start, v_end) in dom's page tables rooted at toptab, taking one page_table_array[] entry per new L1 table; returns the number of L1 tables taken (also when bailing out early). */
   22.81 +int setup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long  *mem_page_array, unsigned long *page_table_array, unsigned long v_start, unsigned long v_end)
   22.82 +{
   22.83 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
   22.84 +    l2_pgentry_t *vl2tab[4] = { NULL }, *vl2e=NULL, *vl2_table = NULL; /* every slot starts NULL so error_out only unmaps real mappings */
   22.85 +    unsigned long l1tab;
   22.86 +    unsigned long ppt_alloc = 0;
   22.87 +    unsigned long count;
   22.88 +    int i = 0;
   22.89 +#if _LEVEL_3_
   22.90 +    l3_pgentry_t *vl3tab = NULL;
   22.91 +    unsigned long l2tab;
   22.92 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   22.93 +                                        PROT_READ|PROT_WRITE,
   22.94 +                                        toptab >> PAGE_SHIFT)) == NULL )
   22.95 +        goto error_out;
   22.96 +    for (i = 0; i < 4 ; i++) {
   22.97 +        l2tab = vl3tab[i] & PAGE_MASK;
   22.98 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   22.99 +          PROT_READ|PROT_WRITE,
  22.100 +          l2tab >> PAGE_SHIFT);
  22.101 +        if(vl2tab[i] == NULL) break; /* bail out via the munmap below so vl3tab is not leaked */
  22.102 +    }
  22.103 +    munmap(vl3tab, PAGE_SIZE); /* the L3 page is only read to locate the four L2 tables */
  22.104 +    if ( i < 4 ) /* one of the L2 tables failed to map */
  22.105 +        goto error_out;
  22.106 +#else
  22.107 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.108 +                                           PROT_READ|PROT_WRITE,
  22.109 +                                           toptab >> PAGE_SHIFT)) == NULL )
  22.110 +        goto error_out;
  22.111 +#endif
  22.112 +
  22.113 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
  22.114 +    {
  22.115 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) /* vl1e is NULL on the first pass and page-aligned again after the last entry of an L1 page */
  22.116 +        {
  22.117 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  22.118 +            vl2e = &vl2_table[l2_table_offset(
  22.119 +                v_start + (count << PAGE_SHIFT))];
  22.120 +
  22.121 +            l1tab = page_table_array[ppt_alloc++] << PAGE_SHIFT;
  22.122 +            if ( vl1tab != NULL )
  22.123 +                munmap(vl1tab, PAGE_SIZE);
  22.124 +
  22.125 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.126 +                                                PROT_READ|PROT_WRITE,
  22.127 +                                                l1tab >> PAGE_SHIFT)) == NULL )
  22.128 +            {
  22.129 +                goto error_out;
  22.130 +            }
  22.131 +            memset(vl1tab, 0, PAGE_SIZE); /* start from an empty L1 table before hooking it into the L2 entry */
  22.132 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  22.133 +            *vl2e = l1tab | L2_PROT;
  22.134 +        }
  22.135 +
  22.136 +        *vl1e = (mem_page_array[count] << PAGE_SHIFT) | L1_PROT;
  22.137 +        vl1e++;
  22.138 +    }
  22.139 +error_out: /* also the normal exit: release whatever is still mapped */
  22.140 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
  22.141 +    for(i = 0; i < 4; i++)
  22.142 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  22.143 +    return ppt_alloc;
  22.144 +}
  22.145 +
  22.146 +/* Clear the L1 entries covering [v_start, v_end) in dom's page tables rooted at toptab, zeroing each L2 entry whose L1 table is visited; ranges whose L2 entry is already empty are skipped. */
  22.147 +void unsetup_mapping(int xc_handle, u32 dom, unsigned long toptab, unsigned long v_start, unsigned long v_end)
  22.148 +{
  22.149 +    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
  22.150 +    l2_pgentry_t *vl2tab[4] = { NULL }, *vl2e=NULL, *vl2_table = NULL; /* every slot starts NULL so error_out only unmaps real mappings */
  22.151 +    unsigned long l1tab;
  22.152 +    unsigned long count;
  22.153 +    int i = 0;
  22.154 +#if _LEVEL_3_
  22.155 +    l3_pgentry_t *vl3tab = NULL;
  22.156 +    unsigned long l2tab;
  22.157 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.158 +                                        PROT_READ|PROT_WRITE,
  22.159 +                                        toptab >> PAGE_SHIFT)) == NULL )
  22.160 +        goto error_out;
  22.161 +    for (i = 0; i < 4 ; i ++){
  22.162 +        l2tab = vl3tab[i] & PAGE_MASK;
  22.163 +        vl2tab[i] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.164 +          PROT_READ|PROT_WRITE,
  22.165 +          l2tab >> PAGE_SHIFT);
  22.166 +        if(vl2tab[i] == NULL) break; /* bail out via the munmap below so vl3tab is not leaked */
  22.167 +    }
  22.168 +    munmap(vl3tab, PAGE_SIZE); /* the L3 page is only read to locate the four L2 tables */
  22.169 +    if ( i < 4 ) /* one of the L2 tables failed to map */
  22.170 +        goto error_out;
  22.171 +#else
  22.172 +    if ( (vl2tab[0] = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.173 +                                        PROT_READ|PROT_WRITE,
  22.174 +                                        toptab >> PAGE_SHIFT)) == NULL )
  22.175 +        goto error_out;
  22.176 +#endif
  22.177 +
  22.178 +    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ){
  22.179 +        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) /* vl1e is NULL on the first pass and page-aligned again after the last entry of an L1 page */
  22.180 +        {
  22.181 +            vl2_table = vl2tab[get_vl2_table(count, v_start)];
  22.182 +            vl2e = &vl2_table[l2_table_offset(v_start + (count << PAGE_SHIFT))];
  22.183 +            l1tab = *vl2e & PAGE_MASK;
  22.184 +
  22.185 +            if(l1tab == 0) /* no L1 table behind this L2 entry: nothing to clear for this page */
  22.186 +                continue;
  22.187 +            if ( vl1tab != NULL )
  22.188 +                munmap(vl1tab, PAGE_SIZE);
  22.189 +
  22.190 +            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  22.191 +                      PROT_READ|PROT_WRITE,
  22.192 +                      l1tab >> PAGE_SHIFT)) == NULL )
  22.193 +            {
  22.194 +                goto error_out;
  22.195 +            }
  22.196 +            vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
  22.197 +            *vl2e = 0; /* detach the L1 table from the L2 entry */
  22.198 +        }
  22.199 +
  22.200 +        *vl1e = 0;
  22.201 +        vl1e++;
  22.202 +    }
  22.203 +error_out: /* also the normal exit: release whatever is still mapped */
  22.204 +    if(vl1tab)  munmap(vl1tab, PAGE_SIZE);
  22.205 +    for(i = 0; i < 4; i++)
  22.206 +        if(vl2tab[i]) munmap(vl2tab[i], PAGE_SIZE);
  22.207 +}
  22.208 +
  22.209 +void set_vram_mapping(unsigned long addr, unsigned long end)
  22.210 +{
  22.211 +    end = addr + VGA_RAM_SIZE;
  22.212 +    setup_mapping(xc_handle, domid, toptab,
  22.213 +      vgapage_array, freepage_array, addr, end);
  22.214 +}
  22.215 +
  22.216 +void unset_vram_mapping(unsigned long addr, unsigned long end)
  22.217 +{
  22.218 +    end = addr + VGA_RAM_SIZE;
  22.219 +    /* FIXME Flush the shadow page */
  22.220 +    unsetup_mapping(xc_handle, domid, toptab, addr, end);
  22.221 +}
  22.222 +
  22.223  int main(int argc, char **argv)
  22.224  {
  22.225  #ifdef CONFIG_GDBSTUB
  22.226 @@ -2366,8 +2547,9 @@ int main(int argc, char **argv)
  22.227      char serial_devices[MAX_SERIAL_PORTS][128];
  22.228      int serial_device_index;
  22.229      const char *loadvm = NULL;
  22.230 -    unsigned long nr_pages, *page_array;
  22.231 +    unsigned long nr_pages, extra_pages, ram_pages, *page_array;
  22.232      extern void *shared_page;
  22.233 +    extern void *shared_vram;
  22.234      /* change the qemu-dm to daemon, just like bochs dm */
  22.235  //    daemon(0, 0);
  22.236      
  22.237 @@ -2674,6 +2856,17 @@ int main(int argc, char **argv)
  22.238              case QEMU_OPTION_cirrusvga:
  22.239                  cirrus_vga_enabled = 1;
  22.240                  break;
  22.241 +            case QEMU_OPTION_vgaacc:
  22.242 +                {
  22.243 +                    const char *p;
  22.244 +                    p = optarg;
  22.245 +                    vga_accelerate = strtol(p, (char **)&p, 0);
  22.246 +                    if (*p != '\0') {
  22.247 +                        fprintf(stderr, "qemu: invalid vgaacc option\n");
  22.248 +                        exit(1);
  22.249 +                    }
  22.250 +                    break;
  22.251 +                }
  22.252              case QEMU_OPTION_std_vga:
  22.253                  cirrus_vga_enabled = 0;
  22.254                  break;
  22.255 @@ -2803,12 +2996,25 @@ int main(int argc, char **argv)
  22.256      /* init the memory */
  22.257      phys_ram_size = ram_size + vga_ram_size + bios_size;
  22.258  
  22.259 -    #define PAGE_SHIFT 12
  22.260 -    #define PAGE_SIZE  (1 << PAGE_SHIFT)
  22.261 -
  22.262 -    nr_pages = ram_size/PAGE_SIZE;
  22.263 +    ram_pages = ram_size/PAGE_SIZE;
  22.264 +    vgaram_pages =  (vga_ram_size -1)/PAGE_SIZE + 1;
  22.265 +    free_pages = vgaram_pages / L1_PAGETABLE_ENTRIES;
  22.266 +    extra_pages = vgaram_pages + free_pages;
  22.267 +
  22.268      xc_handle = xc_interface_open();
  22.269 -    
  22.270 +
  22.271 +    xc_dominfo_t info;
  22.272 +    xc_domain_getinfo(xc_handle, domid, 1, &info);
  22.273 +
  22.274 +    nr_pages = info.nr_pages + extra_pages;
  22.275 +
  22.276 +    if ( xc_domain_setmaxmem(xc_handle, domid,
  22.277 +            (nr_pages) * PAGE_SIZE/1024 ) != 0)
  22.278 +    {
  22.279 +        perror("set maxmem");
  22.280 +        exit(-1);
  22.281 +    }
  22.282 +   
  22.283      if ( (page_array = (unsigned long *)
  22.284  	  malloc(nr_pages * sizeof(unsigned long))) == NULL)
  22.285      {
  22.286 @@ -2816,6 +3022,12 @@ int main(int argc, char **argv)
  22.287  	    exit(-1);
  22.288      }
  22.289  
  22.290 +    if (xc_domain_memory_increase_reservation(xc_handle, domid, 
  22.291 +          extra_pages , 0, 0, NULL) != 0) {
  22.292 +        perror("increase reservation");
  22.293 +        exit(-1);
  22.294 +    }
  22.295 +
  22.296      if ( xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages )
  22.297      {
  22.298  	    perror("xc_get_pfn_list");
  22.299 @@ -2825,15 +3037,36 @@ int main(int argc, char **argv)
  22.300      if ((phys_ram_base =  xc_map_foreign_batch(xc_handle, domid,
  22.301  						 PROT_READ|PROT_WRITE,
  22.302  						 page_array,
  22.303 -						 nr_pages - 1)) == 0) {
  22.304 +						 ram_pages - 1)) == 0) {
  22.305  	    perror("xc_map_foreign_batch");
  22.306  	    exit(-1);
  22.307      }
  22.308  
  22.309      shared_page = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  22.310  				       PROT_READ|PROT_WRITE,
  22.311 -				       page_array[nr_pages - 1]);
  22.312 -
  22.313 + 				       page_array[ram_pages - 1]);
  22.314 +
  22.315 +    vgapage_array = &page_array[nr_pages - vgaram_pages];
  22.316 +
  22.317 +    if ((shared_vram =  xc_map_foreign_batch(xc_handle, domid,
  22.318 + 						 PROT_READ|PROT_WRITE,
  22.319 + 						 vgapage_array,
  22.320 + 						 vgaram_pages)) == 0) {
  22.321 + 	    perror("xc_map_foreign_batch vgaram ");
  22.322 + 	    exit(-1);
  22.323 +     }
  22.324 +
  22.325 +
  22.326 +
  22.327 +    memset(shared_vram, 0, vgaram_pages * PAGE_SIZE);
  22.328 +    toptab = page_array[ram_pages] << PAGE_SHIFT;
  22.329 +
  22.330 +    vtop_table = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
  22.331 +				       PROT_READ|PROT_WRITE,
  22.332 + 				       page_array[ram_pages]);
  22.333 +
  22.334 +    freepage_array = &page_array[nr_pages - extra_pages];
  22.335 + 
  22.336  
  22.337      fprintf(logfile, "shared page at pfn:%lx, mfn: %lx\n", (nr_pages-1), 
  22.338             (page_array[nr_pages - 1]));
    23.1 --- a/tools/libxc/linux_boot_params.h	Fri Sep 23 15:41:28 2005 -0600
    23.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.3 @@ -1,166 +0,0 @@
    23.4 -#ifndef __LINUX_BOOT_PARAMS_H__
    23.5 -#define __LINUX_BOOT_PARAMS_H__
    23.6 -
    23.7 -#include <asm/types.h>
    23.8 -
    23.9 -#define E820MAX	32
   23.10 -
   23.11 -struct mem_map {
   23.12 -    int nr_map;
   23.13 -    struct entry {
   23.14 -        u64 addr;	/* start of memory segment */
   23.15 -        u64 size;	/* size of memory segment */
   23.16 -        u32 type;		/* type of memory segment */
   23.17 -#define E820_RAM        1
   23.18 -#define E820_RESERVED   2
   23.19 -#define E820_ACPI       3 /* usable as RAM once ACPI tables have been read */
   23.20 -#define E820_NVS        4
   23.21 -#define E820_IO         16
   23.22 -#define E820_SHARED     17
   23.23 -#define E820_XENSTORE   18
   23.24 -
   23.25 -        u32 caching_attr;    /* used by hypervisor */
   23.26 -#define MEMMAP_UC	0
   23.27 -#define MEMMAP_WC	1
   23.28 -#define MEMMAP_WT	4
   23.29 -#define MEMMAP_WP	5
   23.30 -#define MEMMAP_WB	6
   23.31 -
   23.32 -    }map[E820MAX];
   23.33 -};
   23.34 -
   23.35 -struct e820entry {
   23.36 -	u64 addr;	/* start of memory segment */
   23.37 -	u64 size;	/* size of memory segment */
   23.38 -	u32 type;	/* type of memory segment */
   23.39 -}__attribute__((packed));
   23.40 -
   23.41 -struct e820map {
   23.42 -    u32 nr_map;
   23.43 -    struct e820entry map[E820MAX];
   23.44 -}__attribute__((packed));
   23.45 -
   23.46 -struct drive_info_struct { __u8 dummy[32]; }; 
   23.47 -
   23.48 -struct sys_desc_table { 
   23.49 -    __u16 length; 
   23.50 -    __u8 table[318]; 
   23.51 -}; 
   23.52 -
   23.53 -struct screen_info {
   23.54 -    unsigned char  orig_x;		/* 0x00 */
   23.55 -    unsigned char  orig_y;		/* 0x01 */
   23.56 -    unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
   23.57 -    unsigned short orig_video_page;	/* 0x04 */
   23.58 -    unsigned char  orig_video_mode;	/* 0x06 */
   23.59 -    unsigned char  orig_video_cols;	/* 0x07 */
   23.60 -    unsigned short unused2;		/* 0x08 */
   23.61 -    unsigned short orig_video_ega_bx;	/* 0x0a */
   23.62 -    unsigned short unused3;		/* 0x0c */
   23.63 -    unsigned char  orig_video_lines;	/* 0x0e */
   23.64 -    unsigned char  orig_video_isVGA;	/* 0x0f */
   23.65 -    unsigned short orig_video_points;	/* 0x10 */
   23.66 -    
   23.67 -    /* VESA graphic mode -- linear frame buffer */
   23.68 -    unsigned short lfb_width;		/* 0x12 */
   23.69 -    unsigned short lfb_height;		/* 0x14 */
   23.70 -    unsigned short lfb_depth;		/* 0x16 */
   23.71 -    unsigned int   lfb_base;		/* 0x18 */
   23.72 -    unsigned int   lfb_size;		/* 0x1c */
   23.73 -    unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
   23.74 -    unsigned short lfb_linelength;	/* 0x24 */
   23.75 -    unsigned char  red_size;		/* 0x26 */
   23.76 -    unsigned char  red_pos;		/* 0x27 */
   23.77 -    unsigned char  green_size;		/* 0x28 */
   23.78 -    unsigned char  green_pos;		/* 0x29 */
   23.79 -    unsigned char  blue_size;		/* 0x2a */
   23.80 -    unsigned char  blue_pos;		/* 0x2b */
   23.81 -    unsigned char  rsvd_size;		/* 0x2c */
   23.82 -    unsigned char  rsvd_pos;		/* 0x2d */
   23.83 -    unsigned short vesapm_seg;		/* 0x2e */
   23.84 -    unsigned short vesapm_off;		/* 0x30 */
   23.85 -    unsigned short pages;		/* 0x32 */
   23.86 -					/* 0x34 -- 0x3f reserved for future expansion */
   23.87 -};
   23.88 -
   23.89 -struct screen_info_overlap { 
   23.90 -    __u8 reserved1[2]; /* 0x00 */ 
   23.91 -    __u16 ext_mem_k; /* 0x02 */ 
   23.92 -    __u8 reserved2[0x20 - 0x04]; /* 0x04 */ 
   23.93 -    __u16 cl_magic; /* 0x20 */ 
   23.94 -#define CL_MAGIC_VALUE 0xA33F 
   23.95 -    __u16 cl_offset; /* 0x22 */ 
   23.96 -    __u8 reserved3[0x40 - 0x24]; /* 0x24 */ 
   23.97 -}; 
   23.98 -
   23.99 -
  23.100 -struct apm_bios_info {
  23.101 -    __u16 version;
  23.102 -    __u16  cseg;
  23.103 -    __u32   offset;
  23.104 -    __u16  cseg_16;
  23.105 -    __u16  dseg;
  23.106 -    __u16  flags;
  23.107 -    __u16  cseg_len;
  23.108 -    __u16  cseg_16_len;
  23.109 -    __u16  dseg_len;
  23.110 -};
  23.111 - 
  23.112 -struct linux_boot_params { 
  23.113 -    union { /* 0x00 */ 
  23.114 -       struct screen_info info; 
  23.115 -       struct screen_info_overlap overlap; 
  23.116 -    } screen; 
  23.117 - 
  23.118 -    struct apm_bios_info apm_bios_info; /* 0x40 */ 
  23.119 -    __u8 reserved4[0x80 - 0x54]; /* 0x54 */ 
  23.120 -    struct drive_info_struct drive_info; /* 0x80 */ 
  23.121 -    struct sys_desc_table sys_desc_table; /* 0xa0 */ 
  23.122 -    __u32 alt_mem_k; /* 0x1e0 */ 
  23.123 -    __u8 reserved5[4]; /* 0x1e4 */ 
  23.124 -    __u8 e820_map_nr; /* 0x1e8 */ 
  23.125 -    __u8 reserved6[8]; /* 0x1e9 */ 
  23.126 -    __u8 setup_sects; /* 0x1f1 */ 
  23.127 -    __u16 mount_root_rdonly; /* 0x1f2 */ 
  23.128 -    __u16 syssize; /* 0x1f4 */ 
  23.129 -    __u16 swapdev; /* 0x1f6 */ 
  23.130 -    __u16 ramdisk_flags; /* 0x1f8 */ 
  23.131 -#define RAMDISK_IMAGE_START_MASK 0x07FF 
  23.132 -#define RAMDISK_PROMPT_FLAG 0x8000 
  23.133 -#define RAMDISK_LOAD_FLAG 0x4000 
  23.134 -    __u16 vid_mode; /* 0x1fa */ 
  23.135 -    __u16 root_dev; /* 0x1fc */ 
  23.136 -    __u8 reserved9[1]; /* 0x1fe */ 
  23.137 -    __u8 aux_device_info; /* 0x1ff */ 
  23.138 -    /* 2.00+ */ 
  23.139 -    __u8 reserved10[2]; /* 0x200 */ 
  23.140 -    __u8 header_magic[4]; /* 0x202 */ 
  23.141 -    __u16 protocol_version; /* 0x206 */ 
  23.142 -    __u8 reserved11[8]; /* 0x208 */ 
  23.143 -    __u8 loader_type; /* 0x210 */ 
  23.144 -#define LOADER_TYPE_LOADLIN 1 
  23.145 -#define LOADER_TYPE_BOOTSECT_LOADER 2 
  23.146 -#define LOADER_TYPE_SYSLINUX 3 
  23.147 -#define LOADER_TYPE_ETHERBOOT 4 
  23.148 -#define LOADER_TYPE_UNKNOWN 0xFF 
  23.149 -    __u8 loader_flags; /* 0x211 */ 
  23.150 -    __u8 reserved12[2]; /* 0x212 */ 
  23.151 -    __u32 code32_start; /* 0x214 */ 
  23.152 -    __u32 initrd_start; /* 0x218 */ 
  23.153 -    __u32 initrd_size; /* 0x21c */ 
  23.154 -    __u8 reserved13[4]; /* 0x220 */ 
  23.155 -    /* 2.01+ */ 
  23.156 -    __u16 heap_end_ptr; /* 0x224 */ 
  23.157 -    __u8 reserved14[2]; /* 0x226 */ 
  23.158 -    /* 2.02+ */ 
  23.159 -    __u32 cmd_line_ptr; /* 0x228 */ 
  23.160 -    /* 2.03+ */ 
  23.161 -    __u32 ramdisk_max; /* 0x22c */ 
  23.162 -    __u8 reserved15[0x2d0 - 0x230]; /* 0x230 */ 
  23.163 -    struct e820entry e820_map[E820MAX]; /* 0x2d0 */ 
  23.164 -    __u64 shared_info; /* 0x550 */
  23.165 -    __u8 padding[0x800 - 0x558]; /* 0x558 */ 
  23.166 -    __u8 cmd_line[0x800]; /* 0x800 */
  23.167 -} __attribute__((packed)); 
  23.168 -
  23.169 -#endif /* __LINUX_BOOT_PARAMS_H__ */
    24.1 --- a/tools/libxc/xc_vmx_build.c	Fri Sep 23 15:41:28 2005 -0600
    24.2 +++ b/tools/libxc/xc_vmx_build.c	Mon Sep 26 11:07:49 2005 -0600
    24.3 @@ -10,7 +10,8 @@
    24.4  #include <unistd.h>
    24.5  #include <zlib.h>
    24.6  #include <xen/io/ioreq.h>
    24.7 -#include "linux_boot_params.h"
    24.8 +
    24.9 +#define VMX_LOADER_ENTR_ADDR  0x00100000
   24.10  
   24.11  #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
   24.12  #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
   24.13 @@ -18,13 +19,29 @@
   24.14  #define L3_PROT (_PAGE_PRESENT)
   24.15  #endif
   24.16  
   24.17 +#define E820MAX	128
   24.18 +
   24.19 +#define E820_RAM          1
   24.20 +#define E820_RESERVED     2
   24.21 +#define E820_ACPI         3
   24.22 +#define E820_NVS          4
   24.23 +#define E820_IO          16
   24.24 +#define E820_SHARED_PAGE 17
   24.25 +#define E820_XENSTORE    18
   24.26 +
   24.27 +#define E820_MAP_PAGE        0x00090000
   24.28 +#define E820_MAP_NR_OFFSET   0x000001E8
   24.29 +#define E820_MAP_OFFSET      0x000002D0
   24.30 +
   24.31 +struct e820entry {
   24.32 +    u64 addr;
   24.33 +    u64 size;
   24.34 +    u32 type;
   24.35 +} __attribute__((packed));
   24.36 +
   24.37  #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
   24.38  #define round_pgdown(_p)  ((_p)&PAGE_MASK)
   24.39  
   24.40 -#define LINUX_BOOT_PARAMS_ADDR   0x00090000
   24.41 -#define LINUX_KERNEL_ENTR_ADDR   0x00100000
   24.42 -#define LINUX_PAGE_OFFSET        0xC0000000
   24.43 -
   24.44  static int
   24.45  parseelfimage(
   24.46      char *elfbase, unsigned long elfsize, struct domain_setup_info *dsi);
   24.47 @@ -33,78 +50,70 @@ loadelfimage(
   24.48      char *elfbase, int xch, u32 dom, unsigned long *parray,
   24.49      struct domain_setup_info *dsi);
   24.50  
   24.51 -static void build_e820map(struct mem_map *mem_mapp, unsigned long mem_size)
   24.52 +static unsigned char build_e820map(void *e820_page, unsigned long mem_size)
   24.53  {
   24.54 -    int nr_map = 0;
   24.55 +    struct e820entry *e820entry =
   24.56 +        (struct e820entry *)(((unsigned char *)e820_page) + E820_MAP_OFFSET);
   24.57 +    unsigned char nr_map = 0;
   24.58  
   24.59      /* XXX: Doesn't work for > 4GB yet */
   24.60 -    mem_mapp->map[nr_map].addr = 0x0;
   24.61 -    mem_mapp->map[nr_map].size = 0x9F800;
   24.62 -    mem_mapp->map[nr_map].type = E820_RAM;
   24.63 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
   24.64 +    e820entry[nr_map].addr = 0x0;
   24.65 +    e820entry[nr_map].size = 0x9F800;
   24.66 +    e820entry[nr_map].type = E820_RAM;
   24.67      nr_map++;
   24.68  
   24.69 -    mem_mapp->map[nr_map].addr = 0x9F800;
   24.70 -    mem_mapp->map[nr_map].size = 0x800;
   24.71 -    mem_mapp->map[nr_map].type = E820_RESERVED;
   24.72 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   24.73 +    e820entry[nr_map].addr = 0x9F800;
   24.74 +    e820entry[nr_map].size = 0x800;
   24.75 +    e820entry[nr_map].type = E820_RESERVED;
   24.76      nr_map++;
   24.77  
   24.78 -    mem_mapp->map[nr_map].addr = 0xA0000;
   24.79 -    mem_mapp->map[nr_map].size = 0x20000;
   24.80 -    mem_mapp->map[nr_map].type = E820_IO;
   24.81 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   24.82 +    e820entry[nr_map].addr = 0xA0000;
   24.83 +    e820entry[nr_map].size = 0x20000;
   24.84 +    e820entry[nr_map].type = E820_IO;
   24.85      nr_map++;
   24.86  
   24.87 -    mem_mapp->map[nr_map].addr = 0xF0000;
   24.88 -    mem_mapp->map[nr_map].size = 0x10000;
   24.89 -    mem_mapp->map[nr_map].type = E820_RESERVED;
   24.90 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
   24.91 +    e820entry[nr_map].addr = 0xF0000;
   24.92 +    e820entry[nr_map].size = 0x10000;
   24.93 +    e820entry[nr_map].type = E820_RESERVED;
   24.94      nr_map++;
   24.95  
   24.96  #define STATIC_PAGES    2       /* for ioreq_t and store_mfn */
   24.97      /* Most of the ram goes here */
   24.98 -    mem_mapp->map[nr_map].addr = 0x100000;
   24.99 -    mem_mapp->map[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE;
  24.100 -    mem_mapp->map[nr_map].type = E820_RAM;
  24.101 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  24.102 +    e820entry[nr_map].addr = 0x100000;
  24.103 +    e820entry[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE;
  24.104 +    e820entry[nr_map].type = E820_RAM;
  24.105      nr_map++;
  24.106  
  24.107      /* Statically allocated special pages */
  24.108  
  24.109      /* Shared ioreq_t page */
  24.110 -    mem_mapp->map[nr_map].addr = mem_size - PAGE_SIZE;
  24.111 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
  24.112 -    mem_mapp->map[nr_map].type = E820_SHARED;
  24.113 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  24.114 +    e820entry[nr_map].addr = mem_size - PAGE_SIZE;
  24.115 +    e820entry[nr_map].size = PAGE_SIZE;
  24.116 +    e820entry[nr_map].type = E820_SHARED_PAGE;
  24.117      nr_map++;
  24.118  
  24.119      /* For xenstore */
  24.120 -    mem_mapp->map[nr_map].addr = mem_size - 2*PAGE_SIZE;
  24.121 -    mem_mapp->map[nr_map].size = PAGE_SIZE;
  24.122 -    mem_mapp->map[nr_map].type = E820_XENSTORE;
  24.123 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  24.124 -    nr_map++;
  24.125 -
  24.126 -    mem_mapp->map[nr_map].addr = mem_size;
  24.127 -    mem_mapp->map[nr_map].size = 0x3 * PAGE_SIZE;
  24.128 -    mem_mapp->map[nr_map].type = E820_NVS;
  24.129 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
  24.130 +    e820entry[nr_map].addr = mem_size - 2*PAGE_SIZE;
  24.131 +    e820entry[nr_map].size = PAGE_SIZE;
  24.132 +    e820entry[nr_map].type = E820_XENSTORE;
  24.133      nr_map++;
  24.134  
  24.135 -    mem_mapp->map[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
  24.136 -    mem_mapp->map[nr_map].size = 0xA * PAGE_SIZE;
  24.137 -    mem_mapp->map[nr_map].type = E820_ACPI;
  24.138 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_WB;
  24.139 +    e820entry[nr_map].addr = mem_size;
  24.140 +    e820entry[nr_map].size = 0x3 * PAGE_SIZE;
  24.141 +    e820entry[nr_map].type = E820_NVS;
  24.142      nr_map++;
  24.143  
  24.144 -    mem_mapp->map[nr_map].addr = 0xFEC00000;
  24.145 -    mem_mapp->map[nr_map].size = 0x1400000;
  24.146 -    mem_mapp->map[nr_map].type = E820_IO;
  24.147 -    mem_mapp->map[nr_map].caching_attr = MEMMAP_UC;
  24.148 +    e820entry[nr_map].addr = mem_size + 0x3 * PAGE_SIZE;
  24.149 +    e820entry[nr_map].size = 0xA * PAGE_SIZE;
  24.150 +    e820entry[nr_map].type = E820_ACPI;
  24.151      nr_map++;
  24.152  
  24.153 -    mem_mapp->nr_map = nr_map;
  24.154 +    e820entry[nr_map].addr = 0xFEC00000;
  24.155 +    e820entry[nr_map].size = 0x1400000;
  24.156 +    e820entry[nr_map].type = E820_IO;
  24.157 +    nr_map++;
  24.158 +
  24.159 +    return (*(((unsigned char *)e820_page) + E820_MAP_NR_OFFSET) = nr_map);
  24.160  }
  24.161  
  24.162  /*
  24.163 @@ -112,19 +121,19 @@ static void build_e820map(struct mem_map
  24.164   * vmxloader will use it to config ACPI MADT table
  24.165   */
  24.166  #define VCPU_MAGIC 0x76637075 /* "vcpu" */
  24.167 -static int 
  24.168 -set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list, 
  24.169 +static int
  24.170 +set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list,
  24.171               struct domain_setup_info *dsi, unsigned long vcpus)
  24.172  {
  24.173      char          *va_map;
  24.174      unsigned long *va_vcpus;
  24.175 -    
  24.176 +
  24.177      va_map = xc_map_foreign_range(
  24.178          xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  24.179 -        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);    
  24.180 +        pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]);
  24.181      if ( va_map == NULL )
  24.182          return -1;
  24.183 -    
  24.184 +
  24.185      va_vcpus = (unsigned long *)(va_map + 0x800);
  24.186      *va_vcpus++ = VCPU_MAGIC;
  24.187      *va_vcpus++ = vcpus;
  24.188 @@ -164,24 +173,23 @@ static int zap_mmio_range(int xc_handle,
  24.189      return 0;
  24.190  }
  24.191  
  24.192 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  24.193 -                           unsigned long l2tab,
  24.194 -                           struct mem_map *mem_mapp)
  24.195 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l2tab,
  24.196 +                           unsigned char e820_map_nr, unsigned char *e820map)
  24.197  {
  24.198 -    int i;
  24.199 +    unsigned int i;
  24.200 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  24.201 +
  24.202      l2_pgentry_32_t *vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  24.203                                                     PROT_READ|PROT_WRITE,
  24.204                                                     l2tab >> PAGE_SHIFT);
  24.205      if ( vl2tab == 0 )
  24.206          return -1;
  24.207  
  24.208 -    for ( i = 0; i < mem_mapp->nr_map; i++ )
  24.209 +    for ( i = 0; i < e820_map_nr; i++ )
  24.210      {
  24.211 -        if ( (mem_mapp->map[i].type == E820_IO) &&
  24.212 -             (mem_mapp->map[i].caching_attr == MEMMAP_UC) &&
  24.213 +        if ( (e820entry[i].type == E820_IO) &&
  24.214               (zap_mmio_range(xc_handle, dom, vl2tab,
  24.215 -                             mem_mapp->map[i].addr,
  24.216 -                             mem_mapp->map[i].size) == -1) )
  24.217 +                             e820entry[i].addr, e820entry[i].size) == -1))
  24.218              return -1;
  24.219      }
  24.220  
  24.221 @@ -200,7 +208,7 @@ static int zap_mmio_range(int xc_handle,
  24.222      unsigned long vl3e;
  24.223      l1_pgentry_t *vl1tab;
  24.224      l2_pgentry_t *vl2tab;
  24.225 - 
  24.226 +
  24.227      mmio_addr = mmio_range_start & PAGE_MASK;
  24.228      for ( ; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE )
  24.229      {
  24.230 @@ -239,22 +247,22 @@ static int zap_mmio_range(int xc_handle,
  24.231      return 0;
  24.232  }
  24.233  
  24.234 -static int zap_mmio_ranges(int xc_handle, u32 dom,
  24.235 -                           unsigned long l3tab,
  24.236 -                           struct mem_map *mem_mapp)
  24.237 +static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l3tab,
  24.238 +                           unsigned char e820_map_nr, unsigned char *e820map)
  24.239  {
  24.240 -    int i;
  24.241 +    unsigned int i;
  24.242 +    struct e820entry *e820entry = (struct e820entry *)e820map;
  24.243 +
  24.244      l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  24.245                                                  PROT_READ|PROT_WRITE,
  24.246                                                  l3tab >> PAGE_SHIFT);
  24.247      if (vl3tab == 0)
  24.248          return -1;
  24.249 -    for (i = 0; i < mem_mapp->nr_map; i++) {
  24.250 -        if ((mem_mapp->map[i].type == E820_IO)
  24.251 -            && (mem_mapp->map[i].caching_attr == MEMMAP_UC))
  24.252 -            if (zap_mmio_range(xc_handle, dom, vl3tab,
  24.253 -                               mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1)
  24.254 -                return -1;
  24.255 +    for ( i = 0; i < e820_map_nr; i++ ) {
  24.256 +        if ( (e820entry[i].type == E820_IO) &&
  24.257 +             (zap_mmio_range(xc_handle, dom, vl3tab,
  24.258 +                             e820entry[i].addr, e820entry[i].size) == -1) )
  24.259 +            return -1;
  24.260      }
  24.261      munmap(vl3tab, PAGE_SIZE);
  24.262      return 0;
  24.263 @@ -265,18 +273,14 @@ static int zap_mmio_ranges(int xc_handle
  24.264  static int setup_guest(int xc_handle,
  24.265                         u32 dom, int memsize,
  24.266                         char *image, unsigned long image_size,
  24.267 -                       gzFile initrd_gfd, unsigned long initrd_len,
  24.268                         unsigned long nr_pages,
  24.269                         vcpu_guest_context_t *ctxt,
  24.270 -                       const char *cmdline,
  24.271                         unsigned long shared_info_frame,
  24.272                         unsigned int control_evtchn,
  24.273                         unsigned long flags,
  24.274                         unsigned int vcpus,
  24.275                         unsigned int store_evtchn,
  24.276 -                       unsigned long *store_mfn,
  24.277 -                       struct mem_map *mem_mapp
  24.278 -    )
  24.279 +                       unsigned long *store_mfn)
  24.280  {
  24.281      l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
  24.282      l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
  24.283 @@ -289,8 +293,8 @@ static int setup_guest(int xc_handle,
  24.284      unsigned long l1tab;
  24.285      unsigned long count, i;
  24.286      shared_info_t *shared_info;
  24.287 -    struct linux_boot_params * boot_paramsp;
  24.288 -    __u16 * boot_gdtp;
  24.289 +    void *e820_page;
  24.290 +    unsigned char e820_map_nr;
  24.291      xc_mmu_t *mmu = NULL;
  24.292      int rc;
  24.293  
  24.294 @@ -298,12 +302,6 @@ static int setup_guest(int xc_handle,
  24.295      unsigned long ppt_alloc;
  24.296  
  24.297      struct domain_setup_info dsi;
  24.298 -    unsigned long vinitrd_start;
  24.299 -    unsigned long vinitrd_end;
  24.300 -    unsigned long vboot_params_start;
  24.301 -    unsigned long vboot_params_end;
  24.302 -    unsigned long vboot_gdt_start;
  24.303 -    unsigned long vboot_gdt_end;
  24.304      unsigned long vpt_start;
  24.305      unsigned long vpt_end;
  24.306      unsigned long v_end;
  24.307 @@ -322,27 +320,8 @@ static int setup_guest(int xc_handle,
  24.308          goto error_out;
  24.309      }
  24.310  
  24.311 -    /*
  24.312 -     * Why do we need this? The number of page-table frames depends on the 
  24.313 -     * size of the bootstrap address space. But the size of the address space 
  24.314 -     * depends on the number of page-table frames (since each one is mapped 
  24.315 -     * read-only). We have a pair of simultaneous equations in two unknowns, 
  24.316 -     * which we solve by exhaustive search.
  24.317 -     */
  24.318 -    vboot_params_start = LINUX_BOOT_PARAMS_ADDR;
  24.319 -    vboot_params_end   = vboot_params_start + PAGE_SIZE;
  24.320 -    vboot_gdt_start    = vboot_params_end;
  24.321 -    vboot_gdt_end      = vboot_gdt_start + PAGE_SIZE;
  24.322 -
  24.323      /* memsize is in megabytes */
  24.324      v_end              = memsize << 20;
  24.325 -    /* leaving the top 4k untouched for IO requests page use */
  24.326 -    vinitrd_end        = v_end - PAGE_SIZE;
  24.327 -    vinitrd_start      = vinitrd_end - initrd_len;
  24.328 -    vinitrd_start      = vinitrd_start & (~(PAGE_SIZE - 1));
  24.329 -
  24.330 -    if(initrd_len == 0)
  24.331 -        vinitrd_start = vinitrd_end = 0;
  24.332  
  24.333  #ifdef __i386__
  24.334      nr_pt_pages = 1 + ((memsize + 3) >> 2);
  24.335 @@ -353,24 +332,17 @@ static int setup_guest(int xc_handle,
  24.336      vpt_end     = vpt_start + (nr_pt_pages * PAGE_SIZE);
  24.337  
  24.338      printf("VIRTUAL MEMORY ARRANGEMENT:\n"
  24.339 -           " Boot_params:   %08lx->%08lx\n"
  24.340 -           " boot_gdt:      %08lx->%08lx\n"
  24.341 -           " Loaded kernel: %08lx->%08lx\n"
  24.342 -           " Init. ramdisk: %08lx->%08lx\n"
  24.343 +           " Loaded VMX loader: %08lx->%08lx\n"
  24.344             " Page tables:   %08lx->%08lx\n"
  24.345             " TOTAL:         %08lx->%08lx\n",
  24.346 -           vboot_params_start, vboot_params_end,
  24.347 -           vboot_gdt_start, vboot_gdt_end,
  24.348 -           dsi.v_kernstart, dsi.v_kernend, 
  24.349 -           vinitrd_start, vinitrd_end,
  24.350 +           dsi.v_kernstart, dsi.v_kernend,
  24.351             vpt_start, vpt_end,
  24.352             dsi.v_start, v_end);
  24.353      printf(" ENTRY ADDRESS: %08lx\n", dsi.v_kernentry);
  24.354 -    printf(" INITRD LENGTH: %08lx\n", initrd_len);
  24.355  
  24.356      if ( (v_end - dsi.v_start) > (nr_pages * PAGE_SIZE) )
  24.357      {
  24.358 -        printf("Initial guest OS requires too much space\n"
  24.359 +        ERROR("Initial guest OS requires too much space\n"
  24.360                 "(%luMB is greater than %luMB limit)\n",
  24.361                 (v_end-dsi.v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
  24.362          goto error_out;
  24.363 @@ -390,23 +362,6 @@ static int setup_guest(int xc_handle,
  24.364  
  24.365      loadelfimage(image, xc_handle, dom, page_array, &dsi);
  24.366  
  24.367 -    /* Load the initial ramdisk image. */
  24.368 -    if ( initrd_len != 0 )
  24.369 -    {
  24.370 -        for ( i = (vinitrd_start - dsi.v_start); 
  24.371 -              i < (vinitrd_end - dsi.v_start); i += PAGE_SIZE )
  24.372 -        {
  24.373 -            char page[PAGE_SIZE];
  24.374 -            if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
  24.375 -            {
  24.376 -                PERROR("Error reading initrd image, could not");
  24.377 -                goto error_out;
  24.378 -            }
  24.379 -            xc_copy_to_domain_page(xc_handle, dom,
  24.380 -                                   page_array[i>>PAGE_SHIFT], page);
  24.381 -        }
  24.382 -    }
  24.383 -
  24.384      if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL )
  24.385          goto error_out;
  24.386  
  24.387 @@ -428,15 +383,14 @@ static int setup_guest(int xc_handle,
  24.388      l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  24.389      ctxt->ctrlreg[3] = l2tab;
  24.390  
  24.391 -    /* Initialise the page tables. */
  24.392 -    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  24.393 -                                        PROT_READ|PROT_WRITE, 
  24.394 +    if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  24.395 +                                        PROT_READ|PROT_WRITE,
  24.396                                          l2tab >> PAGE_SHIFT)) == NULL )
  24.397          goto error_out;
  24.398      memset(vl2tab, 0, PAGE_SIZE);
  24.399      vl2e = &vl2tab[l2_table_offset(dsi.v_start)];
  24.400      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  24.401 -    {    
  24.402 +    {
  24.403          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  24.404          {
  24.405              l1tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  24.406 @@ -460,23 +414,35 @@ static int setup_guest(int xc_handle,
  24.407      munmap(vl1tab, PAGE_SIZE);
  24.408      munmap(vl2tab, PAGE_SIZE);
  24.409  #else
  24.410 -    /* here l3tab means pdpt, only 4 entry is used */
  24.411      l3tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  24.412      ctxt->ctrlreg[3] = l3tab;
  24.413  
  24.414 -    /* Initialise the page tables. */
  24.415 -    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  24.416 -                                        PROT_READ|PROT_WRITE, 
  24.417 +    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  24.418 +                                        PROT_READ|PROT_WRITE,
  24.419                                          l3tab >> PAGE_SHIFT)) == NULL )
  24.420          goto error_out;
  24.421      memset(vl3tab, 0, PAGE_SIZE);
  24.422  
  24.423 +    /* Fill in every PDPT entry. */
  24.424 +    for ( i = 0; i < L3_PAGETABLE_ENTRIES_PAE; i++ )
  24.425 +    {
  24.426 +        l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  24.427 +        if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  24.428 +                                            PROT_READ|PROT_WRITE,
  24.429 +                                            l2tab >> PAGE_SHIFT)) == NULL )
  24.430 +            goto error_out;
  24.431 +        memset(vl2tab, 0, PAGE_SIZE);
  24.432 +        munmap(vl2tab, PAGE_SIZE);
  24.433 +        vl3tab[i] = l2tab | L3_PROT;
  24.434 +    }
  24.435 +
  24.436      vl3e = &vl3tab[l3_table_offset(dsi.v_start)];
  24.437  
  24.438      for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
  24.439      {
  24.440 -        if (!(count % (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
  24.441 -            l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
  24.442 +        if (!(count & ((1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)) - 1))){
  24.443 +            l2tab = vl3tab[count >> (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]
  24.444 +                & PAGE_MASK; /* L2 table was installed in the PDPT above */
  24.445  
  24.446              if (vl2tab != NULL)
  24.447                  munmap(vl2tab, PAGE_SIZE);
  24.448 @@ -486,8 +452,6 @@ static int setup_guest(int xc_handle,
  24.449                                                  l2tab >> PAGE_SHIFT)) == NULL )
  24.450                  goto error_out;
  24.451  
  24.452 -            memset(vl2tab, 0, PAGE_SIZE);
  24.453 -            *vl3e++ = l2tab | L3_PROT;
  24.454              vl2e = &vl2tab[l2_table_offset(dsi.v_start + (count << PAGE_SHIFT))];
  24.455          }
  24.456          if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
  24.457 @@ -519,103 +483,31 @@ static int setup_guest(int xc_handle,
  24.458      for ( count = 0; count < nr_pages; count++ )
  24.459      {
  24.460          if ( xc_add_mmu_update(xc_handle, mmu,
  24.461 -                               (page_array[count] << PAGE_SHIFT) | 
  24.462 +                               (page_array[count] << PAGE_SHIFT) |
  24.463                                 MMU_MACHPHYS_UPDATE, count) )
  24.464              goto error_out;
  24.465      }
  24.466  
  24.467      set_nr_vcpus(xc_handle, dom, page_array, &dsi, vcpus);
  24.468  
  24.469 -    if ((boot_paramsp = xc_map_foreign_range(
  24.470 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  24.471 -        page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  24.472 -        goto error_out;
  24.473 -
  24.474 -    memset(boot_paramsp, 0, sizeof(*boot_paramsp));
  24.475 -
  24.476 -    strncpy((char *)boot_paramsp->cmd_line, cmdline, 0x800);
  24.477 -    boot_paramsp->cmd_line[0x800-1] = '\0';
  24.478 -    boot_paramsp->cmd_line_ptr = ((unsigned long) vboot_params_start) + offsetof(struct linux_boot_params, cmd_line);
  24.479 -
  24.480 -    boot_paramsp->setup_sects = 0;
  24.481 -    boot_paramsp->mount_root_rdonly = 1;
  24.482 -    boot_paramsp->swapdev = 0x0; 
  24.483 -    boot_paramsp->ramdisk_flags = 0x0; 
  24.484 -    boot_paramsp->root_dev = 0x0; /* We must tell kernel root dev by kernel command line. */
  24.485 -
  24.486 -    /* we don't have a ps/2 mouse now.
  24.487 -     * 0xAA means a aux mouse is there.
  24.488 -     * See detect_auxiliary_port() in pc_keyb.c.
  24.489 -     */
  24.490 -    boot_paramsp->aux_device_info = 0x0; 
  24.491 -
  24.492 -    boot_paramsp->header_magic[0] = 0x48; /* "H" */
  24.493 -    boot_paramsp->header_magic[1] = 0x64; /* "d" */
  24.494 -    boot_paramsp->header_magic[2] = 0x72; /* "r" */
  24.495 -    boot_paramsp->header_magic[3] = 0x53; /* "S" */
  24.496 -
  24.497 -    boot_paramsp->protocol_version = 0x0203; /* 2.03 */
  24.498 -    boot_paramsp->loader_type = 0x71; /* GRUB */
  24.499 -    boot_paramsp->loader_flags = 0x1; /* loaded high */
  24.500 -    boot_paramsp->code32_start = LINUX_KERNEL_ENTR_ADDR; /* 1MB */
  24.501 -    boot_paramsp->initrd_start = vinitrd_start;
  24.502 -    boot_paramsp->initrd_size = initrd_len;
  24.503 -
  24.504 -    i = ((memsize - 1) << 10) - 4;
  24.505 -    boot_paramsp->alt_mem_k = i; /* alt_mem_k */
  24.506 -    boot_paramsp->screen.overlap.ext_mem_k = i & 0xFFFF; /* ext_mem_k */
  24.507 +    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  24.508 +    shared_page_frame = (v_end - PAGE_SIZE) >> PAGE_SHIFT;
  24.509  
  24.510 -    /*
  24.511 -     * Stuff SCREAN_INFO
  24.512 -     */
  24.513 -    boot_paramsp->screen.info.orig_x = 0;
  24.514 -    boot_paramsp->screen.info.orig_y = 0;
  24.515 -    boot_paramsp->screen.info.orig_video_page = 8;
  24.516 -    boot_paramsp->screen.info.orig_video_mode = 3;
  24.517 -    boot_paramsp->screen.info.orig_video_cols = 80;
  24.518 -    boot_paramsp->screen.info.orig_video_ega_bx = 0;
  24.519 -    boot_paramsp->screen.info.orig_video_lines = 25;
  24.520 -    boot_paramsp->screen.info.orig_video_isVGA = 1;
  24.521 -    boot_paramsp->screen.info.orig_video_points = 0x0010;
  24.522 -
  24.523 -    /* seems we may NOT stuff boot_paramsp->apm_bios_info */
  24.524 -    /* seems we may NOT stuff boot_paramsp->drive_info */
  24.525 -    /* seems we may NOT stuff boot_paramsp->sys_desc_table */
  24.526 -    *((unsigned short *) &boot_paramsp->drive_info.dummy[0]) = 800;
  24.527 -    boot_paramsp->drive_info.dummy[2] = 4;
  24.528 -    boot_paramsp->drive_info.dummy[14] = 32;
  24.529 -
  24.530 -    /* memsize is in megabytes */
  24.531 -    /* If you need to create a special e820map, comment this line
  24.532 -       and use mem-map.sxp */
  24.533 -    build_e820map(mem_mapp, memsize << 20);
  24.534 -    *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT];
  24.535 +    if ((e820_page = xc_map_foreign_range(
  24.536 +        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  24.537 +        page_array[E820_MAP_PAGE >> PAGE_SHIFT])) == 0)
  24.538 +        goto error_out;
  24.539 +    memset(e820_page, 0, PAGE_SIZE);
  24.540 +    e820_map_nr = build_e820map(e820_page, v_end);
  24.541  #if defined (__i386__)
  24.542 -    if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1)
  24.543 +    if (zap_mmio_ranges(xc_handle, dom, l2tab, e820_map_nr,
  24.544 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  24.545  #else
  24.546 -        if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1)
  24.547 +    if (zap_mmio_ranges(xc_handle, dom, l3tab, e820_map_nr,
  24.548 +                        ((unsigned char *)e820_page) + E820_MAP_OFFSET) == -1)
  24.549  #endif
  24.550 -            goto error_out;
  24.551 -    boot_paramsp->e820_map_nr = mem_mapp->nr_map;
  24.552 -    for (i=0; i<mem_mapp->nr_map; i++) {
  24.553 -        boot_paramsp->e820_map[i].addr = mem_mapp->map[i].addr; 
  24.554 -        boot_paramsp->e820_map[i].size = mem_mapp->map[i].size; 
  24.555 -        boot_paramsp->e820_map[i].type = mem_mapp->map[i].type; 
  24.556 -        if (mem_mapp->map[i].type == E820_SHARED)
  24.557 -            shared_page_frame = (mem_mapp->map[i].addr >> PAGE_SHIFT);
  24.558 -    }
  24.559 -    munmap(boot_paramsp, PAGE_SIZE); 
  24.560 -
  24.561 -    if ((boot_gdtp = xc_map_foreign_range(
  24.562 -        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
  24.563 -        page_array[(vboot_gdt_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
  24.564          goto error_out;
  24.565 -    memset(boot_gdtp, 0, PAGE_SIZE);
  24.566 -    boot_gdtp[12*4 + 0] = boot_gdtp[13*4 + 0] = 0xffff; /* limit */
  24.567 -    boot_gdtp[12*4 + 1] = boot_gdtp[13*4 + 1] = 0x0000; /* base */
  24.568 -    boot_gdtp[12*4 + 2] = 0x9a00; boot_gdtp[13*4 + 2] = 0x9200; /* perms */
  24.569 -    boot_gdtp[12*4 + 3] = boot_gdtp[13*4 + 3] = 0x00cf; /* granu + top of limit */
  24.570 -    munmap(boot_gdtp, PAGE_SIZE);
  24.571 +    munmap(e820_page, PAGE_SIZE);
  24.572  
  24.573      /* shared_info page starts its life empty. */
  24.574      if ((shared_info = xc_map_foreign_range(
  24.575 @@ -651,20 +543,21 @@ static int setup_guest(int xc_handle,
  24.576      /*
  24.577       * Initial register values:
  24.578       */
  24.579 -    ctxt->user_regs.ds = 0x68;
  24.580 -    ctxt->user_regs.es = 0x0;
  24.581 -    ctxt->user_regs.fs = 0x0;
  24.582 -    ctxt->user_regs.gs = 0x0;
  24.583 -    ctxt->user_regs.ss = 0x68;
  24.584 -    ctxt->user_regs.cs = 0x60;
  24.585 +    ctxt->user_regs.ds = 0;
  24.586 +    ctxt->user_regs.es = 0;
  24.587 +    ctxt->user_regs.fs = 0;
  24.588 +    ctxt->user_regs.gs = 0;
  24.589 +    ctxt->user_regs.ss = 0;
  24.590 +    ctxt->user_regs.cs = 0;
  24.591      ctxt->user_regs.eip = dsi.v_kernentry;
  24.592 -    ctxt->user_regs.edx = vboot_gdt_start;
  24.593 -    ctxt->user_regs.eax = 0x800;
  24.594 -    ctxt->user_regs.esp = vboot_gdt_end;
  24.595 +    ctxt->user_regs.edx = 0;
  24.596 +    ctxt->user_regs.eax = 0;
  24.597 +    ctxt->user_regs.esp = 0;
  24.598      ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */
  24.599 -    ctxt->user_regs.ecx = mem_mapp->nr_map;
  24.600 -    ctxt->user_regs.esi = vboot_params_start;
  24.601 -    ctxt->user_regs.edi = vboot_params_start + 0x2d0;
  24.602 +    ctxt->user_regs.ecx = 0;
  24.603 +    ctxt->user_regs.esi = 0;
  24.604 +    ctxt->user_regs.edi = 0;
  24.605 +    ctxt->user_regs.ebp = 0;
  24.606  
  24.607      ctxt->user_regs.eflags = 0;
  24.608  
  24.609 @@ -684,9 +577,9 @@ static int vmx_identify(void)
  24.610      int eax, ecx;
  24.611  
  24.612  #ifdef __i386__
  24.613 -    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx" 
  24.614 -                          : "=a" (eax), "=c" (ecx) 
  24.615 -                          : "0" (1) 
  24.616 +    __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx"
  24.617 +                          : "=a" (eax), "=c" (ecx)
  24.618 +                          : "0" (1)
  24.619                            : "dx");
  24.620  #elif defined __x86_64__
  24.621      __asm__ __volatile__ ("pushq %%rbx; cpuid; popq %%rbx"
  24.622 @@ -705,9 +598,6 @@ int xc_vmx_build(int xc_handle,
  24.623                   u32 domid,
  24.624                   int memsize,
  24.625                   const char *image_name,
  24.626 -                 struct mem_map *mem_mapp,
  24.627 -                 const char *ramdisk_name,
  24.628 -                 const char *cmdline,
  24.629                   unsigned int control_evtchn,
  24.630                   unsigned long flags,
  24.631                   unsigned int vcpus,
  24.632 @@ -715,20 +605,18 @@ int xc_vmx_build(int xc_handle,
  24.633                   unsigned long *store_mfn)
  24.634  {
  24.635      dom0_op_t launch_op, op;
  24.636 -    int initrd_fd = -1;
  24.637 -    gzFile initrd_gfd = NULL;
  24.638      int rc, i;
  24.639      vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
  24.640      unsigned long nr_pages;
  24.641      char         *image = NULL;
  24.642 -    unsigned long image_size, initrd_size=0;
  24.643 +    unsigned long image_size;
  24.644  
  24.645      if ( vmx_identify() < 0 )
  24.646      {
  24.647          PERROR("CPU doesn't support VMX Extensions");
  24.648          goto error_out;
  24.649      }
  24.650 -    
  24.651 +
  24.652      if ( (nr_pages = xc_get_tot_pages(xc_handle, domid)) < 0 )
  24.653      {
  24.654          PERROR("Could not find total pages for domain");
  24.655 @@ -738,32 +626,15 @@ int xc_vmx_build(int xc_handle,
  24.656      if ( (image = xc_read_kernel_image(image_name, &image_size)) == NULL )
  24.657          goto error_out;
  24.658  
  24.659 -    if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) )
  24.660 +    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  24.661      {
  24.662 -        if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 )
  24.663 -        {
  24.664 -            PERROR("Could not open the initial ramdisk image");
  24.665 -            goto error_out;
  24.666 -        }
  24.667 -
  24.668 -        initrd_size = xc_get_filesz(initrd_fd);
  24.669 -
  24.670 -        if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL )
  24.671 -        {
  24.672 -            PERROR("Could not allocate decompression state for initrd");
  24.673 -            goto error_out;
  24.674 -        }
  24.675 -    }
  24.676 -
  24.677 -    if ( mlock(&st_ctxt, sizeof(st_ctxt) ) )
  24.678 -    {   
  24.679          PERROR("xc_vmx_build: ctxt mlock failed");
  24.680          return 1;
  24.681      }
  24.682  
  24.683      op.cmd = DOM0_GETDOMAININFO;
  24.684      op.u.getdomaininfo.domain = (domid_t)domid;
  24.685 -    if ( (xc_dom0_op(xc_handle, &op) < 0) || 
  24.686 +    if ( (xc_dom0_op(xc_handle, &op) < 0) ||
  24.687           ((u16)op.u.getdomaininfo.domain != domid) )
  24.688      {
  24.689          PERROR("Could not get info on domain");
  24.690 @@ -783,21 +654,14 @@ int xc_vmx_build(int xc_handle,
  24.691          goto error_out;
  24.692      }
  24.693  
  24.694 -    if ( setup_guest(xc_handle, domid, memsize, image, image_size, 
  24.695 -                     initrd_gfd, initrd_size, nr_pages, 
  24.696 -                     ctxt, cmdline,
  24.697 -                     op.u.getdomaininfo.shared_info_frame,
  24.698 -                     control_evtchn, flags, vcpus, store_evtchn, store_mfn,
  24.699 -                     mem_mapp) < 0 )
  24.700 +    if ( setup_guest(xc_handle, domid, memsize, image, image_size, nr_pages,
  24.701 +                     ctxt, op.u.getdomaininfo.shared_info_frame, control_evtchn,
  24.702 +                     flags, vcpus, store_evtchn, store_mfn) < 0)
  24.703      {
  24.704          ERROR("Error constructing guest OS");
  24.705          goto error_out;
  24.706      }
  24.707  
  24.708 -    if ( initrd_fd >= 0 )
  24.709 -        close(initrd_fd);
  24.710 -    if ( initrd_gfd )
  24.711 -        gzclose(initrd_gfd);
  24.712      free(image);
  24.713  
  24.714      ctxt->flags = VGCF_VMX_GUEST;
  24.715 @@ -813,15 +677,10 @@ int xc_vmx_build(int xc_handle,
  24.716  
  24.717      /* No LDT. */
  24.718      ctxt->ldt_ents = 0;
  24.719 -    
  24.720 +
  24.721      /* Use the default Xen-provided GDT. */
  24.722      ctxt->gdt_ents = 0;
  24.723  
  24.724 -    /* Ring 1 stack is the initial stack. */
  24.725 -/*
  24.726 -  ctxt->kernel_ss = FLAT_KERNEL_DS;
  24.727 -  ctxt->kernel_sp = vstartinfo_start;
  24.728 -*/
  24.729      /* No debugging. */
  24.730      memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg));
  24.731  
  24.732 @@ -845,14 +704,10 @@ int xc_vmx_build(int xc_handle,
  24.733  
  24.734      launch_op.cmd = DOM0_SETDOMAININFO;
  24.735      rc = xc_dom0_op(xc_handle, &launch_op);
  24.736 -    
  24.737 +
  24.738      return rc;
  24.739  
  24.740   error_out:
  24.741 -    if ( initrd_gfd != NULL )
  24.742 -        gzclose(initrd_gfd);
  24.743 -    else if ( initrd_fd >= 0 )
  24.744 -        close(initrd_fd);
  24.745      free(image);
  24.746  
  24.747      return -1;
  24.748 @@ -864,7 +719,7 @@ static inline int is_loadable_phdr(Elf32
  24.749              ((phdr->p_flags & (PF_W|PF_X)) != 0));
  24.750  }
  24.751  
  24.752 -static int parseelfimage(char *elfbase, 
  24.753 +static int parseelfimage(char *elfbase,
  24.754                           unsigned long elfsize,
  24.755                           struct domain_setup_info *dsi)
  24.756  {
  24.757 @@ -899,11 +754,11 @@ static int parseelfimage(char *elfbase,
  24.758          ERROR("ELF image has no section-header strings table (shstrtab).");
  24.759          return -EINVAL;
  24.760      }
  24.761 -    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff + 
  24.762 +    shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff +
  24.763                            (ehdr->e_shstrndx*ehdr->e_shentsize));
  24.764      shstrtab = elfbase + shdr->sh_offset;
  24.765 -    
  24.766 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  24.767 +
  24.768 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  24.769      {
  24.770          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  24.771          if ( !is_loadable_phdr(phdr) )
  24.772 @@ -914,8 +769,8 @@ static int parseelfimage(char *elfbase,
  24.773              kernend = phdr->p_paddr + phdr->p_memsz;
  24.774      }
  24.775  
  24.776 -    if ( (kernstart > kernend) || 
  24.777 -         (ehdr->e_entry < kernstart) || 
  24.778 +    if ( (kernstart > kernend) ||
  24.779 +         (ehdr->e_entry < kernstart) ||
  24.780           (ehdr->e_entry > kernend) )
  24.781      {
  24.782          ERROR("Malformed ELF image.");
  24.783 @@ -924,9 +779,9 @@ static int parseelfimage(char *elfbase,
  24.784  
  24.785      dsi->v_start = 0x00000000;
  24.786  
  24.787 -    dsi->v_kernstart = kernstart - LINUX_PAGE_OFFSET;
  24.788 -    dsi->v_kernend   = kernend - LINUX_PAGE_OFFSET;
  24.789 -    dsi->v_kernentry = LINUX_KERNEL_ENTR_ADDR;
  24.790 +    dsi->v_kernstart = kernstart;
  24.791 +    dsi->v_kernend   = kernend;
  24.792 +    dsi->v_kernentry = VMX_LOADER_ENTR_ADDR;
  24.793  
  24.794      dsi->v_end       = dsi->v_kernend;
  24.795  
  24.796 @@ -945,18 +800,18 @@ loadelfimage(
  24.797      char         *va;
  24.798      unsigned long pa, done, chunksz;
  24.799  
  24.800 -    for ( h = 0; h < ehdr->e_phnum; h++ ) 
  24.801 +    for ( h = 0; h < ehdr->e_phnum; h++ )
  24.802      {
  24.803          phdr = (Elf32_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
  24.804          if ( !is_loadable_phdr(phdr) )
  24.805              continue;
  24.806 -        
  24.807 +
  24.808          for ( done = 0; done < phdr->p_filesz; done += chunksz )
  24.809          {
  24.810 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  24.811 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  24.812              if ((va = xc_map_foreign_range(
  24.813                  xch, dom, PAGE_SIZE, PROT_WRITE,
  24.814 -                parray[pa>>PAGE_SHIFT])) == 0)
  24.815 +                parray[pa >> PAGE_SHIFT])) == 0)
  24.816                  return -1;
  24.817              chunksz = phdr->p_filesz - done;
  24.818              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
  24.819 @@ -968,10 +823,10 @@ loadelfimage(
  24.820  
  24.821          for ( ; done < phdr->p_memsz; done += chunksz )
  24.822          {
  24.823 -            pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET;
  24.824 +            pa = (phdr->p_paddr + done) - dsi->v_start;
  24.825              if ((va = xc_map_foreign_range(
  24.826                  xch, dom, PAGE_SIZE, PROT_WRITE,
  24.827 -                parray[pa>>PAGE_SHIFT])) == 0)
  24.828 +                parray[pa >> PAGE_SHIFT])) == 0)
  24.829                  return -1;
  24.830              chunksz = phdr->p_memsz - done;
  24.831              if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
    25.1 --- a/tools/libxc/xenguest.h	Fri Sep 23 15:41:28 2005 -0600
    25.2 +++ b/tools/libxc/xenguest.h	Mon Sep 26 11:07:49 2005 -0600
    25.3 @@ -57,9 +57,6 @@ int xc_vmx_build(int xc_handle,
    25.4                   uint32_t domid,
    25.5                   int memsize,
    25.6                   const char *image_name,
    25.7 -                 struct mem_map *memmap,
    25.8 -                 const char *ramdisk_name,
    25.9 -                 const char *cmdline,
   25.10                   unsigned int control_evtchn,
   25.11                   unsigned long flags,
   25.12                   unsigned int vcpus,
    26.1 --- a/tools/libxc/xg_private.h	Fri Sep 23 15:41:28 2005 -0600
    26.2 +++ b/tools/libxc/xg_private.h	Mon Sep 26 11:07:49 2005 -0600
    26.3 @@ -28,25 +28,27 @@ unsigned long csum_page (void * page);
    26.4  #define _PAGE_PSE       0x080
    26.5  #define _PAGE_GLOBAL    0x100
    26.6  
    26.7 -#if defined(__i386__)
    26.8 -#define L1_PAGETABLE_SHIFT       12
    26.9 -#define L2_PAGETABLE_SHIFT       22
   26.10  #define L1_PAGETABLE_SHIFT_PAE   12
   26.11  #define L2_PAGETABLE_SHIFT_PAE   21
   26.12  #define L3_PAGETABLE_SHIFT_PAE   30
   26.13 +
   26.14 +#if defined(__i386__)
   26.15 +#define L1_PAGETABLE_SHIFT       12
   26.16 +#define L2_PAGETABLE_SHIFT       22
   26.17  #elif defined(__x86_64__)
   26.18 -#define L1_PAGETABLE_SHIFT      12
   26.19 -#define L2_PAGETABLE_SHIFT      21
   26.20 -#define L3_PAGETABLE_SHIFT      30
   26.21 -#define L4_PAGETABLE_SHIFT      39
   26.22 +#define L1_PAGETABLE_SHIFT       12
   26.23 +#define L2_PAGETABLE_SHIFT       21
   26.24 +#define L3_PAGETABLE_SHIFT       30
   26.25 +#define L4_PAGETABLE_SHIFT       39
   26.26  #endif
   26.27  
   26.28 -#if defined(__i386__) 
   26.29 -#define ENTRIES_PER_L1_PAGETABLE 1024
   26.30 -#define ENTRIES_PER_L2_PAGETABLE 1024
   26.31  #define L1_PAGETABLE_ENTRIES_PAE  512
   26.32  #define L2_PAGETABLE_ENTRIES_PAE  512
   26.33  #define L3_PAGETABLE_ENTRIES_PAE    4
   26.34 +
   26.35 +#if defined(__i386__) 
   26.36 +#define L1_PAGETABLE_ENTRIES   1024
   26.37 +#define L2_PAGETABLE_ENTRIES   1024
   26.38  #elif defined(__x86_64__)
   26.39  #define L1_PAGETABLE_ENTRIES    512
   26.40  #define L2_PAGETABLE_ENTRIES    512
   26.41 @@ -70,17 +72,18 @@ typedef unsigned long l3_pgentry_t;
   26.42  typedef unsigned long l4_pgentry_t;
   26.43  #endif
   26.44  
   26.45 -#if defined(__i386__)
   26.46 -#define l1_table_offset(_a) \
   26.47 -          (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   26.48 -#define l2_table_offset(_a) \
   26.49 -          ((_a) >> L2_PAGETABLE_SHIFT)
   26.50  #define l1_table_offset_pae(_a) \
   26.51    (((_a) >> L1_PAGETABLE_SHIFT_PAE) & (L1_PAGETABLE_ENTRIES_PAE - 1))
   26.52  #define l2_table_offset_pae(_a) \
   26.53    (((_a) >> L2_PAGETABLE_SHIFT_PAE) & (L2_PAGETABLE_ENTRIES_PAE - 1))
   26.54  #define l3_table_offset_pae(_a) \
   26.55  	(((_a) >> L3_PAGETABLE_SHIFT_PAE) & (L3_PAGETABLE_ENTRIES_PAE - 1))
   26.56 +
   26.57 +#if defined(__i386__)
   26.58 +#define l1_table_offset(_a) \
   26.59 +          (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
   26.60 +#define l2_table_offset(_a) \
   26.61 +          ((_a) >> L2_PAGETABLE_SHIFT)
   26.62  #elif defined(__x86_64__)
   26.63  #define l1_table_offset(_a) \
   26.64    (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
    27.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Fri Sep 23 15:41:28 2005 -0600
    27.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Mon Sep 26 11:07:49 2005 -0600
    27.3 @@ -17,7 +17,6 @@
    27.4  #include <arpa/inet.h>
    27.5  
    27.6  #include "xc_private.h"
    27.7 -#include "linux_boot_params.h"
    27.8  
    27.9  /* Needed for Python versions earlier than 2.3. */
   27.10  #ifndef PyMODINIT_FUNC
   27.11 @@ -310,80 +309,24 @@ static PyObject *pyxc_vmx_build(PyObject
   27.12      XcObject *xc = (XcObject *)self;
   27.13  
   27.14      u32   dom;
   27.15 -    char *image, *ramdisk = NULL, *cmdline = "";
   27.16 -    PyObject *memmap;
   27.17 +    char *image;
   27.18      int   control_evtchn, store_evtchn;
   27.19      int flags = 0, vcpus = 1;
   27.20 -    int numItems, i;
   27.21      int memsize;
   27.22 -    struct mem_map mem_map;
   27.23      unsigned long store_mfn = 0;
   27.24  
   27.25      static char *kwd_list[] = { "dom", "control_evtchn", "store_evtchn",
   27.26 -                                "memsize", "image", "memmap",
   27.27 -				"ramdisk", "cmdline", "flags",
   27.28 -				"vcpus", NULL };
   27.29 +                                "memsize", "image", "flags", "vcpus", NULL };
   27.30  
   27.31 -    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisO!|ssii", kwd_list, 
   27.32 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisii", kwd_list,
   27.33                                        &dom, &control_evtchn, &store_evtchn,
   27.34 -                                      &memsize,
   27.35 -                                      &image, &PyList_Type, &memmap,
   27.36 -				      &ramdisk, &cmdline, &flags, &vcpus) )
   27.37 +                                      &memsize, &image, &flags, &vcpus) )
   27.38          return NULL;
   27.39  
   27.40 -    memset(&mem_map, 0, sizeof(mem_map));
   27.41 -    /* Parse memmap */
   27.42 -
   27.43 -    /* get the number of lines passed to us */
   27.44 -    numItems = PyList_Size(memmap) - 1;	/* removing the line 
   27.45 -					   containing "memmap" */
   27.46 -    mem_map.nr_map = numItems;
   27.47 -   
   27.48 -    /* should raise an error here. */
   27.49 -    if (numItems < 0) return NULL; /* Not a list */
   27.50 -
   27.51 -    /* iterate over items of the list, grabbing ranges and parsing them */
   27.52 -    for (i = 1; i <= numItems; i++) {	// skip over "memmap"
   27.53 -	    PyObject *item, *f1, *f2, *f3, *f4;
   27.54 -	    int numFields;
   27.55 -	    unsigned long lf1, lf2, lf3, lf4;
   27.56 -	    char *sf1, *sf2;
   27.57 -	    
   27.58 -	    /* grab the string object from the next element of the list */
   27.59 -	    item = PyList_GetItem(memmap, i); /* Can't fail */
   27.60 -
   27.61 -	    /* get the number of lines passed to us */
   27.62 -	    numFields = PyList_Size(item);
   27.63 +    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, control_evtchn,
   27.64 +                      flags, vcpus, store_evtchn, &store_mfn) != 0 )
   27.65 +        return PyErr_SetFromErrno(xc_error);
   27.66  
   27.67 -	    if (numFields != 4)
   27.68 -		    return NULL;
   27.69 -
   27.70 -	    f1 = PyList_GetItem(item, 0);
   27.71 -	    f2 = PyList_GetItem(item, 1);
   27.72 -	    f3 = PyList_GetItem(item, 2);
   27.73 -	    f4 = PyList_GetItem(item, 3);
   27.74 -
   27.75 -	    /* Convert objects to strings/longs */
   27.76 -	    sf1 = PyString_AsString(f1);
   27.77 -	    sf2 = PyString_AsString(f2);
   27.78 -	    lf3 = PyLong_AsLong(f3);
   27.79 -	    lf4 = PyLong_AsLong(f4);
   27.80 -	    if ( sscanf(sf1, "%lx", &lf1) != 1 )
   27.81 -                return NULL;
   27.82 -	    if ( sscanf(sf2, "%lx", &lf2) != 1 )
   27.83 -                return NULL;
   27.84 -
   27.85 -            mem_map.map[i-1].addr = lf1;
   27.86 -            mem_map.map[i-1].size = lf2 - lf1;
   27.87 -            mem_map.map[i-1].type = lf3;
   27.88 -            mem_map.map[i-1].caching_attr = lf4;
   27.89 -    }
   27.90 -
   27.91 -    if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, &mem_map,
   27.92 -                        ramdisk, cmdline, control_evtchn, flags,
   27.93 -                        vcpus, store_evtchn, &store_mfn) != 0 )
   27.94 -        return PyErr_SetFromErrno(xc_error);
   27.95 -    
   27.96      return Py_BuildValue("{s:i}", "store_mfn", store_mfn);
   27.97  }
   27.98  
    28.1 --- a/tools/python/xen/lowlevel/xs/xs.c	Fri Sep 23 15:41:28 2005 -0600
    28.2 +++ b/tools/python/xen/lowlevel/xs/xs.c	Mon Sep 26 11:07:49 2005 -0600
    28.3 @@ -582,9 +582,8 @@ static PyObject *xspy_unwatch(PyObject *
    28.4  }
    28.5  
    28.6  #define xspy_transaction_start_doc "\n"				\
    28.7 -	"Start a transaction on a path.\n"			\
    28.8 +	"Start a transaction.\n"				\
    28.9  	"Only one transaction can be active at a time.\n"	\
   28.10 -	" path [string]: xenstore path.\n"			\
   28.11  	"\n"							\
   28.12  	"Returns None on success.\n"				\
   28.13  	"Raises RuntimeError on error.\n"			\
   28.14 @@ -593,8 +592,8 @@ static PyObject *xspy_unwatch(PyObject *
   28.15  static PyObject *xspy_transaction_start(PyObject *self, PyObject *args,
   28.16                                          PyObject *kwds)
   28.17  {
   28.18 -    static char *kwd_spec[] = { "path", NULL };
   28.19 -    static char *arg_spec = "s|";
   28.20 +    static char *kwd_spec[] = { NULL };
   28.21 +    static char *arg_spec = "";
   28.22      char *path = NULL;
   28.23  
   28.24      struct xs_handle *xh = xshandle(self);
   28.25 @@ -606,7 +605,7 @@ static PyObject *xspy_transaction_start(
   28.26      if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, &path))
   28.27          goto exit;
   28.28      Py_BEGIN_ALLOW_THREADS
   28.29 -    xsval = xs_transaction_start(xh, path);
   28.30 +    xsval = xs_transaction_start(xh);
   28.31      Py_END_ALLOW_THREADS
   28.32      if (!xsval) {
   28.33          PyErr_SetFromErrno(PyExc_RuntimeError);
   28.34 @@ -623,7 +622,7 @@ static PyObject *xspy_transaction_start(
   28.35  	"Attempts to commit the transaction unless abort is true.\n"	\
   28.36  	" abort [int]: abort flag (default 0).\n"			\
   28.37  	"\n"								\
   28.38 -	"Returns None on success.\n"					\
   28.39 +	"Returns True on success, False if you need to try again.\n"	\
   28.40  	"Raises RuntimeError on error.\n"				\
   28.41  	"\n"
   28.42  
   28.43 @@ -646,11 +645,16 @@ static PyObject *xspy_transaction_end(Py
   28.44      xsval = xs_transaction_end(xh, abort);
   28.45      Py_END_ALLOW_THREADS
   28.46      if (!xsval) {
   28.47 +	if (errno == EAGAIN) {
   28.48 +	    Py_INCREF(Py_False);
   28.49 +	    val = Py_False;
   28.50 +	    goto exit;
   28.51 +	}
   28.52          PyErr_SetFromErrno(PyExc_RuntimeError);
   28.53          goto exit;
   28.54      }
   28.55 -    Py_INCREF(Py_None);
   28.56 -    val = Py_None;
   28.57 +    Py_INCREF(Py_True);
   28.58 +    val = Py_True;
   28.59   exit:
   28.60      return val;
   28.61  }
    29.1 --- a/tools/python/xen/util/memmap.py	Fri Sep 23 15:41:28 2005 -0600
    29.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.3 @@ -1,41 +0,0 @@
    29.4 -mem_caching_attr = {
    29.5 -    'UC' : 0,
    29.6 -    'WC' : 1,
    29.7 -    'WT' : 4,
    29.8 -    'WP' : 5,
    29.9 -    'WB' : 6,
   29.10 -    };
   29.11 -
   29.12 -e820_mem_type = {
   29.13 -    'AddressRangeMemory'    : 1,
   29.14 -    'AddressRangeReserved'  : 2,
   29.15 -    'AddressRangeACPI'      : 3,
   29.16 -    'AddressRangeNVS'       : 4,
   29.17 -    'AddressRangeIO'        : 16,
   29.18 -    'AddressRangeShared'    : 17,
   29.19 -};
   29.20 -
   29.21 -MT_COL = 2
   29.22 -MA_COL = 3
   29.23 -
   29.24 -def strmap(row):
   29.25 -   if (type(row) != type([])):
   29.26 -       return row
   29.27 -   row[MT_COL] = e820_mem_type[row[MT_COL]]
   29.28 -   row[MA_COL] = mem_caching_attr[row[MA_COL]]
   29.29 -   return row
   29.30 -
   29.31 -def memmap_parse(memmap):
   29.32 -    return map(strmap, memmap)
   29.33 -
   29.34 -if __name__ == '__main__':
   29.35 -   memmap = [ 'memmap',
   29.36 -              [ '1', '2', 'AddressRangeMemory', 'UC'],
   29.37 -              [ '1', '2', 'AddressRangeReserved', 'UC'],
   29.38 -              [ '1', '2', 'AddressRangeACPI', 'WB'],
   29.39 -              [ '1', '2', 'AddressRangeNVS', 'WB'],
   29.40 -              [ '1', '2', 'AddressRangeIO', 'WB'],
   29.41 -              [ '1', '2', 'AddressRangeShared', 'WB']]
   29.42 -   print memmap_parse(memmap);
   29.43 -
   29.44 -
    30.1 --- a/tools/python/xen/util/tempfile.py	Fri Sep 23 15:41:28 2005 -0600
    30.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.3 @@ -1,451 +0,0 @@
    30.4 -"""Temporary files.
    30.5 -
    30.6 -This module provides generic, low- and high-level interfaces for
    30.7 -creating temporary files and directories.  The interfaces listed
    30.8 -as "safe" just below can be used without fear of race conditions.
    30.9 -Those listed as "unsafe" cannot, and are provided for backward
   30.10 -compatibility only.
   30.11 -
   30.12 -This module also provides some data items to the user:
   30.13 -
   30.14 -  TMP_MAX  - maximum number of names that will be tried before
   30.15 -             giving up.
   30.16 -  template - the default prefix for all temporary names.
   30.17 -             You may change this to control the default prefix.
   30.18 -  tempdir  - If this is set to a string before the first use of
   30.19 -             any routine from this module, it will be considered as
   30.20 -             another candidate location to store temporary files.
   30.21 -"""
   30.22 -
   30.23 -__all__ = [
   30.24 -    "NamedTemporaryFile", "TemporaryFile", # high level safe interfaces
   30.25 -    "mkstemp", "mkdtemp",                  # low level safe interfaces
   30.26 -    "mktemp",                              # deprecated unsafe interface
   30.27 -    "TMP_MAX", "gettempprefix",            # constants
   30.28 -    "tempdir", "gettempdir"
   30.29 -   ]
   30.30 -
   30.31 -
   30.32 -# Imports.
   30.33 -
   30.34 -import os as _os
   30.35 -import errno as _errno
   30.36 -from random import Random as _Random
   30.37 -
   30.38 -if _os.name == 'mac':
   30.39 -    import Carbon.Folder as _Folder
   30.40 -    import Carbon.Folders as _Folders
   30.41 -
   30.42 -try:
   30.43 -    import fcntl as _fcntl
   30.44 -    # If PYTHONCASEOK is set on Windows, stinking FCNTL.py gets
   30.45 -    # imported, and we don't get an ImportError then.  Provoke
   30.46 -    # an AttributeError instead in that case.
   30.47 -    _fcntl.fcntl
   30.48 -except (ImportError, AttributeError):
   30.49 -    def _set_cloexec(fd):
   30.50 -        pass
   30.51 -else:
   30.52 -    def _set_cloexec(fd):
   30.53 -        flags = _fcntl.fcntl(fd, _fcntl.F_GETFD, 0)
   30.54 -        if flags >= 0:
   30.55 -            # flags read successfully, modify
   30.56 -            flags |= _fcntl.FD_CLOEXEC
   30.57 -            _fcntl.fcntl(fd, _fcntl.F_SETFD, flags)
   30.58 -
   30.59 -
   30.60 -try:
   30.61 -    import thread as _thread
   30.62 -except ImportError:
   30.63 -    import dummy_thread as _thread
   30.64 -_allocate_lock = _thread.allocate_lock
   30.65 -
   30.66 -_text_openflags = _os.O_RDWR | _os.O_CREAT | _os.O_EXCL
   30.67 -if hasattr(_os, 'O_NOINHERIT'):
   30.68 -    _text_openflags |= _os.O_NOINHERIT
   30.69 -if hasattr(_os, 'O_NOFOLLOW'):
   30.70 -    _text_openflags |= _os.O_NOFOLLOW
   30.71 -
   30.72 -_bin_openflags = _text_openflags
   30.73 -if hasattr(_os, 'O_BINARY'):
   30.74 -    _bin_openflags |= _os.O_BINARY
   30.75 -
   30.76 -if hasattr(_os, 'TMP_MAX'):
   30.77 -    TMP_MAX = _os.TMP_MAX
   30.78 -else:
   30.79 -    TMP_MAX = 10000
   30.80 -
   30.81 -template = "tmp"
   30.82 -
   30.83 -tempdir = None
   30.84 -
   30.85 -# Internal routines.
   30.86 -
   30.87 -_once_lock = _allocate_lock()
   30.88 -
   30.89 -class _RandomNameSequence:
   30.90 -    """An instance of _RandomNameSequence generates an endless
   30.91 -    sequence of unpredictable strings which can safely be incorporated
   30.92 -    into file names.  Each string is six characters long.  Multiple
   30.93 -    threads can safely use the same instance at the same time.
   30.94 -
   30.95 -    _RandomNameSequence is an iterator."""
   30.96 -
   30.97 -    characters = ("abcdefghijklmnopqrstuvwxyz" +
   30.98 -                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
   30.99 -                  "0123456789-_")
  30.100 -
  30.101 -    def __init__(self):
  30.102 -        self.mutex = _allocate_lock()
  30.103 -        self.rng = _Random()
  30.104 -        self.normcase = _os.path.normcase
  30.105 -
  30.106 -    def __iter__(self):
  30.107 -        return self
  30.108 -
  30.109 -    def next(self):
  30.110 -        m = self.mutex
  30.111 -        c = self.characters
  30.112 -        choose = self.rng.choice
  30.113 -
  30.114 -        m.acquire()
  30.115 -        try:
  30.116 -            letters = [choose(c) for dummy in "123456"]
  30.117 -        finally:
  30.118 -            m.release()
  30.119 -
  30.120 -        return self.normcase(''.join(letters))
  30.121 -
  30.122 -def _candidate_tempdir_list():
  30.123 -    """Generate a list of candidate temporary directories which
  30.124 -    _get_default_tempdir will try."""
  30.125 -
  30.126 -    dirlist = []
  30.127 -
  30.128 -    # First, try the environment.
  30.129 -    for envname in 'TMPDIR', 'TEMP', 'TMP':
  30.130 -        dirname = _os.getenv(envname)
  30.131 -        if dirname: dirlist.append(dirname)
  30.132 -
  30.133 -    # Failing that, try OS-specific locations.
  30.134 -    if _os.name == 'mac':
  30.135 -        try:
  30.136 -            fsr = _Folder.FSFindFolder(_Folders.kOnSystemDisk,
  30.137 -                                              _Folders.kTemporaryFolderType, 1)
  30.138 -            dirname = fsr.as_pathname()
  30.139 -            dirlist.append(dirname)
  30.140 -        except _Folder.error:
  30.141 -            pass
  30.142 -    elif _os.name == 'riscos':
  30.143 -        dirname = _os.getenv('Wimp$ScrapDir')
  30.144 -        if dirname: dirlist.append(dirname)
  30.145 -    elif _os.name == 'nt':
  30.146 -        dirlist.extend([ r'c:\temp', r'c:\tmp', r'\temp', r'\tmp' ])
  30.147 -    else:
  30.148 -        dirlist.extend([ '/tmp', '/var/tmp', '/usr/tmp' ])
  30.149 -
  30.150 -    # As a last resort, the current directory.
  30.151 -    try:
  30.152 -        dirlist.append(_os.getcwd())
  30.153 -    except (AttributeError, _os.error):
  30.154 -        dirlist.append(_os.curdir)
  30.155 -
  30.156 -    return dirlist
  30.157 -
  30.158 -def _get_default_tempdir():
  30.159 -    """Calculate the default directory to use for temporary files.
  30.160 -    This routine should be called exactly once.
  30.161 -
  30.162 -    We determine whether or not a candidate temp dir is usable by
  30.163 -    trying to create and write to a file in that directory.  If this
  30.164 -    is successful, the test file is deleted.  To prevent denial of
  30.165 -    service, the name of the test file must be randomized."""
  30.166 -
  30.167 -    namer = _RandomNameSequence()
  30.168 -    dirlist = _candidate_tempdir_list()
  30.169 -    flags = _text_openflags
  30.170 -
  30.171 -    for dir in dirlist:
  30.172 -        if dir != _os.curdir:
  30.173 -            dir = _os.path.normcase(_os.path.abspath(dir))
  30.174 -        # Try only a few names per directory.
  30.175 -        for seq in xrange(100):
  30.176 -            name = namer.next()
  30.177 -            filename = _os.path.join(dir, name)
  30.178 -            try:
  30.179 -                fd = _os.open(filename, flags, 0600)
  30.180 -                fp = _os.fdopen(fd, 'w')
  30.181 -                fp.write('blat')
  30.182 -                fp.close()
  30.183 -                _os.unlink(filename)
  30.184 -                del fp, fd
  30.185 -                return dir
  30.186 -            except (OSError, IOError), e:
  30.187 -                if e[0] != _errno.EEXIST:
  30.188 -                    break # no point trying more names in this directory
  30.189 -                pass
  30.190 -    raise IOError, (_errno.ENOENT,
  30.191 -                    ("No usable temporary directory found in %s" % dirlist))
  30.192 -
  30.193 -_name_sequence = None
  30.194 -
  30.195 -def _get_candidate_names():
  30.196 -    """Common setup sequence for all user-callable interfaces."""
  30.197 -
  30.198 -    global _name_sequence
  30.199 -    if _name_sequence is None:
  30.200 -        _once_lock.acquire()
  30.201 -        try:
  30.202 -            if _name_sequence is None:
  30.203 -                _name_sequence = _RandomNameSequence()
  30.204 -        finally:
  30.205 -            _once_lock.release()
  30.206 -    return _name_sequence
  30.207 -
  30.208 -
  30.209 -def _mkstemp_inner(dir, pre, suf, flags):
  30.210 -    """Code common to mkstemp, TemporaryFile, and NamedTemporaryFile."""
  30.211 -
  30.212 -    names = _get_candidate_names()
  30.213 -
  30.214 -    for seq in xrange(TMP_MAX):
  30.215 -        name = names.next()
  30.216 -        file = _os.path.join(dir, pre + name + suf)
  30.217 -        try:
  30.218 -            fd = _os.open(file, flags, 0600)
  30.219 -            _set_cloexec(fd)
  30.220 -            return (fd, file)
  30.221 -        except OSError, e:
  30.222 -            if e.errno == _errno.EEXIST:
  30.223 -                continue # try again
  30.224 -            raise
  30.225 -
  30.226 -    raise IOError, (_errno.EEXIST, "No usable temporary file name found")
  30.227 -
  30.228 -
  30.229 -# User visible interfaces.
  30.230 -
  30.231 -def gettempprefix():
  30.232 -    """Accessor for tempdir.template."""
  30.233 -    return template
  30.234 -
  30.235 -tempdir = None
  30.236 -
  30.237 -def gettempdir():
  30.238 -    """Accessor for tempdir.tempdir."""
  30.239 -    global tempdir
  30.240 -    if tempdir is None:
  30.241 -        _once_lock.acquire()
  30.242 -        try:
  30.243 -            if tempdir is None:
  30.244 -                tempdir = _get_default_tempdir()
  30.245 -        finally:
  30.246 -            _once_lock.release()
  30.247 -    return tempdir
  30.248 -
  30.249 -def mkstemp(suffix="", prefix=template, dir=None, text=False):
  30.250 -    """mkstemp([suffix, [prefix, [dir, [text]]]])
  30.251 -    User-callable function to create and return a unique temporary
  30.252 -    file.  The return value is a pair (fd, name) where fd is the
  30.253 -    file descriptor returned by os.open, and name is the filename.
  30.254 -
  30.255 -    If 'suffix' is specified, the file name will end with that suffix,
  30.256 -    otherwise there will be no suffix.
  30.257 -
  30.258 -    If 'prefix' is specified, the file name will begin with that prefix,
  30.259 -    otherwise a default prefix is used.
  30.260 -
  30.261 -    If 'dir' is specified, the file will be created in that directory,
  30.262 -    otherwise a default directory is used.
  30.263 -
  30.264 -    If 'text' is specified and true, the file is opened in text
  30.265 -    mode.  Else (the default) the file is opened in binary mode.  On
  30.266 -    some operating systems, this makes no difference.
  30.267 -
  30.268 -    The file is readable and writable only by the creating user ID.
  30.269 -    If the operating system uses permission bits to indicate whether a
  30.270 -    file is executable, the file is executable by no one. The file
  30.271 -    descriptor is not inherited by children of this process.
  30.272 -
  30.273 -    Caller is responsible for deleting the file when done with it.
  30.274 -    """
  30.275 -
  30.276 -    if dir is None:
  30.277 -        dir = gettempdir()
  30.278 -
  30.279 -    if text:
  30.280 -        flags = _text_openflags
  30.281 -    else:
  30.282 -        flags = _bin_openflags
  30.283 -
  30.284 -    return _mkstemp_inner(dir, prefix, suffix, flags)
  30.285 -
  30.286 -
  30.287 -def mkdtemp(suffix="", prefix=template, dir=None):
  30.288 -    """mkdtemp([suffix, [prefix, [dir]]])
  30.289 -    User-callable function to create and return a unique temporary
  30.290 -    directory.  The return value is the pathname of the directory.
  30.291 -
  30.292 -    Arguments are as for mkstemp, except that the 'text' argument is
  30.293 -    not accepted.
  30.294 -
  30.295 -    The directory is readable, writable, and searchable only by the
  30.296 -    creating user.
  30.297 -
  30.298 -    Caller is responsible for deleting the directory when done with it.
  30.299 -    """
  30.300 -
  30.301 -    if dir is None:
  30.302 -        dir = gettempdir()
  30.303 -
  30.304 -    names = _get_candidate_names()
  30.305 -
  30.306 -    for seq in xrange(TMP_MAX):
  30.307 -        name = names.next()
  30.308 -        file = _os.path.join(dir, prefix + name + suffix)
  30.309 -        try:
  30.310 -            _os.mkdir(file, 0700)
  30.311 -            return file
  30.312 -        except OSError, e:
  30.313 -            if e.errno == _errno.EEXIST:
  30.314 -                continue # try again
  30.315 -            raise
  30.316 -
  30.317 -    raise IOError, (_errno.EEXIST, "No usable temporary directory name found")
  30.318 -
  30.319 -def mktemp(suffix="", prefix=template, dir=None):
  30.320 -    """mktemp([suffix, [prefix, [dir]]])
  30.321 -    User-callable function to return a unique temporary file name.  The
  30.322 -    file is not created.
  30.323 -
  30.324 -    Arguments are as for mkstemp, except that the 'text' argument is
  30.325 -    not accepted.
  30.326 -
  30.327 -    This function is unsafe and should not be used.  The file name
  30.328 -    refers to a file that did not exist at some point, but by the time
  30.329 -    you get around to creating it, someone else may have beaten you to
  30.330 -    the punch.
  30.331 -    """
  30.332 -
  30.333 -##    from warnings import warn as _warn
  30.334 -##    _warn("mktemp is a potential security risk to your program",
  30.335 -##          RuntimeWarning, stacklevel=2)
  30.336 -
  30.337 -    if dir is None:
  30.338 -        dir = gettempdir()
  30.339 -
  30.340 -    names = _get_candidate_names()
  30.341 -    for seq in xrange(TMP_MAX):
  30.342 -        name = names.next()
  30.343 -        file = _os.path.join(dir, prefix + name + suffix)
  30.344 -        if not _os.path.exists(file):
  30.345 -            return file
  30.346 -
  30.347 -    raise IOError, (_errno.EEXIST, "No usable temporary filename found")
  30.348 -
  30.349 -class _TemporaryFileWrapper:
  30.350 -    """Temporary file wrapper
  30.351 -
  30.352 -    This class provides a wrapper around files opened for
  30.353 -    temporary use.  In particular, it seeks to automatically
  30.354 -    remove the file when it is no longer needed.
  30.355 -    """
  30.356 -
  30.357 -    def __init__(self, file, name):
  30.358 -        self.file = file
  30.359 -        self.name = name
  30.360 -        self.close_called = False
  30.361 -
  30.362 -    def __getattr__(self, name):
  30.363 -        file = self.__dict__['file']
  30.364 -        a = getattr(file, name)
  30.365 -        if type(a) != type(0):
  30.366 -            setattr(self, name, a)
  30.367 -        return a
  30.368 -
  30.369 -    # NT provides delete-on-close as a primitive, so we don't need
  30.370 -    # the wrapper to do anything special.  We still use it so that
  30.371 -    # file.name is useful (i.e. not "(fdopen)") with NamedTemporaryFile.
  30.372 -    if _os.name != 'nt':
  30.373 -
  30.374 -        # Cache the unlinker so we don't get spurious errors at
  30.375 -        # shutdown when the module-level "os" is None'd out.  Note
  30.376 -        # that this must be referenced as self.unlink, because the
  30.377 -        # name TemporaryFileWrapper may also get None'd out before
  30.378 -        # __del__ is called.
  30.379 -        unlink = _os.unlink
  30.380 -
  30.381 -        def close(self):
  30.382 -            if not self.close_called:
  30.383 -                self.close_called = True
  30.384 -                self.file.close()
  30.385 -                self.unlink(self.name)
  30.386 -
  30.387 -        def __del__(self):
  30.388 -            self.close()
  30.389 -
  30.390 -def NamedTemporaryFile(mode='w+b', bufsize=-1, suffix="",
  30.391 -                       prefix=template, dir=None):
  30.392 -    """Create and return a temporary file.
  30.393 -    Arguments:
  30.394 -    'prefix', 'suffix', 'dir' -- as for mkstemp.
  30.395 -    'mode' -- the mode argument to os.fdopen (default "w+b").
  30.396 -    'bufsize' -- the buffer size argument to os.fdopen (default -1).
  30.397 -    The file is created as mkstemp() would do it.
  30.398 -
  30.399 -    Returns a file object; the name of the file is accessible as
  30.400 -    file.name.  The file will be automatically deleted when it is
  30.401 -    closed.
  30.402 -    """
  30.403 -
  30.404 -    if dir is None:
  30.405 -        dir = gettempdir()
  30.406 -
  30.407 -    if 'b' in mode:
  30.408 -        flags = _bin_openflags
  30.409 -    else:
  30.410 -        flags = _text_openflags
  30.411 -
  30.412 -    # Setting O_TEMPORARY in the flags causes the OS to delete
  30.413 -    # the file when it is closed.  This is only supported by Windows.
  30.414 -    if _os.name == 'nt':
  30.415 -        flags |= _os.O_TEMPORARY
  30.416 -
  30.417 -    (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  30.418 -    file = _os.fdopen(fd, mode, bufsize)
  30.419 -    return _TemporaryFileWrapper(file, name)
  30.420 -
  30.421 -if _os.name != 'posix' or _os.sys.platform == 'cygwin':
  30.422 -    # On non-POSIX and Cygwin systems, assume that we cannot unlink a file
  30.423 -    # while it is open.
  30.424 -    TemporaryFile = NamedTemporaryFile
  30.425 -
  30.426 -else:
  30.427 -    def TemporaryFile(mode='w+b', bufsize=-1, suffix="",
  30.428 -                      prefix=template, dir=None):
  30.429 -        """Create and return a temporary file.
  30.430 -        Arguments:
  30.431 -        'prefix', 'suffix', 'directory' -- as for mkstemp.
  30.432 -        'mode' -- the mode argument to os.fdopen (default "w+b").
  30.433 -        'bufsize' -- the buffer size argument to os.fdopen (default -1).
  30.434 -        The file is created as mkstemp() would do it.
  30.435 -
  30.436 -        Returns a file object.  The file has no name, and will cease to
  30.437 -        exist when it is closed.
  30.438 -        """
  30.439 -
  30.440 -        if dir is None:
  30.441 -            dir = gettempdir()
  30.442 -
  30.443 -        if 'b' in mode:
  30.444 -            flags = _bin_openflags
  30.445 -        else:
  30.446 -            flags = _text_openflags
  30.447 -
  30.448 -        (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags)
  30.449 -        try:
  30.450 -            _os.unlink(name)
  30.451 -            return _os.fdopen(fd, mode, bufsize)
  30.452 -        except:
  30.453 -            _os.close(fd)
  30.454 -            raise
    31.1 --- a/tools/python/xen/xend/Blkctl.py	Fri Sep 23 15:41:28 2005 -0600
    31.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.3 @@ -1,43 +0,0 @@
    31.4 -"""Xend interface to block control scripts.
    31.5 -"""
    31.6 -import os
    31.7 -import os.path
    31.8 -import sys
    31.9 -import string
   31.10 -import xen.util.process
   31.11 -
   31.12 -from xen.xend import XendRoot
   31.13 -
   31.14 -xroot = XendRoot.instance()
   31.15 -
   31.16 -"""Where network control scripts live."""
   31.17 -SCRIPT_DIR = xroot.block_script_dir
   31.18 -
   31.19 -def block(op, type, dets, script=None):
   31.20 -    """Call a block control script.
   31.21 -    Xend calls this with op 'bind' when it is about to export a block device
   31.22 -    (other than a raw partition).  The script is called with unbind when a
   31.23 -    device is no longer in use and should be removed.
   31.24 -
   31.25 -    @param op:        operation (start, stop, status)
   31.26 -    @param type:      type of block device (determines the script used)
   31.27 -    @param dets:      arguments to the control script
   31.28 -    @param script:    block script name
   31.29 -    """
   31.30 -    
   31.31 -    if op not in ['bind', 'unbind']:
   31.32 -        raise ValueError('Invalid operation:' + op)
   31.33 -
   31.34 -    # Special case phy devices - they don't require any (un)binding
   31.35 -    # Parallax also doesn't need script-based binding.
   31.36 -    if (type == 'phy') or (type == 'parallax'):
   31.37 -        return dets
   31.38 -    
   31.39 -    if script is None:
   31.40 -        script = xroot.get_block_script(type)
   31.41 -    script = os.path.join(SCRIPT_DIR, script)
   31.42 -    args = [op] + string.split(dets, ':')
   31.43 -    args = ' '.join(args)
   31.44 -    ret = xen.util.process.runscript(script + ' ' + args)
   31.45 -    if len(ret):
   31.46 -        return ret.splitlines()[0]
    32.1 --- a/tools/python/xen/xend/PrettyPrint.py	Fri Sep 23 15:41:28 2005 -0600
    32.2 +++ b/tools/python/xen/xend/PrettyPrint.py	Mon Sep 26 11:07:49 2005 -0600
    32.3 @@ -252,7 +252,7 @@ class PrettyPrinter:
    32.4          self.block = self.block.parent
    32.5  
    32.6      def prettyprint(self, out=sys.stdout):
    32.7 -        self.top.prettyprint(Line(out, self.width))
    32.8 +        self.top.prettyprint(Line(out, self.width), self.width)
    32.9  
   32.10  class SXPPrettyPrinter(PrettyPrinter):
   32.11      """An SXP prettyprinter.
    33.1 --- a/tools/python/xen/xend/XendDB.py	Fri Sep 23 15:41:28 2005 -0600
    33.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.3 @@ -1,127 +0,0 @@
    33.4 -#============================================================================
    33.5 -# This library is free software; you can redistribute it and/or
    33.6 -# modify it under the terms of version 2.1 of the GNU Lesser General Public
    33.7 -# License as published by the Free Software Foundation.
    33.8 -#
    33.9 -# This library is distributed in the hope that it will be useful,
   33.10 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
   33.11 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   33.12 -# Lesser General Public License for more details.
   33.13 -#
   33.14 -# You should have received a copy of the GNU Lesser General Public
   33.15 -# License along with this library; if not, write to the Free Software
   33.16 -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   33.17 -#============================================================================
   33.18 -# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
   33.19 -#============================================================================
   33.20 -
   33.21 -import os
   33.22 -import os.path
   33.23 -import errno
   33.24 -import dircache
   33.25 -import time
   33.26 -
   33.27 -import sxp
   33.28 -import XendRoot
   33.29 -xroot = XendRoot.instance()
   33.30 -
   33.31 -class XendDB:
   33.32 -    """Persistence for Xend. Stores data in files and directories.
   33.33 -    """
   33.34 -
   33.35 -    def __init__(self, path=None):
   33.36 -        self.dbpath = xroot.get_dbroot()
   33.37 -        if path:
   33.38 -            self.dbpath = os.path.join(self.dbpath, path)
   33.39 -        pass
   33.40 -
   33.41 -    def listdir(self, dpath):
   33.42 -        try:
   33.43 -            return dircache.listdir(dpath)
   33.44 -        except:
   33.45 -            return []
   33.46 -
   33.47 -    def filepath(self, path):
   33.48 -        return os.path.join(self.dbpath, path)
   33.49 -        
   33.50 -    def fetch(self, path):
   33.51 -        fpath = self.filepath(path)
   33.52 -        return self.fetchfile(fpath)
   33.53 -
   33.54 -    def fetchfile(self, fpath):
   33.55 -        pin = sxp.Parser()
   33.56 -        fin = file(fpath, "rb")
   33.57 -        try:
   33.58 -            while 1:
   33.59 -                try:
   33.60 -                    buf = fin.read(1024)
   33.61 -                except IOError, ex:
   33.62 -                    if ex.errno == errno.EINTR:
   33.63 -                        continue
   33.64 -                    else:
   33.65 -                        raise
   33.66 -                pin.input(buf)
   33.67 -                if buf == '':
   33.68 -                    pin.input_eof()
   33.69 -                    break
   33.70 -        finally:
   33.71 -            fin.close()
   33.72 -        return pin.get_val()
   33.73 -
   33.74 -    def save(self, path, sxpr):
   33.75 -        fpath = self.filepath(path)
   33.76 -        return self.savefile(fpath, sxpr)
   33.77 -    
   33.78 -    def savefile(self, fpath, sxpr):
   33.79 -        backup = False
   33.80 -        fdir = os.path.dirname(fpath)
   33.81 -        if not os.path.isdir(fdir):
   33.82 -            os.makedirs(fdir)
   33.83 -        if os.path.exists(fpath):
   33.84 -            backup = True
   33.85 -            real_fpath = fpath
   33.86 -            fpath += ".new."
   33.87 -            
   33.88 -        fout = file(fpath, "wb+")
   33.89 -        try:
   33.90 -            try:
   33.91 -                t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
   33.92 -                fout.write("# %s %s\n" % (fpath, t))
   33.93 -                sxp.show(sxpr, out=fout)
   33.94 -            finally:
   33.95 -                fout.close()
   33.96 -        except:
   33.97 -            if backup:
   33.98 -                try:
   33.99 -                    os.unlink(fpath)
  33.100 -                except:
  33.101 -                    pass
  33.102 -                raise
  33.103 -        if backup:
  33.104 -            os.rename(fpath, real_fpath)
  33.105 -
  33.106 -    def fetchall(self, path):
  33.107 -        dpath = self.filepath(path)
  33.108 -        d = {}
  33.109 -        for k in self.listdir(dpath):
  33.110 -            try:
  33.111 -                v = self.fetchfile(os.path.join(dpath, k))
  33.112 -                d[k] = v
  33.113 -            except:
  33.114 -                pass
  33.115 -        return d
  33.116 -
  33.117 -    def saveall(self, path, d):
  33.118 -        for (k, v) in d.items():
  33.119 -            self.save(os.path.join(path, k), v)
  33.120 -
  33.121 -    def delete(self, path):
  33.122 -        dpath = self.filepath(path)
  33.123 -        os.unlink(dpath)
  33.124 -
  33.125 -    def ls(self, path):
  33.126 -        dpath = self.filepath(path)
  33.127 -        return self.listdir(dpath)
  33.128 -        
  33.129 -
  33.130 -        
    34.1 --- a/tools/python/xen/xend/XendDomain.py	Fri Sep 23 15:41:28 2005 -0600
    34.2 +++ b/tools/python/xen/xend/XendDomain.py	Mon Sep 26 11:07:49 2005 -0600
    34.3 @@ -433,12 +433,11 @@ class XendDomain:
    34.4              self.domain_shutdowns()
    34.5          return val
    34.6  
    34.7 +
    34.8      def domain_sysrq(self, id, key):
    34.9 -        """Send a SysRq to a domain
   34.10 -        """
   34.11 -        dominfo = self.domain_lookup(id)
   34.12 -        val = dominfo.send_sysrq(key)
   34.13 -        return val
   34.14 +        """Send a SysRq to the specified domain."""
   34.15 +        return self.callInfo(id, XendDomainInfo.send_sysrq, key)
   34.16 +
   34.17  
   34.18      def domain_shutdowns(self):
   34.19          """Process pending domain shutdowns.
   34.20 @@ -630,73 +629,45 @@ class XendDomain:
   34.21          except Exception, ex:
   34.22              raise XendError(str(ex))
   34.23  
   34.24 -    def domain_device_create(self, id, devconfig):
   34.25 -        """Create a new device for a domain.
   34.26  
   34.27 -        @param id:       domain id
   34.28 -        @param devconfig: device configuration
   34.29 +    def domain_device_create(self, domid, devconfig):
   34.30 +        """Create a new device for the specified domain.
   34.31          """
   34.32 -        dominfo = self.domain_lookup(id)
   34.33 -        val = dominfo.device_create(devconfig)
   34.34 -        dominfo.exportToDB()
   34.35 -        return val
   34.36 +        return self.callInfo(domid, XendDomainInfo.device_create, devconfig)
   34.37  
   34.38 -    def domain_device_configure(self, id, devconfig, devid):
   34.39 -        """Configure an existing device for a domain.
   34.40  
   34.41 -        @param id:   domain id
   34.42 -        @param devconfig: device configuration
   34.43 -        @param devid:  device id
   34.44 +    def domain_device_configure(self, domid, devconfig, devid):
   34.45 +        """Configure an existing device in the specified domain.
   34.46          @return: updated device configuration
   34.47          """
   34.48 -        dominfo = self.domain_lookup(id)
   34.49 -        val = dominfo.device_configure(devconfig, devid)
   34.50 -        dominfo.exportToDB()
   34.51 -        return val
   34.52 -    
   34.53 -    def domain_device_refresh(self, id, type, devid):
   34.54 -        """Refresh a device.
   34.55 +        return self.callInfo(domid, XendDomainInfo.device_configure,
   34.56 +                             devconfig, devid)
   34.57  
   34.58 -        @param id:  domain id
   34.59 -        @param devid:  device id
   34.60 -        @param type: device type
   34.61 -        """
   34.62 -        dominfo = self.domain_lookup(id)
   34.63 -        val = dominfo.device_refresh(type, devid)
   34.64 -        dominfo.exportToDB()
   34.65 -        return val
   34.66 -
   34.67 -    def domain_device_destroy(self, id, type, devid):
   34.68 -        """Destroy a device.
   34.69 -
   34.70 -        @param id:  domain id
   34.71 -        @param devid:  device id
   34.72 -        @param type: device type
   34.73 -        """
   34.74 -        dominfo = self.domain_lookup(id)
   34.75 -        return dominfo.destroyDevice(type, devid)
   34.76 +    
   34.77 +    def domain_device_refresh(self, domid, devtype, devid):
   34.78 +        """Refresh a device."""
   34.79 +        return self.callInfo(domid, XendDomainInfo.device_refresh, devtype,
   34.80 +                             devid)
   34.81  
   34.82  
   34.83 -    def domain_devtype_ls(self, id, type):
   34.84 -        """Get list of device sxprs for a domain.
   34.85 +    def domain_device_destroy(self, domid, devtype, devid):
   34.86 +        """Destroy a device."""
   34.87 +        return self.callInfo(domid, XendDomainInfo.destroyDevice, devtype,
   34.88 +                             devid)
   34.89  
   34.90 -        @param id:  domain
   34.91 -        @param type: device type
   34.92 -        @return: device sxprs
   34.93 -        """
   34.94 -        dominfo = self.domain_lookup(id)
   34.95 -        return dominfo.getDeviceSxprs(type)
   34.96  
   34.97 -    def domain_devtype_get(self, id, type, devid):
   34.98 +    def domain_devtype_ls(self, domid, devtype):
   34.99 +        """Get list of device sxprs for the specified domain."""
  34.100 +        return self.callInfo(domid, XendDomainInfo.getDeviceSxprs, devtype)
  34.101 +
  34.102 +
  34.103 +    def domain_devtype_get(self, domid, devtype, devid):
  34.104          """Get a device from a domain.
  34.105          
  34.106 -        @param id:  domain
  34.107 -        @param type: device type
  34.108 -        @param devid:  device id
  34.109          @return: device object (or None)
  34.110          """
  34.111 -        dominfo = self.domain_lookup(id)
  34.112 -        return dominfo.getDevice(type, devid)
  34.113 +        return self.callInfo(domid, XendDomainInfo.getDevice, devtype, devid)
  34.114 +
  34.115  
  34.116      def domain_vif_limit_set(self, id, vif, credit, period):
  34.117          """Limit the vif's transmission rate
  34.118 @@ -723,7 +694,7 @@ class XendDomain:
  34.119          """Set the memory limit for a domain.
  34.120  
  34.121          @param id: domain
  34.122 -        @param mem: memory limit (in MB)
  34.123 +        @param mem: memory limit (in MiB)
  34.124          @return: 0 on success, -1 on error
  34.125          """
  34.126          dominfo = self.domain_lookup(id)
  34.127 @@ -734,42 +705,37 @@ class XendDomain:
  34.128          except Exception, ex:
  34.129              raise XendError(str(ex))
  34.130  
  34.131 -    def domain_mem_target_set(self, id, mem):
  34.132 +    def domain_mem_target_set(self, domid, mem):
  34.133          """Set the memory target for a domain.
  34.134  
  34.135 -        @param id: domain
  34.136 -        @param mem: memory target (in MB)
  34.137 -        @return: 0 on success, -1 on error
  34.138 +        @param mem: memory target (in MiB)
  34.139          """
  34.140 -        dominfo = self.domain_lookup(id)
  34.141 -        return dominfo.setMemoryTarget(mem << 10)
  34.142 +        self.callInfo(domid, XendDomainInfo.setMemoryTarget, mem << 10)
  34.143  
  34.144 -    def domain_vcpu_hotplug(self, id, vcpu, state):
  34.145 -        """Enable or disable VCPU vcpu in DOM id
  34.146  
  34.147 -        @param id: domain
  34.148 +    def domain_vcpu_hotplug(self, domid, vcpu, state):
  34.149 +        """Enable or disable specified VCPU in specified domain
  34.150 +
  34.151          @param vcpu: target VCPU in domain
  34.152          @param state: which state VCPU will become
  34.153 -        @return: 0 on success, -1 on error
  34.154          """
  34.155 -
  34.156 -        dominfo = self.domain_lookup(id)
  34.157 -        return dominfo.vcpu_hotplug(vcpu, state)
  34.158 -
  34.159 -    def domain_dumpcore(self, id):
  34.160 -        """Save a core dump for a crashed domain.
  34.161 +        self.callInfo(domid, XendDomainInfo.vcpu_hotplug, vcpu, state)
  34.162  
  34.163 -        @param id: domain
  34.164 -        """
  34.165 -        dominfo = self.domain_lookup(id)
  34.166 -        corefile = "/var/xen/dump/%s.%s.core" % (dominfo.getName(),
  34.167 -                                                 dominfo.getDomid())
  34.168 -        try:
  34.169 -            xc.domain_dumpcore(dom=dominfo.getDomid(), corefile=corefile)
  34.170 -        except Exception, ex:
  34.171 -            log.warning("Dumpcore failed, id=%s name=%s: %s",
  34.172 -                        dominfo.getDomid(), dominfo.getName(), ex)
  34.173 -        
  34.174 +
  34.175 +    def domain_dumpcore(self, domid):
  34.176 +        """Save a core dump for a crashed domain."""
  34.177 +        self.callInfo(domid, XendDomainInfo.dumpCore)
  34.178 +
  34.179 +
  34.180 +    ## private:
  34.181 +
  34.182 +    def callInfo(self, domid, fn, *args, **kwargs):
  34.183 +        self.refresh()
  34.184 +        dominfo = self.domains.get(domid)
  34.185 +        if dominfo:
  34.186 +            return fn(dominfo, *args, **kwargs)
  34.187 +
  34.188 +
  34.189  def instance():
  34.190      """Singleton constructor. Use this instead of the class constructor.
  34.191      """
    35.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Fri Sep 23 15:41:28 2005 -0600
    35.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Mon Sep 26 11:07:49 2005 -0600
    35.3 @@ -34,6 +34,7 @@ from xen.util.blkif import blkdev_uname_
    35.4  
    35.5  from xen.xend.server.channel import EventChannel
    35.6  
    35.7 +from xen.xend import image
    35.8  from xen.xend import sxp
    35.9  from xen.xend.XendBootloader import bootloader
   35.10  from xen.xend.XendLogging import log
   35.11 @@ -319,6 +320,7 @@ class XendDomainInfo:
   35.12  
   35.13          try:
   35.14              defaultInfo('name',         lambda: "Domain-%d" % self.domid)
   35.15 +            defaultInfo('ssidref',      lambda: 0)
   35.16              defaultInfo('restart_mode', lambda: RESTART_ONREBOOT)
   35.17              defaultInfo('cpu_weight',   lambda: 1.0)
   35.18              defaultInfo('bootloader',   lambda: None)
   35.19 @@ -511,6 +513,19 @@ class XendDomainInfo:
   35.20                        self.info['backend'], 0)
   35.21  
   35.22  
   35.23 +    def dumpCore(self):
   35.24 +        """Create a core dump for this domain.  Nothrow guarantee."""
   35.25 +        
   35.26 +        try:
   35.27 +            corefile = "/var/xen/dump/%s.%s.core" % (self.info['name'],
   35.28 +                                                     self.domid)
   35.29 +            xc.domain_dumpcore(dom = self.domid, corefile = corefile)
   35.30 +
   35.31 +        except Exception, exn:
   35.32 +            log.error("XendDomainInfo.dumpCore failed: id = %s name = %s: %s",
   35.33 +                      self.domid, self.info['name'], str(exn))
   35.34 +
   35.35 +
   35.36      def closeStoreChannel(self):
   35.37          """Close the store channel, if any.  Nothrow guarantee."""
   35.38          
   35.39 @@ -614,7 +629,7 @@ class XendDomainInfo:
   35.40              sxpr.append(['maxmem', self.info['maxmem_KiB'] / 1024])
   35.41  
   35.42              if self.infoIsSet('device'):
   35.43 -                for (n, c) in self.info['device']:
   35.44 +                for (_, c) in self.info['device']:
   35.45                      sxpr.append(['device', c])
   35.46  
   35.47              def stateChar(name):
   35.48 @@ -706,13 +721,6 @@ class XendDomainInfo:
   35.49          """
   35.50          # todo - add support for scheduling params?
   35.51          try:
   35.52 -            if 'image' not in self.info:
   35.53 -                raise VmError('Missing image in configuration')
   35.54 -
   35.55 -            self.image = ImageHandler.create(self,
   35.56 -                                             self.info['image'],
   35.57 -                                             self.info['device'])
   35.58 -
   35.59              self.initDomain()
   35.60  
   35.61              # Create domain devices.
   35.62 @@ -737,6 +745,14 @@ class XendDomainInfo:
   35.63  
   35.64          self.domid = xc.domain_create(dom = self.domid or 0,
   35.65                                        ssidref = self.info['ssidref'])
   35.66 +
   35.67 +        if 'image' not in self.info:
   35.68 +            raise VmError('Missing image in configuration')
   35.69 +
   35.70 +        self.image = image.create(self,
   35.71 +                                  self.info['image'],
   35.72 +                                  self.info['device'])
   35.73 +
   35.74          if self.domid <= 0:
   35.75              raise VmError('Creating domain failed: name=%s' %
   35.76                            self.info['name'])
   35.77 @@ -839,20 +855,20 @@ class XendDomainInfo:
   35.78          """Release all vm devices.
   35.79          """
   35.80  
   35.81 -        t = xstransact("%s/device" % self.path)
   35.82 -
   35.83 -        for n in controllerClasses.keys():
   35.84 -            for d in t.list(n):
   35.85 -                try:
   35.86 -                    t.remove(d)
   35.87 -                except ex:
   35.88 -                    # Log and swallow any exceptions in removal -- there's
   35.89 -                    # nothing more we can do.
   35.90 -                    log.exception(
   35.91 -                        "Device release failed: %s; %s; %s; %s" %
   35.92 -                        (self.info['name'], n, d, str(ex)))
   35.93 -        t.commit()
   35.94 -
   35.95 +        while True:
   35.96 +            t = xstransact("%s/device" % self.path)
   35.97 +            for n in controllerClasses.keys():
   35.98 +                for d in t.list(n):
   35.99 +                    try:
  35.100 +                        t.remove(d)
  35.101 +                    except ex:
  35.102 +                        # Log and swallow any exceptions in removal --
  35.103 +                        # there's nothing more we can do.
  35.104 +                        log.exception(
  35.105 +                           "Device release failed: %s; %s; %s; %s" %
  35.106 +                            (self.info['name'], n, d, str(ex)))
  35.107 +            if t.commit():
  35.108 +                break
  35.109  
  35.110      def eventChannel(self, path=None):
  35.111          """Create an event channel to the domain.
  35.112 @@ -1085,19 +1101,6 @@ class XendDomainInfo:
  35.113  
  35.114  
  35.115  #============================================================================
  35.116 -# Register image handlers.
  35.117 -
  35.118 -from image import          \
  35.119 -     addImageHandlerClass, \
  35.120 -     ImageHandler,         \
  35.121 -     LinuxImageHandler,    \
  35.122 -     VmxImageHandler
  35.123 -
  35.124 -addImageHandlerClass(LinuxImageHandler)
  35.125 -addImageHandlerClass(VmxImageHandler)
  35.126 -
  35.127 -
  35.128 -#============================================================================
  35.129  # Register device controllers and their device config types.
  35.130  
  35.131  """A map from device-class names to the subclass of DevController that
    36.1 --- a/tools/python/xen/xend/image.py	Fri Sep 23 15:41:28 2005 -0600
    36.2 +++ b/tools/python/xen/xend/image.py	Mon Sep 26 11:07:49 2005 -0600
    36.3 @@ -33,6 +33,15 @@ xc = xen.lowlevel.xc.new()
    36.4  
    36.5  MAX_GUEST_CMDLINE = 1024
    36.6  
    36.7 +
    36.8 +def create(vm, imageConfig, deviceConfig):
    36.9 +    """Create an image handler for a vm.
   36.10 +
   36.11 +    @return ImageHandler instance
   36.12 +    """
   36.13 +    return findImageHandlerClass(imageConfig)(vm, imageConfig, deviceConfig)
   36.14 +
   36.15 +
   36.16  class ImageHandler:
   36.17      """Abstract base class for image handlers.
   36.18  
   36.19 @@ -48,81 +57,39 @@ class ImageHandler:
   36.20  
   36.21      The method destroy() is called when the domain is destroyed.
   36.22      The default is to do nothing.
   36.23 -    
   36.24 -    """
   36.25 -
   36.26 -    #======================================================================
   36.27 -    # Class vars and methods.
   36.28 -
   36.29 -    """Table of image handler classes for virtual machine images.
   36.30 -    Indexed by image type.
   36.31      """
   36.32 -    imageHandlerClasses = {}
   36.33 -
   36.34 -    def addImageHandlerClass(cls, h):
   36.35 -        """Add a handler class for an image type
   36.36 -        @param h:        handler: ImageHandler subclass
   36.37 -        """
   36.38 -        cls.imageHandlerClasses[h.ostype] = h
   36.39 -
   36.40 -    addImageHandlerClass = classmethod(addImageHandlerClass)
   36.41 -
   36.42 -    def findImageHandlerClass(cls, image):
   36.43 -        """Find the image handler class for an image config.
   36.44 -
   36.45 -        @param image config
   36.46 -        @return ImageHandler subclass or None
   36.47 -        """
   36.48 -        ty = sxp.name(image)
   36.49 -        if ty is None:
   36.50 -            raise VmError('missing image type')
   36.51 -        imageClass = cls.imageHandlerClasses.get(ty)
   36.52 -        if imageClass is None:
   36.53 -            raise VmError('unknown image type: ' + ty)
   36.54 -        return imageClass
   36.55 -
   36.56 -    findImageHandlerClass = classmethod(findImageHandlerClass)
   36.57 -
   36.58 -    def create(cls, vm, imageConfig, deviceConfig):
   36.59 -        """Create an image handler for a vm.
   36.60 -
   36.61 -        @return ImageHandler instance
   36.62 -        """
   36.63 -        imageClass = cls.findImageHandlerClass(imageConfig)
   36.64 -        return imageClass(vm, imageConfig, deviceConfig)
   36.65 -
   36.66 -    create = classmethod(create)
   36.67 -
   36.68 -    #======================================================================
   36.69 -    # Instance vars and methods.
   36.70  
   36.71      ostype = None
   36.72  
   36.73 -    kernel = None
   36.74 -    ramdisk = None
   36.75 -    cmdline = None
   36.76 -
   36.77 -    flags = 0
   36.78  
   36.79      def __init__(self, vm, imageConfig, deviceConfig):
   36.80          self.vm = vm
   36.81 +
   36.82 +        self.kernel = None
   36.83 +        self.ramdisk = None
   36.84 +        self.cmdline = None
   36.85 +        self.flags = 0
   36.86 +
   36.87          self.configure(imageConfig, deviceConfig)
   36.88  
   36.89      def configure(self, imageConfig, _):
   36.90          """Config actions common to all unix-like domains."""
   36.91  
   36.92 -        self.kernel = sxp.child_value(imageConfig, "kernel")
   36.93 +        def get_cfg(name, default = None):
   36.94 +            return sxp.child_value(imageConfig, name, default)
   36.95 +
   36.96 +        self.kernel = get_cfg("kernel")
   36.97          self.cmdline = ""
   36.98 -        ip = sxp.child_value(imageConfig, "ip", None)
   36.99 +        ip = get_cfg("ip")
  36.100          if ip:
  36.101              self.cmdline += " ip=" + ip
  36.102 -        root = sxp.child_value(imageConfig, "root")
  36.103 +        root = get_cfg("root")
  36.104          if root:
  36.105              self.cmdline += " root=" + root
  36.106 -        args = sxp.child_value(imageConfig, "args")
  36.107 +        args = get_cfg("args")
  36.108          if args:
  36.109              self.cmdline += " " + args
  36.110 -        self.ramdisk = sxp.child_value(imageConfig, "ramdisk", '')
  36.111 +        self.ramdisk = get_cfg("ramdisk", '')
  36.112          
  36.113          self.vm.storeVm(("image/ostype", self.ostype),
  36.114                          ("image/kernel", self.kernel),
  36.115 @@ -130,7 +97,7 @@ class ImageHandler:
  36.116                          ("image/ramdisk", self.ramdisk))
  36.117  
  36.118  
  36.119 -    def handleBootloading():
  36.120 +    def handleBootloading(self):
  36.121          self.unlink(self.kernel)
  36.122          self.unlink(self.ramdisk)
  36.123  
  36.124 @@ -194,7 +161,6 @@ class ImageHandler:
  36.125          if d.has_key('console_mfn'):
  36.126              self.vm.setConsoleRef(d.get('console_mfn'))
  36.127  
  36.128 -addImageHandlerClass = ImageHandler.addImageHandlerClass
  36.129  
  36.130  class LinuxImageHandler(ImageHandler):
  36.131  
  36.132 @@ -238,22 +204,19 @@ class VmxImageHandler(ImageHandler):
  36.133  
  36.134      def configure(self, imageConfig, deviceConfig):
  36.135          ImageHandler.configure(self, imageConfig, deviceConfig)
  36.136 -        
  36.137 -        self.memmap = sxp.child_value(imageConfig, 'memmap')
  36.138 +
  36.139          self.dmargs = self.parseDeviceModelArgs(imageConfig, deviceConfig)
  36.140          self.device_model = sxp.child_value(imageConfig, 'device_model')
  36.141          if not self.device_model:
  36.142              raise VmError("vmx: missing device model")
  36.143          self.display = sxp.child_value(imageConfig, 'display')
  36.144  
  36.145 -        self.vm.storeVm(("image/memmap", self.memmap),
  36.146 -                        ("image/dmargs", " ".join(self.dmargs)),
  36.147 +        self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)),
  36.148                          ("image/device-model", self.device_model),
  36.149                          ("image/display", self.display))
  36.150  
  36.151          self.device_channel = None
  36.152          self.pid = 0
  36.153 -        self.memmap_value = []
  36.154  
  36.155          self.dmargs += self.configVNC(imageConfig)
  36.156  
  36.157 @@ -261,7 +224,6 @@ class VmxImageHandler(ImageHandler):
  36.158      def createImage(self):
  36.159          """Create a VM for the VMX environment.
  36.160          """
  36.161 -        self.parseMemmap()
  36.162          self.createDomain()
  36.163  
  36.164      def buildDomain(self):
  36.165 @@ -278,9 +240,6 @@ class VmxImageHandler(ImageHandler):
  36.166          log.debug("control_evtchn = %d", self.device_channel.port2)
  36.167          log.debug("store_evtchn   = %d", store_evtchn)
  36.168          log.debug("memsize        = %d", self.vm.getMemoryTarget() / 1024)
  36.169 -        log.debug("memmap         = %s", self.memmap_value)
  36.170 -        log.debug("cmdline        = %s", self.cmdline)
  36.171 -        log.debug("ramdisk        = %s", self.ramdisk)
  36.172          log.debug("flags          = %d", self.flags)
  36.173          log.debug("vcpus          = %d", self.vm.getVCpuCount())
  36.174  
  36.175 @@ -289,9 +248,6 @@ class VmxImageHandler(ImageHandler):
  36.176                             control_evtchn = self.device_channel.port2,
  36.177                             store_evtchn   = store_evtchn,
  36.178                             memsize        = self.vm.getMemoryTarget() / 1024,
  36.179 -                           memmap         = self.memmap_value,
  36.180 -                           cmdline        = self.cmdline,
  36.181 -                           ramdisk        = self.ramdisk,
  36.182                             flags          = self.flags,
  36.183                             vcpus          = self.vm.getVCpuCount())
  36.184          if isinstance(ret, dict):
  36.185 @@ -299,18 +255,11 @@ class VmxImageHandler(ImageHandler):
  36.186              return 0
  36.187          return ret
  36.188  
  36.189 -    def parseMemmap(self):
  36.190 -        if self.memmap is None:
  36.191 -            return
  36.192 -        memmap = sxp.parse(open(self.memmap))[0]
  36.193 -        from xen.util.memmap import memmap_parse
  36.194 -        self.memmap_value = memmap_parse(memmap)
  36.195 -        
  36.196      # Return a list of cmd line args to the device models based on the
  36.197      # xm config file
  36.198      def parseDeviceModelArgs(self, imageConfig, deviceConfig):
  36.199          dmargs = [ 'cdrom', 'boot', 'fda', 'fdb',
  36.200 -                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ] 
  36.201 +                   'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ]
  36.202          ret = []
  36.203          for a in dmargs:
  36.204              v = sxp.child_value(imageConfig, a)
  36.205 @@ -439,3 +388,28 @@ class VmxImageHandler(ImageHandler):
  36.206              return 16 * 1024
  36.207          else:
  36.208              return (1 + ((mem_mb + 3) >> 2)) * 4
  36.209 +
  36.210 +
  36.211 +"""Table of image handler classes for virtual machine images.  Indexed by
  36.212 +image type.
  36.213 +"""
  36.214 +imageHandlerClasses = {}
  36.215 +
  36.216 +
  36.217 +for h in LinuxImageHandler, VmxImageHandler:
  36.218 +    imageHandlerClasses[h.ostype] = h
  36.219 +
  36.220 +
  36.221 +def findImageHandlerClass(image):
  36.222 +    """Find the image handler class for an image config.
  36.223 +
  36.224 +    @param image config
  36.225 +    @return ImageHandler subclass or None
  36.226 +    """
  36.227 +    ty = sxp.name(image)
  36.228 +    if ty is None:
  36.229 +        raise VmError('missing image type')
  36.230 +    imageClass = imageHandlerClasses.get(ty)
  36.231 +    if imageClass is None:
  36.232 +        raise VmError('unknown image type: ' + ty)
  36.233 +    return imageClass
    37.1 --- a/tools/python/xen/xend/server/DevController.py	Fri Sep 23 15:41:28 2005 -0600
    37.2 +++ b/tools/python/xen/xend/server/DevController.py	Mon Sep 26 11:07:49 2005 -0600
    37.3 @@ -126,20 +126,21 @@ class DevController:
    37.4          compulsory to use it; subclasses may prefer to allocate IDs based upon
    37.5          the device configuration instead.
    37.6          """
    37.7 -        path = self.frontendMiscPath()
    37.8 -        t = xstransact(path)
    37.9 -        try:
   37.10 -            result = t.read("nextDeviceID")
   37.11 -            if result:
   37.12 -                result = int(result)
   37.13 -            else:
   37.14 -                result = 1
   37.15 -            t.write("nextDeviceID", str(result + 1))
   37.16 -            t.commit()
   37.17 -            return result
   37.18 -        except:
   37.19 -            t.abort()
   37.20 -            raise
   37.21 +        while True:
   37.22 +            path = self.frontendMiscPath()
   37.23 +            t = xstransact(path)
   37.24 +            try:
   37.25 +                result = t.read("nextDeviceID")
   37.26 +                if result:
   37.27 +                    result = int(result)
   37.28 +                else:
   37.29 +                    result = 1
   37.30 +                t.write("nextDeviceID", str(result + 1))
   37.31 +                if t.commit():
   37.32 +                    return result
   37.33 +            except:
   37.34 +                t.abort()
   37.35 +                raise
   37.36  
   37.37  
   37.38      ## private:
    38.1 --- a/tools/python/xen/xend/xenstore/xsnode.py	Fri Sep 23 15:41:28 2005 -0600
    38.2 +++ b/tools/python/xen/xend/xenstore/xsnode.py	Mon Sep 26 11:07:49 2005 -0600
    38.3 @@ -280,8 +280,8 @@ class XenStore:
    38.4                                 (', while writing %s : %s' % (str(path),
    38.5                                                               str(data))))
    38.6  
    38.7 -    def begin(self, path):
    38.8 -        self.getxs().transaction_start(path)
    38.9 +    def begin(self):
   38.10 +        self.getxs().transaction_start()
   38.11  
   38.12      def commit(self, abandon=False):
   38.13          self.getxs().transaction_end(abort=abandon)
    39.1 --- a/tools/python/xen/xend/xenstore/xstransact.py	Fri Sep 23 15:41:28 2005 -0600
    39.2 +++ b/tools/python/xen/xend/xenstore/xstransact.py	Mon Sep 26 11:07:49 2005 -0600
    39.3 @@ -14,16 +14,8 @@ class xstransact:
    39.4      def __init__(self, path):
    39.5          self.in_transaction = False
    39.6          self.path = path.rstrip("/")
    39.7 -        while True:
    39.8 -            try:
    39.9 -                xshandle().transaction_start(path)
   39.10 -                self.in_transaction = True
   39.11 -                return
   39.12 -            except RuntimeError, ex:
   39.13 -                if ex.args[0] == errno.ENOENT and path != "/":
   39.14 -                    path = "/".join(path.split("/")[0:-1]) or "/"
   39.15 -                else:
   39.16 -                    raise
   39.17 +        xshandle().transaction_start()
   39.18 +        self.in_transaction = True
   39.19  
   39.20      def __del__(self):
   39.21          if self.in_transaction:
   39.22 @@ -175,14 +167,8 @@ class xstransact:
   39.23              t = cls(path)
   39.24              try:
   39.25                  v = t.read(*args)
   39.26 -                t.commit()
   39.27 -                return v
   39.28 -            except RuntimeError, ex:
   39.29                  t.abort()
   39.30 -                if ex.args[0] == errno.ETIMEDOUT:
   39.31 -                    pass
   39.32 -                else:
   39.33 -                    raise
   39.34 +                return v
   39.35              except:
   39.36                  t.abort()
   39.37                  raise
   39.38 @@ -194,14 +180,8 @@ class xstransact:
   39.39              t = cls(path)
   39.40              try:
   39.41                  t.write(*args, **opts)
   39.42 -                t.commit()
   39.43 -                return
   39.44 -            except RuntimeError, ex:
   39.45 -                t.abort()
   39.46 -                if ex.args[0] == errno.ETIMEDOUT:
   39.47 -                    pass
   39.48 -                else:
   39.49 -                    raise
   39.50 +                if t.commit():
   39.51 +                    return
   39.52              except:
   39.53                  t.abort()
   39.54                  raise
   39.55 @@ -217,14 +197,8 @@ class xstransact:
   39.56              t = cls(path)
   39.57              try:
   39.58                  t.remove(*args)
   39.59 -                t.commit()
   39.60 -                return
   39.61 -            except RuntimeError, ex:
   39.62 -                t.abort()
   39.63 -                if ex.args[0] == errno.ETIMEDOUT:
   39.64 -                    pass
   39.65 -                else:
   39.66 -                    raise
   39.67 +                if t.commit():
   39.68 +                    return
   39.69              except:
   39.70                  t.abort()
   39.71                  raise
   39.72 @@ -236,14 +210,8 @@ class xstransact:
   39.73              t = cls(path)
   39.74              try:
   39.75                  v = t.list(*args)
   39.76 -                t.commit()
   39.77 -                return v
   39.78 -            except RuntimeError, ex:
   39.79 -                t.abort()
   39.80 -                if ex.args[0] == errno.ETIMEDOUT:
   39.81 -                    pass
   39.82 -                else:
   39.83 -                    raise
   39.84 +                if t.commit():
   39.85 +                    return v
   39.86              except:
   39.87                  t.abort()
   39.88                  raise
   39.89 @@ -255,14 +223,8 @@ class xstransact:
   39.90              t = cls(path)
   39.91              try:
   39.92                  v = t.gather(*args)
   39.93 -                t.commit()
   39.94 -                return v
   39.95 -            except RuntimeError, ex:
   39.96 -                t.abort()
   39.97 -                if ex.args[0] == errno.ETIMEDOUT:
   39.98 -                    pass
   39.99 -                else:
  39.100 -                    raise
  39.101 +                if t.commit():
  39.102 +                    return v
  39.103              except:
  39.104                  t.abort()
  39.105                  raise
  39.106 @@ -274,14 +236,8 @@ class xstransact:
  39.107              t = cls(path)
  39.108              try:
  39.109                  v = t.store(*args)
  39.110 -                t.commit()
  39.111 -                return v
  39.112 -            except RuntimeError, ex:
  39.113 -                t.abort()
  39.114 -                if ex.args[0] == errno.ETIMEDOUT:
  39.115 -                    pass
  39.116 -                else:
  39.117 -                    raise
  39.118 +                if t.commit():
  39.119 +                    return v
  39.120              except:
  39.121                  t.abort()
  39.122                  raise
    40.1 --- a/tools/python/xen/xm/main.py	Fri Sep 23 15:41:28 2005 -0600
    40.2 +++ b/tools/python/xen/xm/main.py	Mon Sep 26 11:07:49 2005 -0600
    40.3 @@ -1,5 +1,6 @@
    40.4  # (C) Copyright IBM Corp. 2005
    40.5  # Copyright (C) 2004 Mike Wray
    40.6 +# Copyright (c) 2005 XenSource Ltd
    40.7  #
    40.8  # Authors:
    40.9  #     Sean Dague <sean at dague dot net>
   40.10 @@ -169,12 +170,6 @@ def handle_xend_error(cmd, dom, ex):
   40.11  #
   40.12  #########################################################################
   40.13  
   40.14 -def xm_create(args):
   40.15 -    from xen.xm import create
   40.16 -    # ugly hack because the opt parser apparently wants
   40.17 -    # the subcommand name just to throw it away!
   40.18 -    create.main(["bogus"] + args)
   40.19 -
   40.20  def xm_save(args):
   40.21      arg_check(args,2,"save")
   40.22  
   40.23 @@ -196,13 +191,6 @@ def xm_restore(args):
   40.24      if id is not None:
   40.25          server.xend_domain_unpause(domid)
   40.26  
   40.27 -def xm_migrate(args):
   40.28 -    # TODO: arg_check
   40.29 -    from xen.xm import migrate
   40.30 -    # ugly hack because the opt parser apparently wants
   40.31 -    # the subcommand name just to throw it away!
   40.32 -    migrate.main(["bogus"] + args)
   40.33 -
   40.34  def xm_list(args):
   40.35      use_long = 0
   40.36      show_vcpus = 0
   40.37 @@ -290,14 +278,6 @@ def xm_show_vcpus(domsinfo):
   40.38  def xm_vcpu_list(args):
   40.39      xm_list(["-v"] + args)
   40.40  
   40.41 -def xm_destroy(args):
   40.42 -    arg_check(args,1,"destroy")
   40.43 -
   40.44 -    from xen.xm import destroy
   40.45 -    # ugly hack because the opt parser apparently wants
   40.46 -    # the subcommand name just to throw it away!
   40.47 -    destroy.main(["bogus"] + args)
   40.48 -            
   40.49  def xm_reboot(args):
   40.50      arg_check(args,1,"reboot")
   40.51      from xen.xm import shutdown
   40.52 @@ -305,20 +285,6 @@ def xm_reboot(args):
   40.53      # the subcommand name just to throw it away!
   40.54      shutdown.main(["bogus", "-R"] + args)
   40.55  
   40.56 -def xm_shutdown(args):
   40.57 -    arg_check(args,1,"shutdown")
   40.58 -
   40.59 -    from xen.xm import shutdown
   40.60 -    # ugly hack because the opt parser apparently wants
   40.61 -    # the subcommand name just to throw it away!
   40.62 -    shutdown.main(["bogus"] + args)
   40.63 -
   40.64 -def xm_sysrq(args):
   40.65 -    from xen.xm import sysrq
   40.66 -    # ugly hack because the opt parser apparently wants
   40.67 -    # the subcommand name just to throw it away!
   40.68 -    sysrq.main(["bogus"] + args)
   40.69 -
   40.70  def xm_pause(args):
   40.71      arg_check(args, 1, "pause")
   40.72      dom = args[0]
   40.73 @@ -333,6 +299,11 @@ def xm_unpause(args):
   40.74      from xen.xend.XendClient import server
   40.75      server.xend_domain_unpause(dom)
   40.76  
   40.77 +def xm_subcommand(command, args):
   40.78 +    cmd = __import__(command, globals(), locals(), 'xen.xm')
   40.79 +    cmd.main(["bogus"] + args)
   40.80 +
   40.81 +
   40.82  #############################################################
   40.83  
   40.84  def cpu_make_map(cpulist):
   40.85 @@ -506,14 +477,6 @@ def xm_network_list(args):
   40.86          sxp.show(x)
   40.87          print
   40.88  
   40.89 -def xm_network_attach(args):
   40.90 -
   40.91 -    print "Not implemented"
   40.92 -
   40.93 -def xm_network_detach(args):
   40.94 -
   40.95 -    print "Not implemented"
   40.96 -    
   40.97  def xm_block_list(args):
   40.98      arg_check(args,1,"block-list")
   40.99      dom = args[0]
  40.100 @@ -609,11 +572,8 @@ commands = {
  40.101      # domain commands
  40.102      "domid": xm_domid,
  40.103      "domname": xm_domname,
  40.104 -    "create": xm_create,
  40.105 -    "destroy": xm_destroy,
  40.106      "restore": xm_restore,
  40.107      "save": xm_save,
  40.108 -    "shutdown": xm_shutdown,
  40.109      "reboot": xm_reboot,
  40.110      "list": xm_list,
  40.111      # memory commands
  40.112 @@ -625,10 +585,7 @@ commands = {
  40.113      "vcpu-enable": xm_vcpu_enable,
  40.114      "vcpu-disable": xm_vcpu_disable,
  40.115      "vcpu-list": xm_vcpu_list,
  40.116 -    # migration
  40.117 -    "migrate": xm_migrate,
  40.118      # special
  40.119 -    "sysrq": xm_sysrq,
  40.120      "pause": xm_pause,
  40.121      "unpause": xm_unpause,
  40.122      # host commands
  40.123 @@ -647,14 +604,24 @@ commands = {
  40.124      # network
  40.125      "network-limit": xm_network_limit,
  40.126      "network-list": xm_network_list,
  40.127 -    "network-attach": xm_network_attach,
  40.128 -    "network-detach": xm_network_detach,
  40.129      # vnet
  40.130      "vnet-list": xm_vnet_list,
  40.131      "vnet-create": xm_vnet_create,
  40.132      "vnet-delete": xm_vnet_delete,
  40.133      }
  40.134  
  40.135 +## The commands supported by a separate argument parser in xend.xm.
  40.136 +subcommands = [
  40.137 +    'create',
  40.138 +    'destroy',
  40.139 +    'migrate',
  40.140 +    'sysrq',
  40.141 +    'shutdown'
  40.142 +    ]
  40.143 +
  40.144 +for c in subcommands:
  40.145 +    commands[c] = eval('lambda args: xm_subcommand("%s", args)' % c)
  40.146 +
  40.147  aliases = {
  40.148      "balloon": "mem-set",
  40.149      "vif-list": "network-list",
  40.150 @@ -669,6 +636,7 @@ help = {
  40.151      "--long": longhelp
  40.152     }
  40.153  
  40.154 +
  40.155  def xm_lookup_cmd(cmd):
  40.156      if commands.has_key(cmd):
  40.157          return commands[cmd]
  40.158 @@ -688,9 +656,7 @@ def deprecated(old,new):
  40.159      err('Option %s is the new replacement, see "xm help %s" for more info' % (new, new))
  40.160  
  40.161  def usage(cmd=None):
  40.162 -    if cmd == "full":
  40.163 -        print fullhelp
  40.164 -    elif help.has_key(cmd):
  40.165 +    if help.has_key(cmd):
  40.166          print help[cmd]
  40.167      else:
  40.168          print shorthelp
  40.169 @@ -701,7 +667,7 @@ def main(argv=sys.argv):
  40.170          usage()
  40.171      
  40.172      if re.compile('-*help').match(argv[1]):
  40.173 -	if len(argv) > 2 and help.has_key(argv[2]):
  40.174 +	if len(argv) > 2:
  40.175  	    usage(argv[2])
  40.176  	else:
  40.177  	    usage()
    41.1 --- a/tools/xenstore/Makefile	Fri Sep 23 15:41:28 2005 -0600
    41.2 +++ b/tools/xenstore/Makefile	Mon Sep 26 11:07:49 2005 -0600
    41.3 @@ -28,11 +28,11 @@ CLIENTS := xenstore-exists xenstore-list
    41.4  CLIENTS += xenstore-write
    41.5  CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
    41.6  
    41.7 -all: libxenstore.so xenstored $(CLIENTS)
    41.8 +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump
    41.9  
   41.10  testcode: xs_test xenstored_test xs_random xs_dom0_test
   41.11  
   41.12 -xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o
   41.13 +xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o tdb.o
   41.14  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
   41.15  
   41.16  $(CLIENTS): libxenstore.so
   41.17 @@ -42,7 +42,10 @@ xenstored: xenstored_core.o xenstored_wa
   41.18  $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
   41.19  	$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
   41.20  
   41.21 -xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o
   41.22 +xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
   41.23 +	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   41.24 +
   41.25 +xs_tdb_dump: xs_tdb_dump.o utils.o tdb.o talloc.o
   41.26  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
   41.27  
   41.28  xs_test: xs_test.o xs_lib.o utils.o
   41.29 @@ -50,6 +53,11 @@ xs_random: xs_random.o xs_test_lib.o xs_
   41.30  xs_stress: xs_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o
   41.31  xs_crashme: xs_crashme.o xs_lib.o talloc.o utils.o
   41.32  
   41.33 +speedtest: speedtest.o xs.o xs_lib.o utils.o talloc.o
   41.34 +
   41.35 +check-speed: speedtest xenstored_test $(TESTDIR)
   41.36 +	$(TESTENV) time ./speedtest 100
   41.37 +
   41.38  xs_test.o xs_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o xs_crashme.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS)
   41.39  
   41.40  xenstored_%_test.o: xenstored_%.c
   41.41 @@ -98,7 +106,7 @@ RANDSEED=$(shell date +%s)
   41.42  randomcheck: xs_random xenstored_test $(TESTDIR)
   41.43  	$(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) && echo
   41.44  	$(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) && echo
   41.45 -	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   41.46 +#	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
   41.47  
   41.48  crashme:  xs_crashme xenstored_test $(TESTDIR)
   41.49  	rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace
    42.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    42.2 +++ b/tools/xenstore/speedtest.c	Mon Sep 26 11:07:49 2005 -0600
    42.3 @@ -0,0 +1,130 @@
    42.4 +/* 
    42.5 +    Xen Store Daemon Speed test
    42.6 +    Copyright (C) 2005 Rusty Russell IBM Corporation
    42.7 +
    42.8 +    This program is free software; you can redistribute it and/or modify
    42.9 +    it under the terms of the GNU General Public License as published by
   42.10 +    the Free Software Foundation; either version 2 of the License, or
   42.11 +    (at your option) any later version.
   42.12 +
   42.13 +    This program is distributed in the hope that it will be useful,
   42.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   42.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   42.16 +    GNU General Public License for more details.
   42.17 +
   42.18 +    You should have received a copy of the GNU General Public License
   42.19 +    along with this program; if not, write to the Free Software
   42.20 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   42.21 +*/
   42.22 +
   42.23 +#include <stdlib.h>
   42.24 +#include <sys/types.h>
   42.25 +#include <sys/wait.h>
   42.26 +#include <stdio.h>
   42.27 +#include <stdarg.h>
   42.28 +#include <unistd.h>
   42.29 +#include <fcntl.h>
   42.30 +#include <errno.h>
   42.31 +#include "utils.h"
   42.32 +#include "xs.h"
   42.33 +#include "list.h"
   42.34 +#include "talloc.h"
   42.35 +/* Run cmd through system(); call barf_perror() unless it exited normally with status 0. */
   42.36 +static void do_command(const char *cmd)
   42.37 +{
   42.38 +	int ret;
   42.39 +
   42.40 +	ret = system(cmd);
   42.41 +	if (ret == -1 || !WIFEXITED(ret) || WEXITSTATUS(ret) != 0)	/* system() failed, abnormal exit, or non-zero status */
   42.42 +		barf_perror("Failed '%s': %i", cmd, ret);
   42.43 +}
   42.44 +/* Clear testsuite/tmp, fork ./xenstored_test and wait for its first output; returns the child's pid. */
   42.45 +static int start_daemon(void)
   42.46 +{
   42.47 +	int fds[2], pid;
   42.48 +
   42.49 +	do_command(talloc_asprintf(NULL, "rm -rf testsuite/tmp/*"));
   42.50 +	/* Start daemon.  A failed fork() (-1) must never be handed to kill(). */
   42.51 +	if (pipe(fds) != 0 || (pid = fork()) == -1)
   42.52 +		barf_perror("Failed to start daemon");
   42.53 +	if (pid != 0) {
   42.54 +		/* Child writes PID when it's ready: we wait for that. */
   42.55 +		char buffer[20];
   42.56 +		close(fds[1]);
   42.57 +		if (read(fds[0], buffer, sizeof(buffer)) <= 0)	/* 0 is EOF: the child went away without reporting */
   42.58 +			barf("Failed to summon daemon");
   42.59 +		close(fds[0]);
   42.60 +	} else {
   42.61 +		dup2(fds[1], STDOUT_FILENO);	/* daemon's stdout becomes the pipe the parent reads */
   42.62 +		close(fds[0]);
   42.63 +#if 0
   42.64 +		execlp("valgrind", "valgrind", "-q", "--suppressions=testsuite/vg-suppressions", "xenstored_test", "--output-pid",
   42.65 +		       "--no-fork", "--trace-file=/tmp/trace", NULL);
   42.66 +#else
   42.67 +		execlp("./xenstored_test", "xenstored_test", "--output-pid", "--no-fork", NULL);
   42.68 +//		execlp("strace", "strace", "-o", "/tmp/out", "./xenstored_test", "--output-pid", "--no-fork", NULL);
   42.69 +#endif
   42.70 +		exit(1);	/* only reached when execlp() failed */
   42.71 +	}
   42.72 +	return pid;
   42.73 +}
   42.74 +/* Send SIGTERM to pid while leaving errno exactly as the caller had it. */
   42.75 +static void kill_daemon(int pid)
   42.76 +{
   42.77 +	int saved_errno = errno;	/* callers follow up with barf_perror(), so errno must survive kill() */
   42.78 +	kill(pid, SIGTERM);
   42.79 +	errno = saved_errno;
   42.80 +}
   42.81 +
   42.82 +#define NUM_ENTRIES 50
   42.83 +
   42.84 +/* We create the given number of trees, each with NUM_ENTRIES, using
   42.85 + * one transaction per tree; a '.' is written every "print" trees. */
   42.86 +int main(int argc, char *argv[])
   42.87 +{
   42.88 +	int i, j, pid, print, total;
   42.89 +	struct xs_handle *h;
   42.90 +
   42.91 +	if (argc != 2)
   42.92 +		barf("Usage: speedtest <numdomains>");
   42.93 +
   42.94 +	pid = start_daemon();
   42.95 +	if (!(h = xs_daemon_open())) {	/* don't dereference a failed connection below */
   42.96 +		kill_daemon(pid);
   42.97 +		barf_perror("Connecting to daemon");
   42.98 +	}
   42.99 +	total = atoi(argv[1]);	/* parsed once instead of on every loop test */
  42.100 +	print = total >= 76 ? total / 76 : 1;	/* about 76 dots at most; never a zero divisor */
  42.101 +	for (i = 0; i < total; i++) {
  42.102 +		char name[64];
  42.103 +		if (i % print == 0)
  42.104 +			write(1, ".", 1);
  42.105 +		if (!xs_transaction_start(h, "/")) {
  42.106 +			kill_daemon(pid);
  42.107 +			barf_perror("Starting transaction");
  42.108 +		}
  42.109 +		sprintf(name, "/%i", i);
  42.110 +		if (!xs_mkdir(h, name)) {
  42.111 +			kill_daemon(pid);
  42.112 +			barf_perror("Making directory %s", name);
  42.113 +		}
  42.114 +		for (j = 0; j < NUM_ENTRIES; j++) {
  42.115 +			sprintf(name, "/%i/%i", i, j);
  42.116 +			if (!xs_write(h, name, name, strlen(name))) {	/* each key stores its own path as its value */
  42.117 +				kill_daemon(pid);
  42.118 +				barf_perror("Writing %s", name);
  42.119 +			}
  42.120 +		}
  42.121 +		if (!xs_transaction_end(h, false)) {
  42.122 +			kill_daemon(pid);
  42.123 +			barf_perror("Ending transaction");
  42.124 +		}
  42.125 +	}
  42.126 +	write(1, "\n", 1);
  42.127 +
  42.128 +	kill_daemon(pid);
  42.129 +	wait(NULL);	/* reap the daemon before exiting */
  42.130 +	return 0;
  42.131 +}
  42.132 +	
  42.133 +	
    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/tools/xenstore/tdb.c	Mon Sep 26 11:07:49 2005 -0600
    43.3 @@ -0,0 +1,2151 @@
    43.4 + /* 
    43.5 +   Unix SMB/CIFS implementation.
    43.6 +
    43.7 +   trivial database library
    43.8 +
    43.9 +   Copyright (C) Andrew Tridgell              1999-2004
   43.10 +   Copyright (C) Paul `Rusty' Russell		   2000
   43.11 +   Copyright (C) Jeremy Allison			   2000-2003
   43.12 +   
   43.13 +     ** NOTE! The following LGPL license applies to the tdb
   43.14 +     ** library. This does NOT imply that all of Samba is released
   43.15 +     ** under the LGPL
   43.16 +   
   43.17 +   This library is free software; you can redistribute it and/or
   43.18 +   modify it under the terms of the GNU Lesser General Public
   43.19 +   License as published by the Free Software Foundation; either
   43.20 +   version 2 of the License, or (at your option) any later version.
   43.21 +
   43.22 +   This library is distributed in the hope that it will be useful,
   43.23 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   43.24 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   43.25 +   Lesser General Public License for more details.
   43.26 +
   43.27 +   You should have received a copy of the GNU Lesser General Public
   43.28 +   License along with this library; if not, write to the Free Software
   43.29 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   43.30 +*/
   43.31 +
   43.32 +
   43.33 +#ifndef _SAMBA_BUILD_
   43.34 +#if HAVE_CONFIG_H
   43.35 +#include <config.h>
   43.36 +#endif
   43.37 +
   43.38 +#include <stdlib.h>
   43.39 +#include <stdio.h>
   43.40 +#include <stdint.h>
   43.41 +#include <fcntl.h>
   43.42 +#include <unistd.h>
   43.43 +#include <string.h>
   43.44 +#include <fcntl.h>
   43.45 +#include <errno.h>
   43.46 +#include <sys/mman.h>
   43.47 +#include <sys/stat.h>
   43.48 +#include "tdb.h"
   43.49 +#include <stdarg.h>
   43.50 +#include "talloc.h"
   43.51 +#define HAVE_MMAP
   43.52 +#else
   43.53 +#include "includes.h"
   43.54 +#include "lib/tdb/include/tdb.h"
   43.55 +#include "system/time.h"
   43.56 +#include "system/shmem.h"
   43.57 +#include "system/filesys.h"
   43.58 +#endif
   43.59 +
   43.60 +#define TDB_MAGIC_FOOD "TDB file\n"
   43.61 +#define TDB_VERSION (0x26011967 + 6)
   43.62 +#define TDB_MAGIC (0x26011999U)
   43.63 +#define TDB_FREE_MAGIC (~TDB_MAGIC)
   43.64 +#define TDB_DEAD_MAGIC (0xFEE1DEAD)
   43.65 +#define TDB_ALIGNMENT 4
   43.66 +#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
   43.67 +#define DEFAULT_HASH_SIZE 131
   43.68 +#define TDB_PAGE_SIZE 0x2000
   43.69 +#define FREELIST_TOP (sizeof(struct tdb_header))
   43.70 +#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
   43.71 +#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
   43.72 +#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
   43.73 +#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
   43.74 +#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
   43.75 +#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
   43.76 +
   43.77 +
   43.78 +/* NB assumes there is a local variable called "tdb" that is the
   43.79 + * current context, also takes doubly-parenthesized print-style
   43.80 + * argument. */
   43.81 +#define TDB_LOG(x) tdb->log_fn x
   43.82 +
   43.83 +/* lock offsets */
   43.84 +#define GLOBAL_LOCK 0
   43.85 +#define ACTIVE_LOCK 4
   43.86 +
   43.87 +#ifndef MAP_FILE
   43.88 +#define MAP_FILE 0
   43.89 +#endif
   43.90 +
   43.91 +#ifndef MAP_FAILED
   43.92 +#define MAP_FAILED ((void *)-1)
   43.93 +#endif
   43.94 +
   43.95 +#ifndef discard_const_p
   43.96 +# if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
   43.97 +#  define discard_const(ptr) ((void *)((intptr_t)(ptr)))
   43.98 +# else
   43.99 +#  define discard_const(ptr) ((void *)(ptr))
  43.100 +# endif
  43.101 +# define discard_const_p(type, ptr) ((type *)discard_const(ptr))
  43.102 +#endif
  43.103 +
  43.104 +/* free memory if the pointer is valid and zero the pointer */
  43.105 +#ifndef SAFE_FREE
  43.106 +#define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
  43.107 +#endif
  43.108 +
  43.109 +#define BUCKET(hash) ((hash) % tdb->header.hash_size)
  43.110 +TDB_DATA tdb_null;
  43.111 +
  43.112 +/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
  43.113 +static TDB_CONTEXT *tdbs = NULL;
  43.114 +/* Drop the file mapping, if there is one.  Returns 0, or munmap()'s non-zero result. */
  43.115 +static int tdb_munmap(TDB_CONTEXT *tdb)
  43.116 +{
  43.117 +	if (tdb->flags & TDB_INTERNAL)
  43.118 +		return 0;	/* TDB_INTERNAL tdbs are never unmapped here */
  43.119 +
  43.120 +#ifdef HAVE_MMAP
  43.121 +	if (tdb->map_ptr) {
  43.122 +		int ret = munmap(tdb->map_ptr, tdb->map_size);
  43.123 +		if (ret != 0)
  43.124 +			return ret;	/* map_ptr is left untouched on failure */
  43.125 +	}
  43.126 +#endif
  43.127 +	tdb->map_ptr = NULL;
  43.128 +	return 0;
  43.129 +}
  43.130 +/* Map tdb->fd for map_size bytes; map_ptr is left NULL whenever no mapping is made. */
  43.131 +static void tdb_mmap(TDB_CONTEXT *tdb)
  43.132 +{
  43.133 +	if (tdb->flags & TDB_INTERNAL)
  43.134 +		return;
  43.135 +
  43.136 +#ifdef HAVE_MMAP
  43.137 +	if (!(tdb->flags & TDB_NOMMAP)) {
  43.138 +		tdb->map_ptr = mmap(NULL, tdb->map_size, 
  43.139 +				    PROT_READ|(tdb->read_only? 0:PROT_WRITE), 
  43.140 +				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
  43.141 +
  43.142 +		/*
  43.143 +		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
  43.144 +		 */
  43.145 +
  43.146 +		if (tdb->map_ptr == MAP_FAILED) {
  43.147 +			tdb->map_ptr = NULL;	/* tdb_read/tdb_write then take their non-mapped path */
  43.148 +			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n", 
  43.149 +				 tdb->map_size, strerror(errno)));
  43.150 +		}
  43.151 +	} else {
  43.152 +		tdb->map_ptr = NULL;
  43.153 +	}
  43.154 +#else
  43.155 +	tdb->map_ptr = NULL;
  43.156 +#endif
  43.157 +}
  43.158 +
  43.159 +/* Endian conversion: byte-swaps size/4 u32 words of buf in place and returns buf (we only ever deal with 4 byte quantities) */
  43.160 +static void *convert(void *buf, u32 size)
  43.161 +{
  43.162 +	u32 i, *p = buf;
  43.163 +	for (i = 0; i < size / 4; i++)	/* any trailing size % 4 bytes are left as they are */
  43.164 +		p[i] = TDB_BYTEREV(p[i]);
  43.165 +	return buf;
  43.166 +}
  43.167 +#define DOCONV() (tdb->flags & TDB_CONVERT)	/* needs a local "tdb" in scope */
  43.168 +#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)	/* swaps x in place when DOCONV(); yields a pointer to x */
  43.169 +
  43.170 +/* record header: the body of the database is made of one list of these for
  43.171 +   the free space plus a separate data list for each hash value */
  43.172 +struct list_struct {
  43.173 +	tdb_off next; /* offset of the next record in the list */
  43.174 +	tdb_len rec_len; /* total byte length of record */
  43.175 +	tdb_len key_len; /* byte length of key */
  43.176 +	tdb_len data_len; /* byte length of data */
  43.177 +	u32 full_hash; /* the full 32 bit hash of the key */
  43.178 +	u32 magic;   /* try to catch errors */
  43.179 +	/* the following union is implied:
  43.180 +		union {
  43.181 +			char record[rec_len];
  43.182 +			struct {
  43.183 +				char key[key_len];
  43.184 +				char data[data_len];
  43.185 +			}
  43.186 +			u32 totalsize; (tailer)
  43.187 +		}
  43.188 +	*/
  43.189 +};
  43.190 +
  43.191 +/* a byte range locking function - return 0 on success
  43.192 +   this function locks/unlocks 1 byte at the specified offset: rw_type is
  43.193 +   the lock type put in the flock, lck_type the fcntl command; the first
  43.194 +   failure log is skipped when probe is set or lck_type is F_SETLK.
  43.195 +   On error, errno is also set so errors pass back through tdb_open(). */
  43.196 +static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset, 
  43.197 +		      int rw_type, int lck_type, int probe)
  43.198 +{
  43.199 +	struct flock fl;
  43.200 +	int ret;
  43.201 +
  43.202 +	if (tdb->flags & TDB_NOLOCK)
  43.203 +		return 0;
  43.204 +	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
  43.205 +		errno = EACCES;
  43.206 +		return -1;
  43.207 +	}
  43.208 +
  43.209 +	fl.l_type = rw_type;
  43.210 +	fl.l_whence = SEEK_SET;
  43.211 +	fl.l_start = offset;
  43.212 +	fl.l_len = 1;
  43.213 +	fl.l_pid = 0;
  43.214 +
  43.215 +	do {
  43.216 +		ret = fcntl(tdb->fd,lck_type,&fl);
  43.217 +	} while (ret == -1 && errno == EINTR);	/* retry when interrupted by a signal */
  43.218 +
  43.219 +	if (ret == -1) {
  43.220 +		if (!probe && lck_type != F_SETLK) {
  43.221 +			/* Ensure error code is set for log fun to examine. */
  43.222 +			tdb->ecode = TDB_ERR_LOCK;
  43.223 +			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n", 
  43.224 +				 tdb->fd, offset, rw_type, lck_type));
  43.225 +		}
  43.226 +		/* Generic lock error. errno set by fcntl.
  43.227 +		 * EAGAIN is an expected return from non-blocking
  43.228 +		 * locks. */
  43.229 +		if (errno != EAGAIN) {
  43.230 +		TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n", 
  43.231 +				 tdb->fd, offset, rw_type, lck_type, 
  43.232 +				 strerror(errno)));
  43.233 +		}
  43.234 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
  43.235 +	}
  43.236 +	return 0;
  43.237 +}
  43.238 +
  43.239 +/* lock a list in the database. list -1 is the alloc list; ltype is handed to tdb_brlock as the lock type. Returns 0 or -1. */
  43.240 +static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
  43.241 +{
  43.242 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  43.243 +		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n", 
  43.244 +			   list, ltype));
  43.245 +		return -1;
  43.246 +	}
  43.247 +	if (tdb->flags & TDB_NOLOCK)
  43.248 +		return 0;
  43.249 +
  43.250 +	/* Since fcntl locks don't nest, we do a lock for the first one,
  43.251 +	   and simply bump the count for future ones */
  43.252 +	if (tdb->locked[list+1].count == 0) {	/* locked[] is indexed by list+1, so list -1 uses slot 0 */
  43.253 +		if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
  43.254 +			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n", 
  43.255 +					   list, ltype, strerror(errno)));
  43.256 +			return -1;
  43.257 +		}
  43.258 +		tdb->locked[list+1].ltype = ltype;
  43.259 +	}
  43.260 +	tdb->locked[list+1].count++;
  43.261 +	return 0;
  43.262 +}
  43.263 +
  43.264 +/* unlock a list: drops one nesting level and releases the fcntl lock on the last one. Returns 0, or non-zero on error. */
  43.265 +	/* changed to return int it may be interesting to know there
  43.266 +	   has been an error  --simo */
  43.267 +static int tdb_unlock(TDB_CONTEXT *tdb, int list,
  43.268 +		      int ltype __attribute__((unused)))
  43.269 +{
  43.270 +	int ret = -1;
  43.271 +
  43.272 +	if (tdb->flags & TDB_NOLOCK)
  43.273 +		return 0;
  43.274 +
  43.275 +	/* Sanity checks */
  43.276 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
  43.277 +		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
  43.278 +		return ret;
  43.279 +	}
  43.280 +
  43.281 +	if (tdb->locked[list+1].count==0) {
  43.282 +		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
  43.283 +		return ret;
  43.284 +	}
  43.285 +
  43.286 +	if (tdb->locked[list+1].count == 1) {
  43.287 +		/* Down to last nested lock: unlock underneath */
  43.288 +		ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
  43.289 +	} else {
  43.290 +		ret = 0;
  43.291 +	}
  43.292 +	tdb->locked[list+1].count--;	/* decremented even when the underlying unlock failed */
  43.293 +
  43.294 +	if (ret)
  43.295 +		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n")); 
  43.296 +	return ret;
  43.297 +}
  43.298 +
  43.299 +/* This is based on the hash algorithm from gdbm; hashes the key->dsize bytes at key->dptr into a u32 */
  43.300 +static u32 default_tdb_hash(TDB_DATA *key)
  43.301 +{
  43.302 +	u32 value;	/* Used to compute the hash value.  */
  43.303 +	u32   i;	/* Used to cycle through random values. */
  43.304 +
  43.305 +	/* Set the initial value from the key size. */
  43.306 +	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
  43.307 +		value = (value + (key->dptr[i] << (i*5 % 24)));	/* shift amount stays in 0..23 */
  43.308 +
  43.309 +	return (1103515243 * value + 12345);  
  43.310 +}
  43.311 +
  43.312 +/* check for an out of bounds access - if it is out of bounds then
  43.313 +   see if the database has been expanded by someone else and expand
  43.314 +   if necessary 
  43.315 +   note that "len" is the minimum length needed for the db; a non-zero
  43.316 +   "probe" suppresses the error logs.  Returns 0 when len is in bounds. */
  43.317 +static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
  43.318 +{
  43.319 +	struct stat st;
  43.320 +	if (len <= tdb->map_size)
  43.321 +		return 0;
  43.322 +	if (tdb->flags & TDB_INTERNAL) {
  43.323 +		if (!probe) {
  43.324 +			/* Ensure ecode is set for log fn. */
  43.325 +			tdb->ecode = TDB_ERR_IO;
  43.326 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
  43.327 +				 (int)len, (int)tdb->map_size));
  43.328 +		}
  43.329 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.330 +	}
  43.331 +
  43.332 +	if (fstat(tdb->fd, &st) == -1)
  43.333 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.334 +
  43.335 +	if (st.st_size < (off_t)len) {	/* the file itself is too short: a genuine out-of-bounds access */
  43.336 +		if (!probe) {
  43.337 +			/* Ensure ecode is set for log fn. */
  43.338 +			tdb->ecode = TDB_ERR_IO;
  43.339 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
  43.340 +				 (int)len, (int)st.st_size));
  43.341 +		}
  43.342 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.343 +	}
  43.344 +
  43.345 +	/* Unmap, update size, remap */
  43.346 +	if (tdb_munmap(tdb) == -1)
  43.347 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.348 +	tdb->map_size = st.st_size;
  43.349 +	tdb_mmap(tdb);
  43.350 +	return 0;
  43.351 +}
  43.352 +
  43.353 +/* write a lump of data at a specified offset; returns 0 on success, non-zero on bounds or I/O failure */
  43.354 +static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
  43.355 +{
  43.356 +	if (tdb_oob(tdb, off + len, 0) != 0)
  43.357 +		return -1;
  43.358 +
  43.359 +	if (tdb->map_ptr)
  43.360 +		memcpy(off + (char *)tdb->map_ptr, buf, len);	/* mapped: copy straight into the mapping */
  43.361 +#ifdef HAVE_PWRITE
  43.362 +	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
  43.363 +#else
  43.364 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  43.365 +		 || write(tdb->fd, buf, len) != (off_t)len) {
  43.366 +#endif
  43.367 +		/* Ensure ecode is set for log fn. */
  43.368 +		tdb->ecode = TDB_ERR_IO;
  43.369 +		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
  43.370 +			   off, len, strerror(errno)));
  43.371 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.372 +	}
  43.373 +	return 0;
  43.374 +}
  43.375 +
  43.376 +/* read a lump of data at a specified offset, maybe convert: a non-zero cv byte-swaps buf via convert(). Returns 0 on success. */
  43.377 +static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
  43.378 +{
  43.379 +	if (tdb_oob(tdb, off + len, 0) != 0)
  43.380 +		return -1;
  43.381 +
  43.382 +	if (tdb->map_ptr)
  43.383 +		memcpy(buf, off + (char *)tdb->map_ptr, len);	/* mapped: copy straight out of the mapping */
  43.384 +#ifdef HAVE_PREAD
  43.385 +	else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
  43.386 +#else
  43.387 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
  43.388 +		 || read(tdb->fd, buf, len) != (off_t)len) {
  43.389 +#endif
  43.390 +		/* Ensure ecode is set for log fn. */
  43.391 +		tdb->ecode = TDB_ERR_IO;
  43.392 +		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
  43.393 +			   off, len, strerror(errno)));
  43.394 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
  43.395 +	}
  43.396 +	if (cv)
  43.397 +		convert(buf, len);
  43.398 +	return 0;
  43.399 +}
  43.400 +
  43.401 +/* don't allocate memory: used in tdb_delete path. Compares key with the bytes stored at off: 1 if equal, 0 if not, -1 on error. */
  43.402 +static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
  43.403 +{
  43.404 +	char buf[64];
  43.405 +	u32 len;
  43.406 +
  43.407 +	if (tdb_oob(tdb, off + key.dsize, 0) != 0)
  43.408 +		return -1;
  43.409 +
  43.410 +	if (tdb->map_ptr)
  43.411 +		return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
  43.412 +
  43.413 +	while (key.dsize) {	/* not mapped: compare in chunks of at most sizeof(buf) bytes */
  43.414 +		len = key.dsize;
  43.415 +		if (len > sizeof(buf))
  43.416 +			len = sizeof(buf);
  43.417 +		if (tdb_read(tdb, off, buf, len, 0) != 0)
  43.418 +			return -1;
  43.419 +		if (memcmp(buf, key.dptr, len) != 0)
  43.420 +			return 0;
  43.421 +		key.dptr += len;	/* key was passed by value, so the caller's copy is unaffected */
  43.422 +		key.dsize -= len;
  43.423 +		off += len;
  43.424 +	}
  43.425 +	return 1;
  43.426 +}
  43.427 +
  43.428 +/* read a lump of data, allocating the space for it with talloc_size(tdb, len); returns the buffer, or NULL on failure */
  43.429 +static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
  43.430 +{
  43.431 +	char *buf;
  43.432 +
  43.433 +	if (!(buf = talloc_size(tdb, len))) {
  43.434 +		/* Ensure ecode is set for log fn. */
  43.435 +		tdb->ecode = TDB_ERR_OOM;
  43.436 +		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
  43.437 +			   len, strerror(errno)));
  43.438 +		return TDB_ERRCODE(TDB_ERR_OOM, buf);	/* buf is NULL on this path */
  43.439 +	}
  43.440 +	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
  43.441 +		SAFE_FREE(buf);
  43.442 +		return NULL;
  43.443 +	}
  43.444 +	return buf;
  43.445 +}
  43.446 +
  43.447 +/* read/write a tdb_off at the given file offset, byte-swapping when DOCONV(); both return tdb_read/tdb_write's result */
  43.448 +static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  43.449 +{
  43.450 +	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
  43.451 +}
  43.452 +static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
  43.453 +{
  43.454 +	tdb_off off = *d;	/* work on a copy: CONVERT() swaps its argument in place */
  43.455 +	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
  43.456 +}
  43.457 +
  43.458 +/* read/write a record header at offset, byte-swapping when DOCONV(); rec_read also rejects a bad magic */
  43.459 +static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  43.460 +{
  43.461 +	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
  43.462 +		return -1;
  43.463 +	if (TDB_BAD_MAGIC(rec)) {
  43.464 +		/* Ensure ecode is set for log fn. */
  43.465 +		tdb->ecode = TDB_ERR_CORRUPT;
  43.466 +		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
  43.467 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  43.468 +	}
  43.469 +	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);	/* also bounds-check a header-sized span at rec->next */
  43.470 +}
  43.471 +static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  43.472 +{
  43.473 +	struct list_struct r = *rec;	/* work on a copy: CONVERT() swaps its argument in place */
  43.474 +	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
  43.475 +}
  43.476 +
  43.477 +/* read a freelist record and check for simple errors; a record still carrying TDB_MAGIC is repaired on disk. Returns 0 or -1. */
  43.478 +static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
  43.479 +{
  43.480 +	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
  43.481 +		return -1;
  43.482 +
  43.483 +	if (rec->magic == TDB_MAGIC) {
  43.484 +		/* this happens when an app is shut down while deleting a record - we should
  43.485 +		   not completely fail when this happens */
  43.486 +		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n", 
  43.487 +			 rec->magic, off));
  43.488 +		rec->magic = TDB_FREE_MAGIC;
  43.489 +		if (rec_write(tdb, off, rec) == -1)	/* rec was converted on read: rec_write() swaps a copy back when DOCONV() */
  43.490 +			return -1;
  43.491 +	}
  43.492 +
  43.493 +	if (rec->magic != TDB_FREE_MAGIC) {
  43.494 +		/* Ensure ecode is set for log fn. */
  43.495 +		tdb->ecode = TDB_ERR_CORRUPT;
  43.496 +		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n", 
  43.497 +			   rec->magic, off));
  43.498 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  43.499 +	}
  43.500 +	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
  43.501 +		return -1;
  43.502 +	return 0;
  43.503 +}
  43.504 +
  43.505 +/* update a record tailer (must hold allocation lock): stores sizeof(*rec)+rec_len in the record's last tdb_off */
  43.506 +static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
  43.507 +			 const struct list_struct *rec)
  43.508 +{
  43.509 +	tdb_off totalsize;
  43.510 +
  43.511 +	/* Offset of tailer from record header */
  43.512 +	totalsize = sizeof(*rec) + rec->rec_len;
  43.513 +	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
  43.514 +			 &totalsize);
  43.515 +}
  43.516 +/* Print the record header at offset and check its tailer; returns rec.next, or 0 if the header cannot be read. */
  43.517 +static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
  43.518 +{
  43.519 +	struct list_struct rec;
  43.520 +	tdb_off tailer_ofs, tailer;
  43.521 +
  43.522 +	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
  43.523 +		printf("ERROR: failed to read record at %u\n", offset);
  43.524 +		return 0;
  43.525 +	}
  43.526 +
  43.527 +	printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
  43.528 +	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
  43.529 +
  43.530 +	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);	/* tailer is the record's last tdb_off */
  43.531 +	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
  43.532 +		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
  43.533 +		return rec.next;
  43.534 +	}
  43.535 +
  43.536 +	if (tailer != rec.rec_len + sizeof(rec)) {
  43.537 +		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
  43.538 +				(unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
  43.539 +	}
  43.540 +	return rec.next;
  43.541 +}
  43.542 +/* Print every record on chain i (-1 selects the freelist) while holding that chain's write lock. */
  43.543 +static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
  43.544 +{
  43.545 +	tdb_off rec_ptr, top;
  43.546 +
  43.547 +	top = (i == -1) ? FREELIST_TOP : TDB_HASH_TOP(i);	/* BUCKET()'s modulo may not map -1 to the freelist slot */
  43.548 +
  43.549 +	if (tdb_lock(tdb, i, F_WRLCK) != 0)
  43.550 +		return -1;
  43.551 +
  43.552 +	if (ofs_read(tdb, top, &rec_ptr) == -1)
  43.553 +		return tdb_unlock(tdb, i, F_WRLCK);
  43.554 +
  43.555 +	if (rec_ptr)
  43.556 +		printf("hash=%d\n", i);
  43.557 +
  43.558 +	while (rec_ptr) {	/* tdb_dump_record() returns 0 at the end of the chain or on a read error */
  43.559 +		rec_ptr = tdb_dump_record(tdb, rec_ptr);
  43.560 +	}
  43.561 +
  43.562 +	return tdb_unlock(tdb, i, F_WRLCK);
  43.563 +}
  43.564 +/* Dump every hash chain in turn, then the freelist (chain -1), to stdout. */
  43.565 +void tdb_dump_all(TDB_CONTEXT *tdb)
  43.566 +{
  43.567 +	unsigned int i;
  43.568 +	for (i=0;i<tdb->header.hash_size;i++) {
  43.569 +		tdb_dump_chain(tdb, i);
  43.570 +	}
  43.571 +	printf("freelist:\n");
  43.572 +	tdb_dump_chain(tdb, -1);
  43.573 +}
  43.574 +/* Print each freelist entry and the total free rec_len, holding the alloc lock; -1 on a bad or unreadable record. */
  43.575 +int tdb_printfreelist(TDB_CONTEXT *tdb)
  43.576 +{
  43.577 +	int ret;
  43.578 +	long total_free = 0;
  43.579 +	tdb_off offset, rec_ptr;
  43.580 +	struct list_struct rec;
  43.581 +
  43.582 +	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
  43.583 +		return ret;
  43.584 +
  43.585 +	offset = FREELIST_TOP;
  43.586 +
  43.587 +	/* read in the freelist top */
  43.588 +	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
  43.589 +		tdb_unlock(tdb, -1, F_WRLCK);
  43.590 +		return 0;	/* NOTE(review): 0 here, unlike the -1 of the failures below - confirm intent */
  43.591 +	}
  43.592 +
  43.593 +	printf("freelist top=[0x%08x]\n", rec_ptr );
  43.594 +	while (rec_ptr) {
  43.595 +		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
  43.596 +			tdb_unlock(tdb, -1, F_WRLCK);
  43.597 +			return -1;
  43.598 +		}
  43.599 +
  43.600 +		if (rec.magic != TDB_FREE_MAGIC) {
  43.601 +			printf("bad magic 0x%08x in free list\n", rec.magic);
  43.602 +			tdb_unlock(tdb, -1, F_WRLCK);
  43.603 +			return -1;
  43.604 +		}
  43.605 +
  43.606 +		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n", 
  43.607 +		       rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
  43.608 +		total_free += rec.rec_len;
  43.609 +
  43.610 +		/* move to the next record */
  43.611 +		rec_ptr = rec.next;
  43.612 +	}
  43.613 +	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free, 
  43.614 +               (int)total_free);
  43.615 +
  43.616 +	return tdb_unlock(tdb, -1, F_WRLCK);
  43.617 +}
  43.618 +
  43.619 +/* Remove an element from the freelist.  Must have alloc lock.  Whatever points at off is rewritten to point at next. */
  43.620 +static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
  43.621 +{
  43.622 +	tdb_off last_ptr, i;
  43.623 +
  43.624 +	/* read in the freelist top */
  43.625 +	last_ptr = FREELIST_TOP;
  43.626 +	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {	/* stops on a read error or at the end of the list */
  43.627 +		if (i == off) {
  43.628 +			/* We've found it! */
  43.629 +			return ofs_write(tdb, last_ptr, &next);
  43.630 +		}
  43.631 +		/* Follow chain (next offset is at start of record) */
  43.632 +		last_ptr = i;
  43.633 +	}
  43.634 +	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
  43.635 +	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
  43.636 +}
  43.637 +
  43.638 +/* Add an element into the freelist. Merge adjacent records if
  43.639 +   necessary. */
  43.640 +static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
  43.641 +{
  43.642 +	tdb_off right, left;
  43.643 +
  43.644 +	/* Allocation and tailer lock */
  43.645 +	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
  43.646 +		return -1;
  43.647 +
  43.648 +	/* set an initial tailer, so if we fail we don't leave a bogus record */
  43.649 +	if (update_tailer(tdb, offset, rec) != 0) {
  43.650 +		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
  43.651 +		goto fail;
  43.652 +	}
  43.653 +
  43.654 +	/* Look right first (I'm an Australian, dammit) */
  43.655 +	right = offset + sizeof(*rec) + rec->rec_len;
  43.656 +	if (right + sizeof(*rec) <= tdb->map_size) {
  43.657 +		struct list_struct r;
  43.658 +
  43.659 +		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
  43.660 +			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
  43.661 +			goto left;
  43.662 +		}
  43.663 +
  43.664 +		/* If it's free, expand to include it. */
  43.665 +		if (r.magic == TDB_FREE_MAGIC) {
  43.666 +			if (remove_from_freelist(tdb, right, r.next) == -1) {
  43.667 +				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
  43.668 +				goto left;
  43.669 +			}
  43.670 +			rec->rec_len += sizeof(r) + r.rec_len;
  43.671 +		}
  43.672 +	}
  43.673 +
  43.674 +left:
  43.675 +	/* Look left */
  43.676 +	left = offset - sizeof(tdb_off);
  43.677 +	if (left > TDB_DATA_START(tdb->header.hash_size)) {
  43.678 +		struct list_struct l;
  43.679 +		tdb_off leftsize;
  43.680 +		
  43.681 +		/* Read in tailer and jump back to header */
  43.682 +		if (ofs_read(tdb, left, &leftsize) == -1) {
  43.683 +			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
  43.684 +			goto update;
  43.685 +		}
  43.686 +		left = offset - leftsize;
  43.687 +
  43.688 +		/* Now read in record */
  43.689 +		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
  43.690 +			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
  43.691 +			goto update;
  43.692 +		}
  43.693 +
  43.694 +		/* If it's free, expand to include it. */
  43.695 +		if (l.magic == TDB_FREE_MAGIC) {
  43.696 +			if (remove_from_freelist(tdb, left, l.next) == -1) {
  43.697 +				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
  43.698 +				goto update;
  43.699 +			} else {
  43.700 +				offset = left;
  43.701 +				rec->rec_len += leftsize;
  43.702 +			}
  43.703 +		}
  43.704 +	}
  43.705 +
  43.706 +update:
  43.707 +	if (update_tailer(tdb, offset, rec) == -1) {
  43.708 +		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
  43.709 +		goto fail;
  43.710 +	}
  43.711 +
  43.712 +	/* Now, prepend to free list */
  43.713 +	rec->magic = TDB_FREE_MAGIC;
  43.714 +
  43.715 +	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
  43.716 +	    rec_write(tdb, offset, rec) == -1 ||
  43.717 +	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
  43.718 +		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
  43.719 +		goto fail;
  43.720 +	}
  43.721 +
  43.722 +	/* And we're done. */
  43.723 +	tdb_unlock(tdb, -1, F_WRLCK);
  43.724 +	return 0;
  43.725 +
  43.726 + fail:
  43.727 +	tdb_unlock(tdb, -1, F_WRLCK);
  43.728 +	return -1;
  43.729 +}
  43.730 +
  43.731 +
  43.732 +/* expand a file.  we prefer to use ftruncate, as that is what posix
  43.733 +  says to use for mmap expansion */
  43.734 +static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
  43.735 +{
  43.736 +	char buf[1024];
  43.737 +#if HAVE_FTRUNCATE_EXTEND
  43.738 +	if (ftruncate(tdb->fd, size+addition) != 0) {
  43.739 +		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n", 
  43.740 +			   size+addition, strerror(errno)));
  43.741 +		return -1;
  43.742 +	}
  43.743 +#else
  43.744 +	char b = 0;
  43.745 +
  43.746 +#ifdef HAVE_PWRITE
  43.747 +	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
  43.748 +#else
  43.749 +	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 || 
  43.750 +	    write(tdb->fd, &b, 1) != 1) {
  43.751 +#endif
  43.752 +		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n", 
  43.753 +			   size+addition, strerror(errno)));
  43.754 +		return -1;
  43.755 +	}
  43.756 +#endif
  43.757 +
  43.758 +	/* now fill the file with something. This ensures that the file isn't sparse, which would be
  43.759 +	   very bad if we ran out of disk. This must be done with write, not via mmap */
  43.760 +	memset(buf, 0x42, sizeof(buf));
  43.761 +	while (addition) {
  43.762 +		int n = addition>sizeof(buf)?sizeof(buf):addition;
  43.763 +#ifdef HAVE_PWRITE
  43.764 +		int ret = pwrite(tdb->fd, buf, n, size);
  43.765 +#else
  43.766 +		int ret;
  43.767 +		if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
  43.768 +			return -1;
  43.769 +		ret = write(tdb->fd, buf, n);
  43.770 +#endif
  43.771 +		if (ret != n) {
  43.772 +			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n", 
  43.773 +				   n, strerror(errno)));
  43.774 +			return -1;
  43.775 +		}
  43.776 +		addition -= n;
  43.777 +		size += n;
  43.778 +	}
  43.779 +	return 0;
  43.780 +}
  43.781 +
  43.782 +
  43.783 +/* expand the database at least size bytes by expanding the underlying
  43.784 +   file and doing the mmap again if necessary */
  43.785 +static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
  43.786 +{
  43.787 +	struct list_struct rec;
  43.788 +	tdb_off offset;
  43.789 +
  43.790 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
  43.791 +		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
  43.792 +		return -1;
  43.793 +	}
  43.794 +
  43.795 +	/* must know about any previous expansions by another process */
  43.796 +	tdb_oob(tdb, tdb->map_size + 1, 1);
  43.797 +
  43.798 +	/* always make room for at least 10 more records, and round
  43.799 +           the database up to a multiple of TDB_PAGE_SIZE */
  43.800 +	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
  43.801 +
  43.802 +	if (!(tdb->flags & TDB_INTERNAL))
  43.803 +		tdb_munmap(tdb);
  43.804 +
  43.805 +	/*
  43.806 +	 * We must ensure the file is unmapped before doing this
  43.807 +	 * to ensure consistency with systems like OpenBSD where
  43.808 +	 * writes and mmaps are not consistent.
  43.809 +	 */
  43.810 +
  43.811 +	/* expand the file itself */
  43.812 +	if (!(tdb->flags & TDB_INTERNAL)) {
  43.813 +		if (expand_file(tdb, tdb->map_size, size) != 0)
  43.814 +			goto fail;
  43.815 +	}
  43.816 +
  43.817 +	tdb->map_size += size;
  43.818 +
  43.819 +	if (tdb->flags & TDB_INTERNAL) {
  43.820 +		char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
  43.821 +							tdb->map_size);
  43.822 +		if (!new_map_ptr) {
  43.823 +			tdb->map_size -= size;
  43.824 +			goto fail;
  43.825 +		}
  43.826 +		tdb->map_ptr = new_map_ptr;
  43.827 +	} else {
  43.828 +		/*
  43.829 +		 * We must ensure the file is remapped before adding the space
  43.830 +		 * to ensure consistency with systems like OpenBSD where
  43.831 +		 * writes and mmaps are not consistent.
  43.832 +		 */
  43.833 +
  43.834 +		/* We're ok if the mmap fails as we'll fallback to read/write */
  43.835 +		tdb_mmap(tdb);
  43.836 +	}
  43.837 +
  43.838 +	/* form a new freelist record */
  43.839 +	memset(&rec,'\0',sizeof(rec));
  43.840 +	rec.rec_len = size - sizeof(rec);
  43.841 +
  43.842 +	/* link it into the free list */
  43.843 +	offset = tdb->map_size - size;
  43.844 +	if (tdb_free(tdb, offset, &rec) == -1)
  43.845 +		goto fail;
  43.846 +
  43.847 +	tdb_unlock(tdb, -1, F_WRLCK);
  43.848 +	return 0;
  43.849 + fail:
  43.850 +	tdb_unlock(tdb, -1, F_WRLCK);
  43.851 +	return -1;
  43.852 +}
  43.853 +
  43.854 +
  43.855 +/* 
  43.856 +   the core of tdb_allocate - called when we have decided which
  43.857 +   free list entry to use
  43.858 + */
  43.859 +static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
  43.860 +				struct list_struct *rec, tdb_off last_ptr)
  43.861 +{
  43.862 +	struct list_struct newrec;
  43.863 +	tdb_off newrec_ptr;
  43.864 +
  43.865 +	memset(&newrec, '\0', sizeof(newrec));
  43.866 +
  43.867 +	/* found it - now possibly split it up  */
  43.868 +	if (rec->rec_len > length + MIN_REC_SIZE) {
  43.869 +		/* Length of left piece */
  43.870 +		length = TDB_ALIGN(length, TDB_ALIGNMENT);
  43.871 +		
  43.872 +		/* Right piece to go on free list */
  43.873 +		newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
  43.874 +		newrec_ptr = rec_ptr + sizeof(*rec) + length;
  43.875 +		
  43.876 +		/* And left record is shortened */
  43.877 +		rec->rec_len = length;
  43.878 +	} else {
  43.879 +		newrec_ptr = 0;
  43.880 +	}
  43.881 +	
  43.882 +	/* Remove allocated record from the free list */
  43.883 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
  43.884 +		return 0;
  43.885 +	}
  43.886 +	
  43.887 +	/* Update header: do this before we drop alloc
  43.888 +	   lock, otherwise tdb_free() might try to
  43.889 +	   merge with us, thinking we're free.
  43.890 +	   (Thanks Jeremy Allison). */
  43.891 +	rec->magic = TDB_MAGIC;
  43.892 +	if (rec_write(tdb, rec_ptr, rec) == -1) {
  43.893 +		return 0;
  43.894 +	}
  43.895 +	
  43.896 +	/* Did we create new block? */
  43.897 +	if (newrec_ptr) {
  43.898 +		/* Update allocated record tailer (we
  43.899 +		   shortened it). */
  43.900 +		if (update_tailer(tdb, rec_ptr, rec) == -1) {
  43.901 +			return 0;
  43.902 +		}
  43.903 +		
  43.904 +		/* Free new record */
  43.905 +		if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
  43.906 +			return 0;
  43.907 +		}
  43.908 +	}
  43.909 +	
  43.910 +	/* all done - return the new record offset */
  43.911 +	return rec_ptr;
  43.912 +}
  43.913 +
  43.914 +/* allocate some space from the free list. The offset returned points
  43.915 +   to a unconnected list_struct within the database with room for at
  43.916 +   least length bytes of total data
  43.917 +
  43.918 +   0 is returned if the space could not be allocated
  43.919 + */
  43.920 +static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
  43.921 +			    struct list_struct *rec)
  43.922 +{
  43.923 +	tdb_off rec_ptr, last_ptr, newrec_ptr;
  43.924 +	struct {
  43.925 +		tdb_off rec_ptr, last_ptr;
  43.926 +		tdb_len rec_len;
  43.927 +	} bestfit = { 0, 0, 0 };
  43.928 +
  43.929 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
  43.930 +		return 0;
  43.931 +
  43.932 +	/* Extra bytes required for tailer */
  43.933 +	length += sizeof(tdb_off);
  43.934 +
  43.935 + again:
  43.936 +	last_ptr = FREELIST_TOP;
  43.937 +
  43.938 +	/* read in the freelist top */
  43.939 +	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
  43.940 +		goto fail;
  43.941 +
  43.942 +	bestfit.rec_ptr = 0;
  43.943 +
  43.944 +	/* 
  43.945 +	   this is a best fit allocation strategy. Originally we used
  43.946 +	   a first fit strategy, but it suffered from massive fragmentation
  43.947 +	   issues when faced with a slowly increasing record size.
  43.948 +	 */
  43.949 +	while (rec_ptr) {
  43.950 +		if (rec_free_read(tdb, rec_ptr, rec) == -1) {
  43.951 +			goto fail;
  43.952 +		}
  43.953 +
  43.954 +		if (rec->rec_len >= length) {
  43.955 +			if (bestfit.rec_ptr == 0 ||
  43.956 +			    rec->rec_len < bestfit.rec_len) {
  43.957 +				bestfit.rec_len = rec->rec_len;
  43.958 +				bestfit.rec_ptr = rec_ptr;
  43.959 +				bestfit.last_ptr = last_ptr;
  43.960 +				/* consider a fit to be good enough if we aren't wasting more than half the space */
  43.961 +				if (bestfit.rec_len < 2*length) {
  43.962 +					break;
  43.963 +				}
  43.964 +			}
  43.965 +		}
  43.966 +
  43.967 +		/* move to the next record */
  43.968 +		last_ptr = rec_ptr;
  43.969 +		rec_ptr = rec->next;
  43.970 +	}
  43.971 +
  43.972 +	if (bestfit.rec_ptr != 0) {
  43.973 +		if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
  43.974 +			goto fail;
  43.975 +		}
  43.976 +
  43.977 +		newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
  43.978 +		tdb_unlock(tdb, -1, F_WRLCK);
  43.979 +		return newrec_ptr;
  43.980 +	}
  43.981 +
  43.982 +	/* we didn't find enough space. See if we can expand the
  43.983 +	   database and if we can then try again */
  43.984 +	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
  43.985 +		goto again;
  43.986 + fail:
  43.987 +	tdb_unlock(tdb, -1, F_WRLCK);
  43.988 +	return 0;
  43.989 +}
  43.990 +
  43.991 +/* initialise a new database with a specified hash size */
  43.992 +static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
  43.993 +{
  43.994 +	struct tdb_header *newdb;
  43.995 +	int size, ret = -1;
  43.996 +
  43.997 +	/* We make it up in memory, then write it out if not internal */
  43.998 +	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
  43.999 +	if (!(newdb = talloc_zero_size(tdb, size)))
 43.1000 +		return TDB_ERRCODE(TDB_ERR_OOM, -1);
 43.1001 +
 43.1002 +	/* Fill in the header */
 43.1003 +	newdb->version = TDB_VERSION;
 43.1004 +	newdb->hash_size = hash_size;
 43.1005 +	if (tdb->flags & TDB_INTERNAL) {
 43.1006 +		tdb->map_size = size;
 43.1007 +		tdb->map_ptr = (char *)newdb;
 43.1008 +		memcpy(&tdb->header, newdb, sizeof(tdb->header));
 43.1009 +		/* Convert the `ondisk' version if asked. */
 43.1010 +		CONVERT(*newdb);
 43.1011 +		return 0;
 43.1012 +	}
 43.1013 +	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 43.1014 +		goto fail;
 43.1015 +
 43.1016 +	if (ftruncate(tdb->fd, 0) == -1)
 43.1017 +		goto fail;
 43.1018 +
 43.1019 +	/* This creates an endian-converted header, as if read from disk */
 43.1020 +	CONVERT(*newdb);
 43.1021 +	memcpy(&tdb->header, newdb, sizeof(tdb->header));
 43.1022 +	/* Don't endian-convert the magic food! */
 43.1023 +	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 43.1024 +	if (write(tdb->fd, newdb, size) != size)
 43.1025 +		ret = -1;
 43.1026 +	else
 43.1027 +		ret = 0;
 43.1028 +
 43.1029 +  fail:
 43.1030 +	SAFE_FREE(newdb);
 43.1031 +	return ret;
 43.1032 +}
 43.1033 +
 43.1034 +/* Returns 0 on fail.  On success, return offset of record, and fills
 43.1035 +   in rec */
 43.1036 +static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 43.1037 +			struct list_struct *r)
 43.1038 +{
 43.1039 +	tdb_off rec_ptr;
 43.1040 +	
 43.1041 +	/* read in the hash top */
 43.1042 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 43.1043 +		return 0;
 43.1044 +
 43.1045 +	/* keep looking until we find the right record */
 43.1046 +	while (rec_ptr) {
 43.1047 +		if (rec_read(tdb, rec_ptr, r) == -1)
 43.1048 +			return 0;
 43.1049 +
 43.1050 +		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 43.1051 +			/* a very likely hit - read the key */
 43.1052 +			int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
 43.1053 +			if (cmp < 0)
 43.1054 +				return 0;
 43.1055 +			else if (cmp > 0)
 43.1056 +				return rec_ptr;
 43.1057 +		}
 43.1058 +		rec_ptr = r->next;
 43.1059 +	}
 43.1060 +	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 43.1061 +}
 43.1062 +
 43.1063 +/* As tdb_find, but if you succeed, keep the lock */
 43.1064 +static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
 43.1065 +			     struct list_struct *rec)
 43.1066 +{
 43.1067 +	u32 rec_ptr;
 43.1068 +
 43.1069 +	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 43.1070 +		return 0;
 43.1071 +	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 43.1072 +		tdb_unlock(tdb, BUCKET(hash), locktype);
 43.1073 +	return rec_ptr;
 43.1074 +}
 43.1075 +
 43.1076 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
 43.1077 +{
 43.1078 +	return tdb->ecode;
 43.1079 +}
 43.1080 +
 43.1081 +static struct tdb_errname {
 43.1082 +	enum TDB_ERROR ecode; const char *estring;
 43.1083 +} emap[] = { {TDB_SUCCESS, "Success"},
 43.1084 +	     {TDB_ERR_CORRUPT, "Corrupt database"},
 43.1085 +	     {TDB_ERR_IO, "IO Error"},
 43.1086 +	     {TDB_ERR_LOCK, "Locking error"},
 43.1087 +	     {TDB_ERR_OOM, "Out of memory"},
 43.1088 +	     {TDB_ERR_EXISTS, "Record exists"},
 43.1089 +	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
 43.1090 +	     {TDB_ERR_NOEXIST, "Record does not exist"} };
 43.1091 +
 43.1092 +/* Error string for the last tdb error */
 43.1093 +const char *tdb_errorstr(TDB_CONTEXT *tdb)
 43.1094 +{
 43.1095 +	u32 i;
 43.1096 +	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
 43.1097 +		if (tdb->ecode == emap[i].ecode)
 43.1098 +			return emap[i].estring;
 43.1099 +	return "Invalid error code";
 43.1100 +}
 43.1101 +
 43.1102 +/* update an entry in place - this only works if the new data size
 43.1103 +   is <= the old data size and the key exists.
 43.1104 +   on failure return -1.
 43.1105 +*/
 43.1106 +
 43.1107 +static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
 43.1108 +{
 43.1109 +	struct list_struct rec;
 43.1110 +	tdb_off rec_ptr;
 43.1111 +
 43.1112 +	/* find entry */
 43.1113 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 43.1114 +		return -1;
 43.1115 +
 43.1116 +	/* must be long enough key, data and tailer */
 43.1117 +	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
 43.1118 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 43.1119 +		return -1;
 43.1120 +	}
 43.1121 +
 43.1122 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 43.1123 +		      dbuf.dptr, dbuf.dsize) == -1)
 43.1124 +		return -1;
 43.1125 +
 43.1126 +	if (dbuf.dsize != rec.data_len) {
 43.1127 +		/* update size */
 43.1128 +		rec.data_len = dbuf.dsize;
 43.1129 +		return rec_write(tdb, rec_ptr, &rec);
 43.1130 +	}
 43.1131 + 
 43.1132 +	return 0;
 43.1133 +}
 43.1134 +
 43.1135 +/* find an entry in the database given a key */
 43.1136 +/* If an entry doesn't exist tdb_err will be set to
 43.1137 + * TDB_ERR_NOEXIST. If a key has no data attached
 43.1138 + * then the TDB_DATA will have zero length but
 43.1139 + * a non-zero pointer
 43.1140 + */
 43.1141 +
 43.1142 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
 43.1143 +{
 43.1144 +	tdb_off rec_ptr;
 43.1145 +	struct list_struct rec;
 43.1146 +	TDB_DATA ret;
 43.1147 +	u32 hash;
 43.1148 +
 43.1149 +	/* find which hash bucket it is in */
 43.1150 +	hash = tdb->hash_fn(&key);
 43.1151 +	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 43.1152 +		return tdb_null;
 43.1153 +
 43.1154 +	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 43.1155 +				  rec.data_len);
 43.1156 +	ret.dsize = rec.data_len;
 43.1157 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 43.1158 +	return ret;
 43.1159 +}
 43.1160 +
 43.1161 +/* check if an entry in the database exists 
 43.1162 +
 43.1163 +   note that 1 is returned if the key is found and 0 is returned if not found
 43.1164 +   this doesn't match the conventions in the rest of this module, but is
 43.1165 +   compatible with gdbm
 43.1166 +*/
 43.1167 +static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 43.1168 +{
 43.1169 +	struct list_struct rec;
 43.1170 +	
 43.1171 +	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 43.1172 +		return 0;
 43.1173 +	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 43.1174 +	return 1;
 43.1175 +}
 43.1176 +
 43.1177 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
 43.1178 +{
 43.1179 +	u32 hash = tdb->hash_fn(&key);
 43.1180 +	return tdb_exists_hash(tdb, key, hash);
 43.1181 +}
 43.1182 +
 43.1183 +/* record lock stops delete underneath */
 43.1184 +static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
 43.1185 +{
 43.1186 +	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
 43.1187 +}
 43.1188 +/*
 43.1189 +  Write locks override our own fcntl readlocks, so check it here.
 43.1190 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 43.1191 +  an error to fail to get the lock here.
 43.1192 +*/
 43.1193 + 
 43.1194 +static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
 43.1195 +{
 43.1196 +	struct tdb_traverse_lock *i;
 43.1197 +	for (i = &tdb->travlocks; i; i = i->next)
 43.1198 +		if (i->off == off)
 43.1199 +			return -1;
 43.1200 +	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
 43.1201 +}
 43.1202 +
 43.1203 +/*
 43.1204 +  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 43.1205 +  an error to fail to get the lock here.
 43.1206 +*/
 43.1207 +
 43.1208 +static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 43.1209 +{
 43.1210 +	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
 43.1211 +}
 43.1212 +/* fcntl locks don't stack: avoid unlocking someone else's */
 43.1213 +static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
 43.1214 +{
 43.1215 +	struct tdb_traverse_lock *i;
 43.1216 +	u32 count = 0;
 43.1217 +
 43.1218 +	if (off == 0)
 43.1219 +		return 0;
 43.1220 +	for (i = &tdb->travlocks; i; i = i->next)
 43.1221 +		if (i->off == off)
 43.1222 +			count++;
 43.1223 +	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
 43.1224 +}
 43.1225 +
 43.1226 +/* actually delete an entry in the database given the offset */
 43.1227 +static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
 43.1228 +{
 43.1229 +	tdb_off last_ptr, i;
 43.1230 +	struct list_struct lastrec;
 43.1231 +
 43.1232 +	if (tdb->read_only) return -1;
 43.1233 +
 43.1234 +	if (write_lock_record(tdb, rec_ptr) == -1) {
 43.1235 +		/* Someone traversing here: mark it as dead */
 43.1236 +		rec->magic = TDB_DEAD_MAGIC;
 43.1237 +		return rec_write(tdb, rec_ptr, rec);
 43.1238 +	}
 43.1239 +	if (write_unlock_record(tdb, rec_ptr) != 0)
 43.1240 +		return -1;
 43.1241 +
 43.1242 +	/* find previous record in hash chain */
 43.1243 +	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 43.1244 +		return -1;
 43.1245 +	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 43.1246 +		if (rec_read(tdb, i, &lastrec) == -1)
 43.1247 +			return -1;
 43.1248 +
 43.1249 +	/* unlink it: next ptr is at start of record. */
 43.1250 +	if (last_ptr == 0)
 43.1251 +		last_ptr = TDB_HASH_TOP(rec->full_hash);
 43.1252 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 43.1253 +		return -1;
 43.1254 +
 43.1255 +	/* recover the space */
 43.1256 +	if (tdb_free(tdb, rec_ptr, rec) == -1)
 43.1257 +		return -1;
 43.1258 +	return 0;
 43.1259 +}
 43.1260 +
 43.1261 +/* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
 43.1262 +static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
 43.1263 +			 struct list_struct *rec)
 43.1264 +{
 43.1265 +	int want_next = (tlock->off != 0);
 43.1266 +
 43.1267 +	/* Lock each chain from the start one. */
 43.1268 +	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
 43.1269 +
 43.1270 +		/* this is an optimisation for the common case where
 43.1271 +		   the hash chain is empty, which is particularly
 43.1272 +		   common for the use of tdb with ldb, where large
 43.1273 +		   hashes are used. In that case we spend most of our
 43.1274 +		   time in tdb_brlock(), locking empty hash chains.
 43.1275 +
 43.1276 +		   To avoid this, we do an unlocked pre-check to see
 43.1277 +		   if the hash chain is empty before starting to look
 43.1278 +		   inside it. If it is empty then we can avoid that
 43.1279 +		   hash chain. If it isn't empty then we can't believe
 43.1280 +		   the value we get back, as we read it without a
 43.1281 +		   lock, so instead we get the lock and re-fetch the
 43.1282 +		   value below.
 43.1283 +
 43.1284 +		   Notice that not doing this optimisation on the
 43.1285 +		   first hash chain is critical. We must guarantee
 43.1286 +		   that we have done at least one fcntl lock at the
 43.1287 +		   start of a search to guarantee that memory is
 43.1288 +		   coherent on SMP systems. If records are added by
 43.1289 +		   others during the search then thats OK, and we
 43.1290 +		   could possibly miss those with this trick, but we
 43.1291 +		   could miss them anyway without this trick, so the
 43.1292 +		   semantics don't change.
 43.1293 +
 43.1294 +		   With a non-indexed ldb search this trick gains us a
 43.1295 +		   factor of around 80 in speed on a linux 2.6.x
 43.1296 +		   system (testing using ldbtest).
 43.1297 +		 */
 43.1298 +		if (!tlock->off && tlock->hash != 0) {
 43.1299 +			u32 off;
 43.1300 +			if (tdb->map_ptr) {
 43.1301 +				for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {
 43.1302 +					if (0 != *(u32 *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
 43.1303 +						break;
 43.1304 +					}
 43.1305 +				}
 43.1306 +				if (tlock->hash == tdb->header.hash_size) {
 43.1307 +					continue;
 43.1308 +				}
 43.1309 +			} else {
 43.1310 +				if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
 43.1311 +				    off == 0) {
 43.1312 +					continue;
 43.1313 +				}
 43.1314 +			}
 43.1315 +		}
 43.1316 +
 43.1317 +		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
 43.1318 +			return -1;
 43.1319 +
 43.1320 +		/* No previous record?  Start at top of chain. */
 43.1321 +		if (!tlock->off) {
 43.1322 +			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
 43.1323 +				     &tlock->off) == -1)
 43.1324 +				goto fail;
 43.1325 +		} else {
 43.1326 +			/* Otherwise unlock the previous record. */
 43.1327 +			if (unlock_record(tdb, tlock->off) != 0)
 43.1328 +				goto fail;
 43.1329 +		}
 43.1330 +
 43.1331 +		if (want_next) {
 43.1332 +			/* We have offset of old record: grab next */
 43.1333 +			if (rec_read(tdb, tlock->off, rec) == -1)
 43.1334 +				goto fail;
 43.1335 +			tlock->off = rec->next;
 43.1336 +		}
 43.1337 +
 43.1338 +		/* Iterate through chain */
 43.1339 +		while( tlock->off) {
 43.1340 +			tdb_off current;
 43.1341 +			if (rec_read(tdb, tlock->off, rec) == -1)
 43.1342 +				goto fail;
 43.1343 +
 43.1344 +			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
 43.1345 +			if (tlock->off == rec->next) {
 43.1346 +				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
 43.1347 +				goto fail;
 43.1348 +			}
 43.1349 +
 43.1350 +			if (!TDB_DEAD(rec)) {
 43.1351 +				/* Woohoo: we found one! */
 43.1352 +				if (lock_record(tdb, tlock->off) != 0)
 43.1353 +					goto fail;
 43.1354 +				return tlock->off;
 43.1355 +			}
 43.1356 +
 43.1357 +			/* Try to clean dead ones from old traverses */
 43.1358 +			current = tlock->off;
 43.1359 +			tlock->off = rec->next;
 43.1360 +			if (!tdb->read_only && 
 43.1361 +			    do_delete(tdb, current, rec) != 0)
 43.1362 +				goto fail;
 43.1363 +		}
 43.1364 +		tdb_unlock(tdb, tlock->hash, F_WRLCK);
 43.1365 +		want_next = 0;
 43.1366 +	}
 43.1367 +	/* We finished iteration without finding anything */
 43.1368 +	return TDB_ERRCODE(TDB_SUCCESS, 0);
 43.1369 +
 43.1370 + fail:
 43.1371 +	tlock->off = 0;
 43.1372 +	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
 43.1373 +		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
 43.1374 +	return -1;
 43.1375 +}
 43.1376 +
 43.1377 +/* traverse the entire database - calling fn(tdb, key, data) on each element.
 43.1378 +   return -1 on error or the record count traversed
 43.1379 +   if fn is NULL then it is not called
 43.1380 +   a non-zero return value from fn() indicates that the traversal should stop
 43.1381 +  */
 43.1382 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
 43.1383 +{
 43.1384 +	TDB_DATA key, dbuf;
 43.1385 +	struct list_struct rec;
 43.1386 +	struct tdb_traverse_lock tl = { NULL, 0, 0 };
 43.1387 +	int ret, count = 0;
 43.1388 +
 43.1389 +	/* This was in the initializaton, above, but the IRIX compiler
 43.1390 +	 * did not like it.  crh
 43.1391 +	 */
 43.1392 +	tl.next = tdb->travlocks.next;
 43.1393 +
 43.1394 +	/* fcntl locks don't stack: beware traverse inside traverse */
 43.1395 +	tdb->travlocks.next = &tl;
 43.1396 +
 43.1397 +	/* tdb_next_lock places locks on the record returned, and its chain */
 43.1398 +	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
 43.1399 +		count++;
 43.1400 +		/* now read the full record */
 43.1401 +		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec), 
 43.1402 +					  rec.key_len + rec.data_len);
 43.1403 +		if (!key.dptr) {
 43.1404 +			ret = -1;
 43.1405 +			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
 43.1406 +				goto out;
 43.1407 +			if (unlock_record(tdb, tl.off) != 0)
 43.1408 +				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
 43.1409 +			goto out;
 43.1410 +		}
 43.1411 +		key.dsize = rec.key_len;
 43.1412 +		dbuf.dptr = key.dptr + rec.key_len;
 43.1413 +		dbuf.dsize = rec.data_len;
 43.1414 +
 43.1415 +		/* Drop chain lock, call out */
 43.1416 +		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
 43.1417 +			ret = -1;
 43.1418 +			goto out;
 43.1419 +		}
 43.1420 +		if (fn && fn(tdb, key, dbuf, private)) {
 43.1421 +			/* They want us to terminate traversal */
 43.1422 +			ret = count;
 43.1423 +			if (unlock_record(tdb, tl.off) != 0) {
 43.1424 +				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
 43.1425 +				ret = -1;
 43.1426 +			}
 43.1427 +			tdb->travlocks.next = tl.next;
 43.1428 +			SAFE_FREE(key.dptr);
 43.1429 +			return count;
 43.1430 +		}
 43.1431 +		SAFE_FREE(key.dptr);
 43.1432 +	}
 43.1433 +out:
 43.1434 +	tdb->travlocks.next = tl.next;
 43.1435 +	if (ret < 0)
 43.1436 +		return -1;
 43.1437 +	else
 43.1438 +		return count;
 43.1439 +}
 43.1440 +
 43.1441 +/* find the first entry in the database and return its key */
 43.1442 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
 43.1443 +{
 43.1444 +	TDB_DATA key;
 43.1445 +	struct list_struct rec;
 43.1446 +
 43.1447 +	/* release any old lock */
 43.1448 +	if (unlock_record(tdb, tdb->travlocks.off) != 0)
 43.1449 +		return tdb_null;
 43.1450 +	tdb->travlocks.off = tdb->travlocks.hash = 0;
 43.1451 +
 43.1452 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
 43.1453 +		return tdb_null;
 43.1454 +	/* now read the key */
 43.1455 +	key.dsize = rec.key_len;
 43.1456 +	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
 43.1457 +	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
 43.1458 +		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
 43.1459 +	return key;
 43.1460 +}
 43.1461 +
 43.1462 +/* find the next entry in the database, returning its key */
 43.1463 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
 43.1464 +{
 43.1465 +	u32 oldhash;
 43.1466 +	TDB_DATA key = tdb_null;
 43.1467 +	struct list_struct rec;
 43.1468 +	char *k = NULL;
 43.1469 +
 43.1470 +	/* Is locked key the old key?  If so, traverse will be reliable. */
 43.1471 +	if (tdb->travlocks.off) {
 43.1472 +		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
 43.1473 +			return tdb_null;
 43.1474 +		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
 43.1475 +		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
 43.1476 +					    rec.key_len))
 43.1477 +		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
 43.1478 +			/* No, it wasn't: unlock it and start from scratch */
 43.1479 +			if (unlock_record(tdb, tdb->travlocks.off) != 0)
 43.1480 +				return tdb_null;
 43.1481 +			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 43.1482 +				return tdb_null;
 43.1483 +			tdb->travlocks.off = 0;
 43.1484 +		}
 43.1485 +
 43.1486 +		SAFE_FREE(k);
 43.1487 +	}
 43.1488 +
 43.1489 +	if (!tdb->travlocks.off) {
 43.1490 +		/* No previous element: do normal find, and lock record */
 43.1491 +		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
 43.1492 +		if (!tdb->travlocks.off)
 43.1493 +			return tdb_null;
 43.1494 +		tdb->travlocks.hash = BUCKET(rec.full_hash);
 43.1495 +		if (lock_record(tdb, tdb->travlocks.off) != 0) {
 43.1496 +			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
 43.1497 +			return tdb_null;
 43.1498 +		}
 43.1499 +	}
 43.1500 +	oldhash = tdb->travlocks.hash;
 43.1501 +
 43.1502 +	/* Grab next record: locks chain and returned record,
 43.1503 +	   unlocks old record */
 43.1504 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
 43.1505 +		key.dsize = rec.key_len;
 43.1506 +		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
 43.1507 +					  key.dsize);
 43.1508 +		/* Unlock the chain of this new record */
 43.1509 +		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
 43.1510 +			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 43.1511 +	}
 43.1512 +	/* Unlock the chain of old record */
 43.1513 +	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
 43.1514 +		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
 43.1515 +	return key;
 43.1516 +}
 43.1517 +
 43.1518 +/* delete an entry in the database given a key */
 43.1519 +static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
 43.1520 +{
 43.1521 +	tdb_off rec_ptr;
 43.1522 +	struct list_struct rec;
 43.1523 +	int ret;
 43.1524 +
 43.1525 +	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
 43.1526 +		return -1;
 43.1527 +	ret = do_delete(tdb, rec_ptr, &rec);
 43.1528 +	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
 43.1529 +		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
 43.1530 +	return ret;
 43.1531 +}
 43.1532 +
 43.1533 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
 43.1534 +{
 43.1535 +	u32 hash = tdb->hash_fn(&key);
 43.1536 +	return tdb_delete_hash(tdb, key, hash);
 43.1537 +}
 43.1538 +
 43.1539 +/* store an element in the database, replacing any existing element
 43.1540 +   with the same key 
 43.1541 +
 43.1542 +   return 0 on success, -1 on failure
 43.1543 +*/
 43.1544 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 43.1545 +{
 43.1546 +	struct list_struct rec;
 43.1547 +	u32 hash;
 43.1548 +	tdb_off rec_ptr;
 43.1549 +	char *p = NULL;
 43.1550 +	int ret = 0;
 43.1551 +
 43.1552 +	/* find which hash bucket it is in */
 43.1553 +	hash = tdb->hash_fn(&key);
 43.1554 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 43.1555 +		return -1;
 43.1556 +
 43.1557 +	/* check for it existing, on insert. */
 43.1558 +	if (flag == TDB_INSERT) {
 43.1559 +		if (tdb_exists_hash(tdb, key, hash)) {
 43.1560 +			tdb->ecode = TDB_ERR_EXISTS;
 43.1561 +			goto fail;
 43.1562 +		}
 43.1563 +	} else {
 43.1564 +		/* first try in-place update, on modify or replace. */
 43.1565 +		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
 43.1566 +			goto out;
 43.1567 +		if (tdb->ecode == TDB_ERR_NOEXIST &&
 43.1568 +		    flag == TDB_MODIFY) {
 43.1569 +			/* if the record doesn't exist and we are in TDB_MODIFY mode then
 43.1570 +			 we should fail the store */
 43.1571 +			goto fail;
 43.1572 +		}
 43.1573 +	}
 43.1574 +	/* reset the error code potentially set by the tdb_update() */
 43.1575 +	tdb->ecode = TDB_SUCCESS;
 43.1576 +
 43.1577 +	/* delete any existing record - if it doesn't exist we don't
 43.1578 +           care.  Doing this first reduces fragmentation, and avoids
 43.1579 +           coalescing with `allocated' block before it's updated. */
 43.1580 +	if (flag != TDB_INSERT)
 43.1581 +		tdb_delete_hash(tdb, key, hash);
 43.1582 +
 43.1583 +	/* Copy key+value *before* allocating free space in case malloc
 43.1584 +	   fails and we are left with a dead spot in the tdb. */
 43.1585 +
 43.1586 +	if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
 43.1587 +		tdb->ecode = TDB_ERR_OOM;
 43.1588 +		goto fail;
 43.1589 +	}
 43.1590 +
 43.1591 +	memcpy(p, key.dptr, key.dsize);
 43.1592 +	if (dbuf.dsize)
 43.1593 +		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 43.1594 +
 43.1595 +	/* we have to allocate some space */
 43.1596 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
 43.1597 +		goto fail;
 43.1598 +
 43.1599 +	/* Read hash top into next ptr */
 43.1600 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 43.1601 +		goto fail;
 43.1602 +
 43.1603 +	rec.key_len = key.dsize;
 43.1604 +	rec.data_len = dbuf.dsize;
 43.1605 +	rec.full_hash = hash;
 43.1606 +	rec.magic = TDB_MAGIC;
 43.1607 +
 43.1608 +	/* write out and point the top of the hash chain at it */
 43.1609 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 43.1610 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 43.1611 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 43.1612 +		/* Need to tdb_unallocate() here */
 43.1613 +		goto fail;
 43.1614 +	}
 43.1615 + out:
 43.1616 +	SAFE_FREE(p); 
 43.1617 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 43.1618 +	return ret;
 43.1619 +fail:
 43.1620 +	ret = -1;
 43.1621 +	goto out;
 43.1622 +}
 43.1623 +
 43.1624 +/* Attempt to append data to an entry in place - this only works if the key exists
 43.1625 +   and the record has enough spare room for the key, old data, new data and tailer.
 43.1626 +   On failure return -1. Record must be locked before calling.
 43.1627 +*/
 43.1628 +static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
 43.1629 +{
 43.1630 +	struct list_struct rec;
 43.1631 +	tdb_off rec_ptr;
 43.1632 +
 43.1633 +	/* find entry */
 43.1634 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 43.1635 +		return -1;
 43.1636 +
 43.1637 +	/* Append of 0 is always ok. */
 43.1638 +	if (new_dbuf.dsize == 0)
 43.1639 +		return 0;
 43.1640 +
 43.1641 +	/* must be long enough for key, old data + new data and tailer */
 43.1642 +	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
 43.1643 +		/* No room. */
 43.1644 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
 43.1645 +		return -1;
 43.1646 +	}
 43.1647 +
 43.1648 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
 43.1649 +		      new_dbuf.dptr, new_dbuf.dsize) == -1)
 43.1650 +		return -1;
 43.1651 +
 43.1652 +	/* update size */
 43.1653 +	rec.data_len += new_dbuf.dsize;
 43.1654 +	return rec_write(tdb, rec_ptr, &rec);
 43.1655 +}
 43.1656 +
 43.1657 +/* Append to an entry, creating it if it does not exist. */
 43.1658 +
 43.1659 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 43.1660 +{
 43.1661 +	struct list_struct rec;
 43.1662 +	u32 hash;
 43.1663 +	tdb_off rec_ptr;
 43.1664 +	char *p = NULL;
 43.1665 +	int ret = 0;
 43.1666 +	size_t new_data_size = 0;
 43.1667 +
 43.1668 +	/* find which hash bucket it is in */
 43.1669 +	hash = tdb->hash_fn(&key);
 43.1670 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 43.1671 +		return -1;
 43.1672 +
 43.1673 +	/* first try in-place. */
 43.1674 +	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
 43.1675 +		goto out;
 43.1676 +
 43.1677 +	/* reset the error code potentially set by the tdb_append_inplace() */
 43.1678 +	tdb->ecode = TDB_SUCCESS;
 43.1679 +
 43.1680 +	/* find entry */
 43.1681 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
 43.1682 +		if (tdb->ecode != TDB_ERR_NOEXIST)
 43.1683 +			goto fail;
 43.1684 +
 43.1685 +		/* Not found - create. */
 43.1686 +
 43.1687 +		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
 43.1688 +		goto out;
 43.1689 +	}
 43.1690 +
 43.1691 +	new_data_size = rec.data_len + new_dbuf.dsize;
 43.1692 +
 43.1693 +	/* Copy key+old_value+value *before* allocating free space in case malloc
 43.1694 +	   fails and we are left with a dead spot in the tdb. */
 43.1695 +
 43.1696 +	if (!(p = (char *)talloc_size(tdb, key.dsize + new_data_size))) {
 43.1697 +		tdb->ecode = TDB_ERR_OOM;
 43.1698 +		goto fail;
 43.1699 +	}
 43.1700 +
 43.1701 +	/* Copy the key in place. */
 43.1702 +	memcpy(p, key.dptr, key.dsize);
 43.1703 +
 43.1704 +	/* Now read the old data into place. */
 43.1705 +	if (rec.data_len &&
 43.1706 +		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
 43.1707 +			goto fail;
 43.1708 +
 43.1709 +	/* Finally append the new data. */
 43.1710 +	if (new_dbuf.dsize)
 43.1711 +		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
 43.1712 +
 43.1713 +	/* delete any existing record - if it doesn't exist we don't
 43.1714 +           care.  Doing this first reduces fragmentation, and avoids
 43.1715 +           coalescing with `allocated' block before it's updated. */
 43.1716 +
 43.1717 +	tdb_delete_hash(tdb, key, hash);
 43.1718 +
 43.1719 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
 43.1720 +		goto fail;
 43.1721 +
 43.1722 +	/* Read hash top into next ptr */
 43.1723 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 43.1724 +		goto fail;
 43.1725 +
 43.1726 +	rec.key_len = key.dsize;
 43.1727 +	rec.data_len = new_data_size;
 43.1728 +	rec.full_hash = hash;
 43.1729 +	rec.magic = TDB_MAGIC;
 43.1730 +
 43.1731 +	/* write out and point the top of the hash chain at it */
 43.1732 +	if (rec_write(tdb, rec_ptr, &rec) == -1
 43.1733 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
 43.1734 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 43.1735 +		/* Need to tdb_unallocate() here */
 43.1736 +		goto fail;
 43.1737 +	}
 43.1738 +
 43.1739 + out:
 43.1740 +	SAFE_FREE(p); 
 43.1741 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 43.1742 +	return ret;
 43.1743 +
 43.1744 +fail:
 43.1745 +	ret = -1;
 43.1746 +	goto out;
 43.1747 +}
 43.1748 +
 43.1749 +static int tdb_already_open(dev_t device,
 43.1750 +			    ino_t ino)
 43.1751 +{
 43.1752 +	TDB_CONTEXT *i;
 43.1753 +	
 43.1754 +	for (i = tdbs; i; i = i->next) {
 43.1755 +		if (i->device == device && i->inode == ino) {
 43.1756 +			return 1;
 43.1757 +		}
 43.1758 +	}
 43.1759 +
 43.1760 +	return 0;
 43.1761 +}
 43.1762 +
 43.1763 +/* open the database, creating it if necessary 
 43.1764 +
 43.1765 +   The open_flags and mode are passed straight to the open call on the
 43.1766 +   database file. A flags value of O_WRONLY is invalid. The hash size
 43.1767 +   is advisory, use zero for a default value.
 43.1768 +
 43.1769 +   Return is NULL on error, in which case errno is also set.  Don't 
 43.1770 +   try to call tdb_error or tdb_errorstr, just do strerror(errno).
 43.1771 +
 43.1772 +   @param name may be NULL for internal databases. */
 43.1773 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
 43.1774 +		      int open_flags, mode_t mode)
 43.1775 +{
 43.1776 +	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
 43.1777 +}
 43.1778 +
 43.1779 +/* a default logging function */
 43.1780 +static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
 43.1781 +			int level __attribute__((unused)),
 43.1782 +			const char *fmt __attribute__((unused)), ...)
 43.1783 +{
 43.1784 +}
 43.1785 +
 43.1786 +
 43.1787 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
 43.1788 +			 int open_flags, mode_t mode,
 43.1789 +			 tdb_log_func log_fn,
 43.1790 +			 tdb_hash_func hash_fn)
 43.1791 +{
 43.1792 +	TDB_CONTEXT *tdb;
 43.1793 +	struct stat st;
 43.1794 +	int rev = 0, locked = 0;
 43.1795 +	uint8_t *vp;
 43.1796 +	u32 vertest;
 43.1797 +
 43.1798 +	if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
 43.1799 +		/* Can't log this */
 43.1800 +		errno = ENOMEM;
 43.1801 +		goto fail;
 43.1802 +	}
 43.1803 +	tdb->fd = -1;
 43.1804 +	tdb->name = NULL;
 43.1805 +	tdb->map_ptr = NULL;
 43.1806 +	tdb->flags = tdb_flags;
 43.1807 +	tdb->open_flags = open_flags;
 43.1808 +	tdb->log_fn = log_fn?log_fn:null_log_fn;
 43.1809 +	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
 43.1810 +
 43.1811 +	if ((open_flags & O_ACCMODE) == O_WRONLY) {
 43.1812 +		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
 43.1813 +			 name));
 43.1814 +		errno = EINVAL;
 43.1815 +		goto fail;
 43.1816 +	}
 43.1817 +	
 43.1818 +	if (hash_size == 0)
 43.1819 +		hash_size = DEFAULT_HASH_SIZE;
 43.1820 +	if ((open_flags & O_ACCMODE) == O_RDONLY) {
 43.1821 +		tdb->read_only = 1;
 43.1822 +		/* read only databases don't do locking or clear if first */
 43.1823 +		tdb->flags |= TDB_NOLOCK;
 43.1824 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 43.1825 +	}
 43.1826 +
 43.1827 +	/* internal databases don't mmap or lock, and start off cleared */
 43.1828 +	if (tdb->flags & TDB_INTERNAL) {
 43.1829 +		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
 43.1830 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 43.1831 +		if (tdb_new_database(tdb, hash_size) != 0) {
 43.1832 +			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
 43.1833 +			goto fail;
 43.1834 +		}
 43.1835 +		goto internal;
 43.1836 +	}
 43.1837 +
 43.1838 +	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
 43.1839 +		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
 43.1840 +			 name, strerror(errno)));
 43.1841 +		goto fail;	/* errno set by open(2) */
 43.1842 +	}
 43.1843 +
 43.1844 +	/* ensure there is only one process initialising at once */
 43.1845 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
 43.1846 +		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
 43.1847 +			 name, strerror(errno)));
 43.1848 +		goto fail;	/* errno set by tdb_brlock */
 43.1849 +	}
 43.1850 +
 43.1851 +	/* we need to zero database if we are the only one with it open */
 43.1852 +	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
 43.1853 +		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
 43.1854 +		open_flags |= O_CREAT;
 43.1855 +		if (ftruncate(tdb->fd, 0) == -1) {
 43.1856 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 43.1857 +				 "failed to truncate %s: %s\n",
 43.1858 +				 name, strerror(errno)));
 43.1859 +			goto fail; /* errno set by ftruncate */
 43.1860 +		}
 43.1861 +	}
 43.1862 +
 43.1863 +	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
 43.1864 +	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
 43.1865 +	    || (tdb->header.version != TDB_VERSION
 43.1866 +		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
 43.1867 +		/* it's not a valid database - possibly initialise it */
 43.1868 +		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
 43.1869 +			errno = EIO; /* ie bad format or something */
 43.1870 +			goto fail;
 43.1871 +		}
 43.1872 +		rev = (tdb->flags & TDB_CONVERT);
 43.1873 +	}
 43.1874 +	vp = (uint8_t *)&tdb->header.version;
 43.1875 +	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
 43.1876 +		  (((u32)vp[2]) << 8) | (u32)vp[3];
 43.1877 +	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
 43.1878 +	if (!rev)
 43.1879 +		tdb->flags &= ~TDB_CONVERT;
 43.1880 +	else {
 43.1881 +		tdb->flags |= TDB_CONVERT;
 43.1882 +		convert(&tdb->header, sizeof(tdb->header));
 43.1883 +	}
 43.1884 +	if (fstat(tdb->fd, &st) == -1)
 43.1885 +		goto fail;
 43.1886 +
 43.1887 +	/* Is it already in the open list?  If so, fail. */
 43.1888 +	if (tdb_already_open(st.st_dev, st.st_ino)) {
 43.1889 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 43.1890 +			 "%s (%d,%d) is already open in this process\n",
 43.1891 +			 name, (int)st.st_dev, (int)st.st_ino));
 43.1892 +		errno = EBUSY;
 43.1893 +		goto fail;
 43.1894 +	}
 43.1895 +
 43.1896 +	if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
 43.1897 +		errno = ENOMEM;
 43.1898 +		goto fail;
 43.1899 +	}
 43.1900 +
 43.1901 +	tdb->map_size = st.st_size;
 43.1902 +	tdb->device = st.st_dev;
 43.1903 +	tdb->inode = st.st_ino;
 43.1904 +	tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
 43.1905 +					tdb->header.hash_size+1);
 43.1906 +	if (!tdb->locked) {
 43.1907 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
 43.1908 +			 "failed to allocate lock structure for %s\n",
 43.1909 +			 name));
 43.1910 +		errno = ENOMEM;
 43.1911 +		goto fail;
 43.1912 +	}
 43.1913 +	tdb_mmap(tdb);
 43.1914 +	if (locked) {
 43.1915 +		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
 43.1916 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
 43.1917 +				 "failed to take ACTIVE_LOCK on %s: %s\n",
 43.1918 +				 name, strerror(errno)));
 43.1919 +			goto fail;
 43.1920 +		}
 43.1921 +
 43.1922 +	}
 43.1923 +
 43.1924 +	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
 43.1925 +	   we didn't get the initial exclusive lock as we need to let all other
 43.1926 +	   users know we're using it. */
 43.1927 +
 43.1928 +	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
 43.1929 +	/* leave this lock in place to indicate it's in use */
 43.1930 +	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
 43.1931 +		goto fail;
 43.1932 +	}
 43.1933 +
 43.1934 +
 43.1935 + internal:
 43.1936 +	/* Internal (memory-only) databases skip all the code above to
 43.1937 +	 * do with disk files, and resume here by releasing their
 43.1938 +	 * global lock and hooking into the active list. */
 43.1939 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
 43.1940 +		goto fail;
 43.1941 +	tdb->next = tdbs;
 43.1942 +	tdbs = tdb;
 43.1943 +	return tdb;
 43.1944 +
 43.1945 + fail:
 43.1946 +	{ int save_errno = errno;
 43.1947 +
 43.1948 +	if (!tdb)
 43.1949 +		return NULL;
 43.1950 +	
 43.1951 +	if (tdb->map_ptr) {
 43.1952 +		if (tdb->flags & TDB_INTERNAL)
 43.1953 +			SAFE_FREE(tdb->map_ptr);
 43.1954 +		else
 43.1955 +			tdb_munmap(tdb);
 43.1956 +	}
 43.1957 +	SAFE_FREE(tdb->name);
 43.1958 +	if (tdb->fd != -1)
 43.1959 +		if (close(tdb->fd) != 0)
 43.1960 +			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
 43.1961 +	SAFE_FREE(tdb->locked);
 43.1962 +	SAFE_FREE(tdb);
 43.1963 +	errno = save_errno;
 43.1964 +	return NULL;
 43.1965 +	}
 43.1966 +}
 43.1967 +
 43.1968 +/**
 43.1969 + * Close a database.
 43.1970 + *
 43.1971 + * @returns -1 for error; 0 for success.
 43.1972 + **/
 43.1973 +int tdb_close(TDB_CONTEXT *tdb)
 43.1974 +{
 43.1975 +	TDB_CONTEXT **i;
 43.1976 +	int ret = 0;
 43.1977 +
 43.1978 +	if (tdb->map_ptr) {
 43.1979 +		if (tdb->flags & TDB_INTERNAL)
 43.1980 +			SAFE_FREE(tdb->map_ptr);
 43.1981 +		else
 43.1982 +			tdb_munmap(tdb);
 43.1983 +	}
 43.1984 +	SAFE_FREE(tdb->name);
 43.1985 +	if (tdb->fd != -1)
 43.1986 +		ret = close(tdb->fd);
 43.1987 +	SAFE_FREE(tdb->locked);
 43.1988 +
 43.1989 +	/* Remove from contexts list */
 43.1990 +	for (i = &tdbs; *i; i = &(*i)->next) {
 43.1991 +		if (*i == tdb) {
 43.1992 +			*i = tdb->next;
 43.1993 +			break;
 43.1994 +		}
 43.1995 +	}
 43.1996 +
 43.1997 +	memset(tdb, 0, sizeof(*tdb));
 43.1998 +	SAFE_FREE(tdb);
 43.1999 +
 43.2000 +	return ret;
 43.2001 +}
 43.2002 +
 43.2003 +/* lock/unlock entire database */
 43.2004 +int tdb_lockall(TDB_CONTEXT *tdb)
 43.2005 +{
 43.2006 +	u32 i;
 43.2007 +
 43.2008 +	/* There are no locks on read-only dbs */
 43.2009 +	if (tdb->read_only)
 43.2010 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 43.2011 +	for (i = 0; i < tdb->header.hash_size; i++) 
 43.2012 +		if (tdb_lock(tdb, i, F_WRLCK))
 43.2013 +			break;
 43.2014 +
 43.2015 +	/* If error, release locks we have... */
 43.2016 +	if (i < tdb->header.hash_size) {
 43.2017 +		u32 j;
 43.2018 +
 43.2019 +		for ( j = 0; j < i; j++)
 43.2020 +			tdb_unlock(tdb, j, F_WRLCK);
 43.2021 +		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
 43.2022 +	}
 43.2023 +
 43.2024 +	return 0;
 43.2025 +}
 43.2026 +void tdb_unlockall(TDB_CONTEXT *tdb)
 43.2027 +{
 43.2028 +	u32 i;
 43.2029 +	for (i=0; i < tdb->header.hash_size; i++)
 43.2030 +		tdb_unlock(tdb, i, F_WRLCK);
 43.2031 +}
 43.2032 +
 43.2033 +/* lock/unlock one hash chain. This is meant to be used to reduce
 43.2034 +   contention - it cannot guarantee how many records will be locked */
 43.2035 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
 43.2036 +{
 43.2037 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 43.2038 +}
 43.2039 +
 43.2040 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
 43.2041 +{
 43.2042 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 43.2043 +}
 43.2044 +
 43.2045 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 43.2046 +{
 43.2047 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 43.2048 +}
 43.2049 +
 43.2050 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
 43.2051 +{
 43.2052 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 43.2053 +}
 43.2054 +
 43.2055 +
 43.2056 +/* register a logging function */
 43.2057 +void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
 43.2058 +{
 43.2059 +	tdb->log_fn = fn?fn:null_log_fn;
 43.2060 +}
 43.2061 +
 43.2062 +
 43.2063 +/* reopen a tdb - this can be used after a fork to ensure that we have an independent
 43.2064 +   seek pointer from our parent and to re-establish locks */
 43.2065 +int tdb_reopen(TDB_CONTEXT *tdb)
 43.2066 +{
 43.2067 +	struct stat st;
 43.2068 +
 43.2069 +	if (tdb->flags & TDB_INTERNAL)
 43.2070 +		return 0; /* Nothing to do. */
 43.2071 +	if (tdb_munmap(tdb) != 0) {
 43.2072 +		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
 43.2073 +		goto fail;
 43.2074 +	}
 43.2075 +	if (close(tdb->fd) != 0)
 43.2076 +		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
 43.2077 +	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
 43.2078 +	if (tdb->fd == -1) {
 43.2079 +		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
 43.2080 +		goto fail;
 43.2081 +	}
 43.2082 +	if (fstat(tdb->fd, &st) != 0) {
 43.2083 +		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
 43.2084 +		goto fail;
 43.2085 +	}
 43.2086 +	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
 43.2087 +		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
 43.2088 +		goto fail;
 43.2089 +	}
 43.2090 +	tdb_mmap(tdb);
 43.2091 +	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
 43.2092 +		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
 43.2093 +		goto fail;
 43.2094 +	}
 43.2095 +
 43.2096 +	return 0;
 43.2097 +
 43.2098 +fail:
 43.2099 +	tdb_close(tdb);
 43.2100 +	return -1;
 43.2101 +}
 43.2102 +
 43.2103 +/* Not general: only works if single writer. */
 43.2104 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile)
 43.2105 +{
 43.2106 +	int fd, saved_errno;
 43.2107 +	TDB_CONTEXT *copy;
 43.2108 +
 43.2109 +	fd = open(outfile, O_TRUNC|O_CREAT|O_WRONLY, 0640);
 43.2110 +	if (fd < 0)
 43.2111 +		return NULL;
 43.2112 +	if (tdb->map_ptr) {
 43.2113 +		if (write(fd,tdb->map_ptr,tdb->map_size) != (int)tdb->map_size)
 43.2114 +			goto fail;
 43.2115 +	} else {
 43.2116 +		char buf[65536];
 43.2117 +		int r;
 43.2118 +
 43.2119 +		lseek(tdb->fd, 0, SEEK_SET);
 43.2120 +		while ((r = read(tdb->fd, buf, sizeof(buf))) > 0) {
 43.2121 +			if (write(fd, buf, r) != r)
 43.2122 +				goto fail;
 43.2123 +		}
 43.2124 +		if (r < 0)
 43.2125 +			goto fail;
 43.2126 +	}
 43.2127 +	copy = tdb_open(outfile, 0, 0, O_RDWR, 0);
 43.2128 +	if (!copy)
 43.2129 +		goto fail;
 43.2130 +	close(fd);
 43.2131 +	return copy;
 43.2132 +
 43.2133 +fail:
 43.2134 +	saved_errno = errno;
 43.2135 +	close(fd);
 43.2136 +	unlink(outfile);
 43.2137 +	errno = saved_errno;
 43.2138 +	return NULL;
 43.2139 +}
 43.2140 +
 43.2141 +/* reopen all tdb's */
 43.2142 +int tdb_reopen_all(void)
 43.2143 +{
 43.2144 +	TDB_CONTEXT *tdb;
 43.2145 +
 43.2146 +	for (tdb=tdbs; tdb; tdb = tdb->next) {
 43.2147 +		/* Ensure no clear-if-first. */
 43.2148 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
 43.2149 +		if (tdb_reopen(tdb) != 0)
 43.2150 +			return -1;
 43.2151 +	}
 43.2152 +
 43.2153 +	return 0;
 43.2154 +}
    44.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    44.2 +++ b/tools/xenstore/tdb.h	Mon Sep 26 11:07:49 2005 -0600
    44.3 @@ -0,0 +1,157 @@
    44.4 +#ifndef __TDB_H__
    44.5 +#define __TDB_H__
    44.6 +
    44.7 +/* 
    44.8 +   Unix SMB/CIFS implementation.
    44.9 +
   44.10 +   trivial database library
   44.11 +
   44.12 +   Copyright (C) Andrew Tridgell 1999-2004
   44.13 +   
   44.14 +     ** NOTE! The following LGPL license applies to the tdb
   44.15 +     ** library. This does NOT imply that all of Samba is released
   44.16 +     ** under the LGPL
   44.17 +   
   44.18 +   This library is free software; you can redistribute it and/or
   44.19 +   modify it under the terms of the GNU Lesser General Public
   44.20 +   License as published by the Free Software Foundation; either
   44.21 +   version 2 of the License, or (at your option) any later version.
   44.22 +
   44.23 +   This library is distributed in the hope that it will be useful,
   44.24 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
   44.25 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   44.26 +   Lesser General Public License for more details.
   44.27 +
   44.28 +   You should have received a copy of the GNU Lesser General Public
   44.29 +   License along with this library; if not, write to the Free Software
   44.30 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   44.31 +*/
   44.32 +
   44.33 +#ifdef  __cplusplus
   44.34 +extern "C" {
   44.35 +#endif
   44.36 +
   44.37 +
   44.38 +/* flags to tdb_store() */
   44.39 +#define TDB_REPLACE 1
   44.40 +#define TDB_INSERT 2
   44.41 +#define TDB_MODIFY 3
   44.42 +
   44.43 +/* flags for tdb_open() */
   44.44 +#define TDB_DEFAULT 0 /* just a readability place holder */
   44.45 +#define TDB_CLEAR_IF_FIRST 1
   44.46 +#define TDB_INTERNAL 2 /* don't store on disk */
   44.47 +#define TDB_NOLOCK   4 /* don't do any locking */
   44.48 +#define TDB_NOMMAP   8 /* don't use mmap */
   44.49 +#define TDB_CONVERT 16 /* convert endian (internal use) */
   44.50 +#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
   44.51 +
   44.52 +#define TDB_ERRCODE(code, ret) ((tdb->ecode = (code)), ret)
   44.53 +
   44.54 +/* error codes */
   44.55 +enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
   44.56 +		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
   44.57 +		TDB_ERR_NOEXIST};
   44.58 +
   44.59 +#ifndef u32
   44.60 +#define u32 unsigned
   44.61 +#endif
   44.62 +
   44.63 +typedef struct TDB_DATA {
   44.64 +	char *dptr;
   44.65 +	size_t dsize;
   44.66 +} TDB_DATA;
   44.67 +
   44.68 +typedef u32 tdb_len;
   44.69 +typedef u32 tdb_off;
   44.70 +
   44.71 +/* this is stored at the front of every database */
   44.72 +struct tdb_header {
   44.73 +	char magic_food[32]; /* for /etc/magic */
   44.74 +	u32 version; /* version of the code */
   44.75 +	u32 hash_size; /* number of hash entries */
   44.76 +	tdb_off rwlocks;
   44.77 +	tdb_off reserved[31];
   44.78 +};
   44.79 +
   44.80 +struct tdb_lock_type {
   44.81 +	u32 count;
   44.82 +	u32 ltype;
   44.83 +};
   44.84 +
   44.85 +struct tdb_traverse_lock {
   44.86 +	struct tdb_traverse_lock *next;
   44.87 +	u32 off;
   44.88 +	u32 hash;
   44.89 +};
   44.90 +
   44.91 +#ifndef PRINTF_ATTRIBUTE
   44.92 +#define PRINTF_ATTRIBUTE(a,b)
   44.93 +#endif
   44.94 +
   44.95 +/* this is the context structure that is returned from a db open */
   44.96 +typedef struct tdb_context {
   44.97 +	char *name; /* the name of the database */
   44.98 +	void *map_ptr; /* where it is currently mapped */
   44.99 +	int fd; /* open file descriptor for the database */
  44.100 +	tdb_len map_size; /* how much space has been mapped */
  44.101 +	int read_only; /* opened read-only */
  44.102 +	struct tdb_lock_type *locked; /* array of chain locks */
  44.103 +	enum TDB_ERROR ecode; /* error code for last tdb error */
  44.104 +	struct tdb_header header; /* a cached copy of the header */
  44.105 +	u32 flags; /* the flags passed to tdb_open */
  44.106 +	struct tdb_traverse_lock travlocks; /* current traversal locks */
  44.107 +	struct tdb_context *next; /* all tdbs to avoid multiple opens */
  44.108 +	dev_t device;	/* uniquely identifies this tdb */
  44.109 +	ino_t inode;	/* uniquely identifies this tdb */
  44.110 +	void (*log_fn)(struct tdb_context *tdb, int level, const char *, ...) PRINTF_ATTRIBUTE(3,4); /* logging function */
  44.111 +	u32 (*hash_fn)(TDB_DATA *key);
  44.112 +	int open_flags; /* flags used in the open - needed by reopen */
  44.113 +} TDB_CONTEXT;
  44.114 +
  44.115 +typedef int (*tdb_traverse_func)(TDB_CONTEXT *, TDB_DATA, TDB_DATA, void *);
  44.116 +typedef void (*tdb_log_func)(TDB_CONTEXT *, int , const char *, ...);
  44.117 +typedef u32 (*tdb_hash_func)(TDB_DATA *key);
  44.118 +
  44.119 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
  44.120 +		      int open_flags, mode_t mode);
  44.121 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
  44.122 +			 int open_flags, mode_t mode,
  44.123 +			 tdb_log_func log_fn,
  44.124 +			 tdb_hash_func hash_fn);
  44.125 +
  44.126 +int tdb_reopen(TDB_CONTEXT *tdb);
  44.127 +int tdb_reopen_all(void);
  44.128 +void tdb_logging_function(TDB_CONTEXT *tdb, tdb_log_func);
  44.129 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb);
  44.130 +const char *tdb_errorstr(TDB_CONTEXT *tdb);
  44.131 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key);
  44.132 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key);
  44.133 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
  44.134 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf);
  44.135 +int tdb_close(TDB_CONTEXT *tdb);
  44.136 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb);
  44.137 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key);
  44.138 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *);
  44.139 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);
  44.140 +int tdb_lockall(TDB_CONTEXT *tdb);
  44.141 +void tdb_unlockall(TDB_CONTEXT *tdb);
  44.142 +
  44.143 +/* Low level locking functions: use with care */
  44.144 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key);
  44.145 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key);
  44.146 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  44.147 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
  44.148 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile);
  44.149 +
  44.150 +/* Debug functions. Not used in production. */
  44.151 +void tdb_dump_all(TDB_CONTEXT *tdb);
  44.152 +int tdb_printfreelist(TDB_CONTEXT *tdb);
  44.153 +
  44.154 +extern TDB_DATA tdb_null;
  44.155 +
  44.156 +#ifdef  __cplusplus
  44.157 +}
  44.158 +#endif
  44.159 +
  44.160 +#endif /* tdb.h */
    45.1 --- a/tools/xenstore/testsuite/04rm.test	Fri Sep 23 15:41:28 2005 -0600
    45.2 +++ b/tools/xenstore/testsuite/04rm.test	Mon Sep 26 11:07:49 2005 -0600
    45.3 @@ -6,6 +6,8 @@ rm /dir/test
    45.4  # Create file and remove it
    45.5  write /test contents
    45.6  rm /test
    45.7 +expect tool
    45.8 +dir /
    45.9  
   45.10  # Create directory and remove it.
   45.11  mkdir /dir
   45.12 @@ -15,3 +17,4 @@ rm /dir
   45.13  mkdir /dir
   45.14  write /dir/test contents
   45.15  rm /dir
   45.16 +
    46.1 --- a/tools/xenstore/testsuite/08transaction.slowtest	Fri Sep 23 15:41:28 2005 -0600
    46.2 +++ b/tools/xenstore/testsuite/08transaction.slowtest	Mon Sep 26 11:07:49 2005 -0600
    46.3 @@ -1,21 +1,43 @@
    46.4 -# Test transaction timeouts.  Take a second each.
    46.5 +# Test transaction clashes.
    46.6  
    46.7  mkdir /test
    46.8  write /test/entry1 contents
    46.9  
   46.10 -# Transactions can take as long as the want...
   46.11 -start /test
   46.12 -sleep 1100
   46.13 -rm /test/entry1
   46.14 -commit
   46.15 -dir /test
   46.16 +# Start transaction, do read-only op, transaction succeeds
   46.17 +1 start
   46.18 +1 write /test/entry1 contents2
   46.19 +expect contents
   46.20 +read /test/entry1
   46.21 +1 commit
   46.22 +expect contents2
   46.23 +read /test/entry1
   46.24  
   46.25 -# ... as long as noone is waiting.
   46.26 -1 start /test
   46.27 -notimeout
   46.28 -2 mkdir /test/dir
   46.29 -1 mkdir /test/dir
   46.30 -expect 1:dir
   46.31 -1 dir /test
   46.32 -expect 1: commit failed: Connection timed out
   46.33 +# Start transaction, abort other transaction, transaction succeeds.
   46.34 +1 start
   46.35 +1 write /test/entry1 contents3
   46.36 +start
   46.37 +write /test/entry1 contents
   46.38 +abort
   46.39  1 commit
   46.40 +expect contents3
   46.41 +read /test/entry1
   46.42 +
   46.43 +# Start transaction, do write op, transaction fails
   46.44 +1 start
   46.45 +1 write /test/entry1 contents4
   46.46 +write /test/entry1 contents
   46.47 +expect 1: commit failed: Resource temporarily unavailable
   46.48 +1 commit
   46.49 +expect contents
   46.50 +read /test/entry1
   46.51 +
   46.52 +# Start transaction, do other transaction, transaction fails
   46.53 +1 start
   46.54 +1 write /test/entry1 contents4
   46.55 +start
   46.56 +write /test/entry1 contents5
   46.57 +commit
   46.58 +expect 1: commit failed: Resource temporarily unavailable
   46.59 +1 commit
   46.60 +expect contents5
   46.61 +read /test/entry1
    47.1 --- a/tools/xenstore/testsuite/08transaction.test	Fri Sep 23 15:41:28 2005 -0600
    47.2 +++ b/tools/xenstore/testsuite/08transaction.test	Mon Sep 26 11:07:49 2005 -0600
    47.3 @@ -3,7 +3,7 @@
    47.4  mkdir /test
    47.5  
    47.6  # Simple transaction: create a file inside transaction.
    47.7 -1 start /test
    47.8 +1 start
    47.9  1 write /test/entry1 contents
   47.10  2 dir /test
   47.11  expect 1:entry1
   47.12 @@ -15,7 +15,7 @@ 2 read /test/entry1
   47.13  rm /test/entry1
   47.14  
   47.15  # Create a file and abort transaction.
   47.16 -1 start /test
   47.17 +1 start
   47.18  1 write /test/entry1 contents
   47.19  2 dir /test
   47.20  expect 1:entry1
   47.21 @@ -25,7 +25,7 @@ 2 dir /test
   47.22  
   47.23  write /test/entry1 contents
   47.24  # Delete in transaction, commit
   47.25 -1 start /test
   47.26 +1 start
   47.27  1 rm /test/entry1
   47.28  expect 2:entry1
   47.29  2 dir /test
   47.30 @@ -35,7 +35,7 @@ 2 dir /test
   47.31  
   47.32  # Delete in transaction, abort.
   47.33  write /test/entry1 contents
   47.34 -1 start /test
   47.35 +1 start
   47.36  1 rm /test/entry1
   47.37  expect 2:entry1
   47.38  2 dir /test
   47.39 @@ -47,7 +47,7 @@ 2 dir /test
   47.40  # Events inside transactions don't trigger watches until (successful) commit.
   47.41  mkdir /test/dir
   47.42  1 watch /test token
   47.43 -2 start /test
   47.44 +2 start
   47.45  2 mkdir /test/dir/sub
   47.46  expect 1: waitwatch failed: Connection timed out
   47.47  1 waitwatch
   47.48 @@ -55,7 +55,7 @@ 2 close
   47.49  1 close
   47.50  
   47.51  1 watch /test token
   47.52 -2 start /test
   47.53 +2 start
   47.54  2 mkdir /test/dir/sub
   47.55  2 abort
   47.56  expect 1: waitwatch failed: Connection timed out
   47.57 @@ -63,7 +63,7 @@ 1 waitwatch
   47.58  1 close
   47.59  
   47.60  1 watch /test token
   47.61 -2 start /test
   47.62 +2 start
   47.63  2 mkdir /test/dir/sub
   47.64  2 commit
   47.65  expect 1:/test/dir/sub:token
   47.66 @@ -73,7 +73,7 @@ 1 close
   47.67  
   47.68  # Rm inside transaction works like rm outside: children get notified.
   47.69  1 watch /test/dir/sub token
   47.70 -2 start /test
   47.71 +2 start
   47.72  2 rm /test/dir
   47.73  2 commit
   47.74  expect 1:/test/dir/sub:token
   47.75 @@ -83,7 +83,7 @@ 1 close
   47.76  
   47.77  # Multiple events from single transaction don't trigger assert
   47.78  1 watch /test token
   47.79 -2 start /test
   47.80 +2 start
   47.81  2 write /test/1 contents
   47.82  2 write /test/2 contents
   47.83  2 commit
    48.1 --- a/tools/xenstore/testsuite/12readonly.test	Fri Sep 23 15:41:28 2005 -0600
    48.2 +++ b/tools/xenstore/testsuite/12readonly.test	Mon Sep 26 11:07:49 2005 -0600
    48.3 @@ -13,23 +13,23 @@ expect 0 READ
    48.4  getperm /test
    48.5  watch /test token
    48.6  unwatch /test token 
    48.7 -start /
    48.8 +start
    48.9  commit
   48.10 -start /
   48.11 +start
   48.12  abort
   48.13  
   48.14  # These don't work
   48.15 -expect write failed: Read-only file system
   48.16 +expect write failed: Permission denied
   48.17  write /test2 contents
   48.18 -expect write failed: Read-only file system
   48.19 +expect write failed: Permission denied
   48.20  write /test contents
   48.21 -expect setperm failed: Read-only file system
   48.22 +expect setperm failed: Permission denied
   48.23  setperm /test 100 NONE
   48.24 -expect setperm failed: Read-only file system
   48.25 +expect setperm failed: Permission denied
   48.26  setperm /test 100 NONE
   48.27 -expect shutdown failed: Read-only file system
   48.28 +expect shutdown failed: Permission denied
   48.29  shutdown
   48.30 -expect introduce failed: Read-only file system
   48.31 +expect introduce failed: Permission denied
   48.32  introduce 1 100 7 /home
   48.33  
   48.34  # Check that watches work like normal.
    49.1 --- a/tools/xenstore/testsuite/14complexperms.test	Fri Sep 23 15:41:28 2005 -0600
    49.2 +++ b/tools/xenstore/testsuite/14complexperms.test	Mon Sep 26 11:07:49 2005 -0600
    49.3 @@ -33,14 +33,6 @@ unwatch /dir/file token
    49.4  expect *No such file or directory
    49.5  unwatch /dir/file token 
    49.6  expect *Permission denied
    49.7 -start /dir/file
    49.8 -expect *No such file or directory
    49.9 -abort
   49.10 -expect *Permission denied
   49.11 -start /dir/file
   49.12 -expect *No such file or directory
   49.13 -commit
   49.14 -expect *Permission denied
   49.15  introduce 2 100 7 /dir/file
   49.16  
   49.17  # Now it exists
   49.18 @@ -73,12 +65,4 @@ unwatch /dir/file token
   49.19  expect *No such file or directory
   49.20  unwatch /dir/file token 
   49.21  expect *Permission denied
   49.22 -start /dir/file
   49.23 -expect *No such file or directory
   49.24 -abort
   49.25 -expect *Permission denied
   49.26 -start /dir/file
   49.27 -expect *No such file or directory
   49.28 -commit
   49.29 -expect *Permission denied
   49.30  introduce 2 100 7 /dir/file
    50.1 --- a/tools/xenstore/testsuite/16block-watch-crash.test	Fri Sep 23 15:41:28 2005 -0600
    50.2 +++ b/tools/xenstore/testsuite/16block-watch-crash.test	Mon Sep 26 11:07:49 2005 -0600
    50.3 @@ -1,13 +1,14 @@
    50.4  # Test case where blocked connection gets sent watch.
    50.5  
    50.6 -mkdir /test
    50.7 -watch /test token
    50.8 -1 start /test
    50.9 -# This will block on above
   50.10 -noackwrite /test/entry contents
   50.11 -1 write /test/entry2 contents
   50.12 -1 commit
   50.13 -readack
   50.14 -expect /test/entry2:token
   50.15 -waitwatch
   50.16 -ackwatch token
   50.17 +# FIXME: We no longer block connections 
   50.18 +# mkdir /test
   50.19 +# watch /test token
   50.20 +# 1 start
   50.21 +# # This will block on above
   50.22 +# noackwrite /test/entry contents
   50.23 +# 1 write /test/entry2 contents
   50.24 +# 1 commit
   50.25 +# readack
   50.26 +# expect /test/entry2:token
   50.27 +# waitwatch
   50.28 +# ackwatch token
    51.1 --- a/tools/xenstore/xenstore_client.c	Fri Sep 23 15:41:28 2005 -0600
    51.2 +++ b/tools/xenstore/xenstore_client.c	Mon Sep 26 11:07:49 2005 -0600
    51.3 @@ -14,6 +14,7 @@
    51.4  #include <stdlib.h>
    51.5  #include <string.h>
    51.6  #include <xs.h>
    51.7 +#include <errno.h>
    51.8  
    51.9  static void
   51.10  usage(const char *progname)
   51.11 @@ -82,8 +83,8 @@ main(int argc, char **argv)
   51.12      }
   51.13  #endif
   51.14  
   51.15 -    /* XXX maybe find longest common prefix */
   51.16 -    success = xs_transaction_start(xsh, "/");
   51.17 +  again:
   51.18 +    success = xs_transaction_start(xsh);
   51.19      if (!success)
   51.20  	errx(1, "couldn't start transaction");
   51.21  
   51.22 @@ -145,8 +146,10 @@ main(int argc, char **argv)
   51.23  
   51.24   out:
   51.25      success = xs_transaction_end(xsh, ret ? true : false);
   51.26 -    if (!success)
   51.27 +    if (!success) {
   51.28 +	if (ret == 0 && errno == EAGAIN)
   51.29 +	    goto again;
   51.30  	errx(1, "couldn't end transaction");
   51.31 -
   51.32 +    }
   51.33      return ret;
   51.34  }
    52.1 --- a/tools/xenstore/xenstored.h	Fri Sep 23 15:41:28 2005 -0600
    52.2 +++ b/tools/xenstore/xenstored.h	Mon Sep 26 11:07:49 2005 -0600
    52.3 @@ -75,7 +75,7 @@ static struct xsd_errors xsd_errors[] __
    52.4  	XSD_ERROR(ENOSYS),
    52.5  	XSD_ERROR(EROFS),
    52.6  	XSD_ERROR(EBUSY),
    52.7 -	XSD_ERROR(ETIMEDOUT),
    52.8 +	XSD_ERROR(EAGAIN),
    52.9  	XSD_ERROR(EISCONN),
   52.10  };
   52.11  struct xsd_sockmsg
    53.1 --- a/tools/xenstore/xenstored_core.c	Fri Sep 23 15:41:28 2005 -0600
    53.2 +++ b/tools/xenstore/xenstored_core.c	Mon Sep 26 11:07:49 2005 -0600
    53.3 @@ -50,10 +50,12 @@
    53.4  #include "xenstored_transaction.h"
    53.5  #include "xenstored_domain.h"
    53.6  #include "xenctrl.h"
    53.7 +#include "tdb.h"
    53.8  
    53.9  static bool verbose;
   53.10  LIST_HEAD(connections);
   53.11  static int tracefd = -1;
   53.12 +static TDB_CONTEXT *tdb_ctx;
   53.13  
   53.14  #ifdef TESTING
   53.15  static bool failtest = false;
   53.16 @@ -126,6 +128,23 @@ void __attribute__((noreturn)) corrupt(s
   53.17  	_exit(2);
   53.18  }
   53.19  
   53.20 +TDB_CONTEXT *tdb_context(struct connection *conn)
   53.21 +{
   53.22 +	/* conn = NULL used in manual_node at setup. */
   53.23 +	if (!conn || !conn->transaction)
   53.24 +		return tdb_ctx;
   53.25 +	return tdb_transaction_context(conn->transaction);
   53.26 +}
   53.27 +
   53.28 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
   53.29 +{
   53.30 +	if (rename(newname, xs_daemon_tdb()) != 0)
   53.31 +		return false;
   53.32 +	tdb_close(tdb_ctx);
   53.33 +	tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
   53.34 +	return true;
   53.35 +}
   53.36 +
   53.37  static char *sockmsg_string(enum xsd_sockmsg_type type)
   53.38  {
   53.39  	switch (type) {
   53.40 @@ -202,37 +221,6 @@ void trace_destroy(const void *data, con
   53.41  	write(tracefd, string, strlen(string));
   53.42  }
   53.43  
   53.44 -void trace_watch_timeout(const struct connection *conn, const char *node, const char *token)
   53.45 -{
   53.46 -	char string[64];
   53.47 -	if (tracefd < 0)
   53.48 -		return;
   53.49 -	write(tracefd, "WATCH_TIMEOUT ", strlen("WATCH_TIMEOUT "));
   53.50 -	sprintf(string, " %p ", conn);
   53.51 -	write(tracefd, string, strlen(string));
   53.52 -	write(tracefd, " (", 2);
   53.53 -	write(tracefd, node, strlen(node));
   53.54 -	write(tracefd, " ", 1);
   53.55 -	write(tracefd, token, strlen(token));
   53.56 -	write(tracefd, ")\n", 2);
   53.57 -}
   53.58 -
   53.59 -static void trace_blocked(const struct connection *conn,
   53.60 -			  const struct buffered_data *data)
   53.61 -{
   53.62 -	char string[64];
   53.63 -
   53.64 -	if (tracefd < 0)
   53.65 -		return;
   53.66 -
   53.67 -	write(tracefd, "BLOCKED", strlen("BLOCKED"));
   53.68 -	sprintf(string, " %p (", conn);
   53.69 -	write(tracefd, string, strlen(string));
   53.70 -	write(tracefd, sockmsg_string(data->hdr.msg.type),
   53.71 -	      strlen(sockmsg_string(data->hdr.msg.type)));
   53.72 -	write(tracefd, ")\n", 2);
   53.73 -}
   53.74 -
   53.75  void trace(const char *fmt, ...)
   53.76  {
   53.77  	va_list arglist;
   53.78 @@ -253,7 +241,6 @@ static bool write_message(struct connect
   53.79  	int ret;
   53.80  	struct buffered_data *out = conn->out;
   53.81  
   53.82 -	assert(conn->state != BLOCKED);
   53.83  	if (out->inhdr) {
   53.84  		if (verbose)
   53.85  			xprintf("Writing msg %s (%s) out to %p\n",
   53.86 @@ -351,24 +338,6 @@ static int initialize_set(fd_set *inset,
   53.87  	return max;
   53.88  }
   53.89  
   53.90 -/* Read everything from a talloc_open'ed fd. */
   53.91 -void *read_all(int *fd, unsigned int *size)
   53.92 -{
   53.93 -	unsigned int max = 4;
   53.94 -	int ret;
   53.95 -	void *buffer = talloc_size(fd, max);
   53.96 -
   53.97 -	*size = 0;
   53.98 -	while ((ret = read(*fd, buffer + *size, max - *size)) > 0) {
   53.99 -		*size += ret;
  53.100 -		if (*size == max)
  53.101 -			buffer = talloc_realloc_size(fd, buffer, max *= 2);
  53.102 -	}
  53.103 -	if (ret < 0)
  53.104 -		return NULL;
  53.105 -	return buffer;
  53.106 -}
  53.107 -
  53.108  static int destroy_fd(void *_fd)
  53.109  {
  53.110  	int *fd = _fd;
  53.111 @@ -409,42 +378,167 @@ bool is_child(const char *child, const c
  53.112  	return child[len] == '/' || child[len] == '\0';
  53.113  }
  53.114  
  53.115 -/* Answer never ends in /. */
  53.116 -char *node_dir_outside_transaction(const char *node)
  53.117 +/* If it fails, returns NULL and sets errno. */
  53.118 +static struct node *read_node(struct connection *conn, const char *name)
  53.119  {
  53.120 -	if (streq(node, "/"))
  53.121 -		return talloc_strdup(node, xs_daemon_store());
  53.122 -	return talloc_asprintf(node, "%s%s", xs_daemon_store(), node);
  53.123 -}
  53.124 +	TDB_DATA key, data;
  53.125 +	u32 *p;
  53.126 +	struct node *node;
  53.127  
  53.128 -static char *node_dir(struct transaction *trans, const char *node)
  53.129 -{
  53.130 -	if (!trans || !within_transaction(trans, node))
  53.131 -		return node_dir_outside_transaction(node);
  53.132 -	return node_dir_inside_transaction(trans, node);
  53.133 +	key.dptr = (void *)name;
  53.134 +	key.dsize = strlen(name);
  53.135 +	data = tdb_fetch(tdb_context(conn), key);
  53.136 +
  53.137 +	if (data.dptr == NULL) {
  53.138 +		if (tdb_error(tdb_context(conn)) == TDB_ERR_NOEXIST)
  53.139 +			errno = ENOENT;
  53.140 +		else
  53.141 +			errno = EIO;
  53.142 +		return NULL;
  53.143 +	}
  53.144 +
  53.145 +	node = talloc(name, struct node);
  53.146 +	node->name = talloc_strdup(node, name);
  53.147 +	node->parent = NULL;
  53.148 +	node->tdb = tdb_context(conn);
  53.149 +	talloc_steal(node, data.dptr);
  53.150 +
  53.151 +	/* Number of permissions, datalen, childlen */
  53.152 +	p = (u32 *)data.dptr;
  53.153 +	node->num_perms = p[0];
  53.154 +	node->datalen = p[1];
  53.155 +	node->childlen = p[2];
  53.156 +
  53.157 +	/* Permissions are struct xs_permissions. */
  53.158 +	node->perms = (void *)&p[3];
  53.159 +	/* Data is binary blob (usually ascii, no nul). */
  53.160 +	node->data = node->perms + node->num_perms;
  53.161 +	/* Children is strings, nul separated. */
  53.162 +	node->children = node->data + node->datalen;
  53.163 +
  53.164 +	return node;
  53.165  }
  53.166  
  53.167 -static char *datafile(const char *dir)
  53.168 +static bool write_node(struct connection *conn, const struct node *node)
  53.169  {
  53.170 -	return talloc_asprintf(dir, "%s/.data", dir);
  53.171 -}
  53.172 +	TDB_DATA key, data;
  53.173 +	void *p;
  53.174  
  53.175 -static char *node_datafile(struct transaction *trans, const char *node)
  53.176 -{
  53.177 -	return datafile(node_dir(trans, node));
  53.178 +	key.dptr = (void *)node->name;
  53.179 +	key.dsize = strlen(node->name);
  53.180 +
  53.181 +	data.dsize = 3*sizeof(u32)
  53.182 +		+ node->num_perms*sizeof(node->perms[0])
  53.183 +		+ node->datalen + node->childlen;
  53.184 +	data.dptr = talloc_size(node, data.dsize);
  53.185 +	((u32 *)data.dptr)[0] = node->num_perms;
  53.186 +	((u32 *)data.dptr)[1] = node->datalen;
  53.187 +	((u32 *)data.dptr)[2] = node->childlen;
  53.188 +	p = data.dptr + 3 * sizeof(u32);
  53.189 +
  53.190 +	memcpy(p, node->perms, node->num_perms*sizeof(node->perms[0]));
  53.191 +	p += node->num_perms*sizeof(node->perms[0]);
  53.192 +	memcpy(p, node->data, node->datalen);
  53.193 +	p += node->datalen;
  53.194 +	memcpy(p, node->children, node->childlen);
  53.195 +
  53.196 +	/* TDB should set errno, but doesn't even set ecode AFAICT. */
  53.197 +	if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
  53.198 +		errno = ENOSPC;
  53.199 +		return false;
  53.200 +	}
  53.201 +	return true;
  53.202  }
  53.203  
  53.204 -static char *permfile(const char *dir)
  53.205 +static enum xs_perm_type perm_for_conn(struct connection *conn,
  53.206 +				       struct xs_permissions *perms,
  53.207 +				       unsigned int num)
  53.208  {
  53.209 -	return talloc_asprintf(dir, "%s/.perms", dir);
  53.210 +	unsigned int i;
  53.211 +	enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  53.212 +
  53.213 +	if (!conn->can_write)
  53.214 +		mask &= ~XS_PERM_WRITE;
  53.215 +
  53.216 +	/* Owners and tools get it all... */
  53.217 +	if (!conn->id || perms[0].id == conn->id)
  53.218 +		return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
  53.219 +
  53.220 +	for (i = 1; i < num; i++)
  53.221 +		if (perms[i].id == conn->id)
  53.222 +			return perms[i].perms & mask;
  53.223 +
  53.224 +	return perms[0].perms & mask;
  53.225  }
  53.226  
  53.227 -static char *node_permfile(struct transaction *trans, const char *node)
  53.228 +static char *get_parent(const char *node)
  53.229  {
  53.230 -	return permfile(node_dir(trans, node));
  53.231 +	char *slash = strrchr(node + 1, '/');
  53.232 +	if (!slash)
  53.233 +		return talloc_strdup(node, "/");
  53.234 +	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  53.235  }
  53.236  
  53.237 -struct buffered_data *new_buffer(void *ctx)
  53.238 +/* What do parents say? */
  53.239 +static enum xs_perm_type ask_parents(struct connection *conn, const char *name)
  53.240 +{
  53.241 +	struct node *node;
  53.242 +
  53.243 +	do {
  53.244 +		name = get_parent(name);
  53.245 +		node = read_node(conn, name);
  53.246 +		if (node)
  53.247 +			break;
  53.248 +	} while (!streq(name, "/"));
  53.249 +
  53.250 +	/* No permission at root?  We're in trouble. */
  53.251 +	if (!node)
  53.252 +		corrupt(conn, "No permissions file at root");
  53.253 +
  53.254 +	return perm_for_conn(conn, node->perms, node->num_perms);
  53.255 +}
  53.256 +
  53.257 +/* We have a weird permissions system.  You can allow someone into a
  53.258 + * specific node without allowing it in the parents.  If it's going to
  53.259 + * fail, however, we don't want the errno to indicate any information
  53.260 + * about the node. */
  53.261 +static int errno_from_parents(struct connection *conn, const char *node,
  53.262 +			      int errnum, enum xs_perm_type perm)
  53.263 +{
  53.264 +	/* We always tell them about memory failures. */
  53.265 +	if (errnum == ENOMEM)
  53.266 +		return errnum;
  53.267 +
  53.268 +	if (ask_parents(conn, node) & perm)
  53.269 +		return errnum;
  53.270 +	return EACCES;
  53.271 +}
  53.272 +
  53.273 +/* If it fails, returns NULL and sets errno. */
  53.274 +struct node *get_node(struct connection *conn,
  53.275 +		      const char *name,
  53.276 +		      enum xs_perm_type perm)
  53.277 +{
  53.278 +	struct node *node;
  53.279 +
  53.280 +	if (!name || !is_valid_nodename(name)) {
  53.281 +		errno = EINVAL;
  53.282 +		return NULL;
  53.283 +	}
  53.284 +	node = read_node(conn, name);
  53.285 +	/* If we don't have permission, we don't have node. */
  53.286 +	if (node) {
  53.287 +		if ((perm_for_conn(conn, node->perms, node->num_perms) & perm)
  53.288 +		    != perm)
  53.289 +			node = NULL;
  53.290 +	}
  53.291 +	/* Clean up errno if they weren't supposed to know. */
  53.292 +	if (!node) 
  53.293 +		errno = errno_from_parents(conn, name, errno, perm);
  53.294 +	return node;
  53.295 +}
  53.296 +
  53.297 +static struct buffered_data *new_buffer(void *ctx)
  53.298  {
  53.299  	struct buffered_data *data;
  53.300  
  53.301 @@ -457,7 +551,8 @@ struct buffered_data *new_buffer(void *c
  53.302  }
  53.303  
  53.304  /* Return length of string (including nul) at this offset. */
  53.305 -unsigned int get_string(const struct buffered_data *data, unsigned int offset)
  53.306 +static unsigned int get_string(const struct buffered_data *data,
  53.307 +			       unsigned int offset)
  53.308  {
  53.309  	const char *nul;
  53.310  
  53.311 @@ -508,7 +603,6 @@ void send_reply(struct connection *conn,
  53.312  		conn->waiting_reply = bdata;
  53.313  	} else
  53.314  		conn->out = bdata;
  53.315 -	assert(conn->state != BLOCKED);
  53.316  	conn->state = BUSY;
  53.317  }
  53.318  
  53.319 @@ -567,29 +661,6 @@ static const char *onearg(struct buffere
  53.320  	return in->buffer;
  53.321  }
  53.322  
  53.323 -/* If it fails, returns NULL and sets errno. */
  53.324 -static struct xs_permissions *get_perms(const char *dir, unsigned int *num)
  53.325 -{
  53.326 -	unsigned int size;
  53.327 -	char *strings;
  53.328 -	struct xs_permissions *ret;
  53.329 -	int *fd;
  53.330 -
  53.331 -	fd = talloc_open(permfile(dir), O_RDONLY, 0);
  53.332 -	if (!fd)
  53.333 -		return NULL;
  53.334 -	strings = read_all(fd, &size);
  53.335 -	if (!strings)
  53.336 -		return NULL;
  53.337 -
  53.338 -	*num = xs_count_strings(strings, size);
  53.339 -	ret = talloc_array(dir, struct xs_permissions, *num);
  53.340 -	if (!xs_strings_to_perms(ret, *num, strings))
  53.341 -		corrupt(NULL, "Permissions corrupt for %s", dir);
  53.342 -
  53.343 -	return ret;
  53.344 -}
  53.345 -
  53.346  static char *perms_to_strings(const void *ctx,
  53.347  			      struct xs_permissions *perms, unsigned int num,
  53.348  			      unsigned int *len)
  53.349 @@ -610,173 +681,6 @@ static char *perms_to_strings(const void
  53.350  	return strings;
  53.351  }
  53.352  
  53.353 -/* Destroy this, and its children, and its children's children. */
  53.354 -int destroy_path(void *path)
  53.355 -{
  53.356 -	DIR *dir;
  53.357 -	struct dirent *dirent;
  53.358 -
  53.359 -	dir = opendir(path);
  53.360 -	if (!dir) {
  53.361 -		if (unlink(path) == 0 || errno == ENOENT)
  53.362 -			return 0;
  53.363 -		corrupt(NULL, "Destroying path %s", path);
  53.364 -	}
  53.365 -
  53.366 -	while ((dirent = readdir(dir)) != NULL) {
  53.367 -		char fullpath[strlen(path) + 1 + strlen(dirent->d_name) + 1];
  53.368 -		sprintf(fullpath, "%s/%s", (char *)path, dirent->d_name);
  53.369 -		if (!streq(dirent->d_name,".") && !streq(dirent->d_name,".."))
  53.370 -			destroy_path(fullpath);
  53.371 -	}
  53.372 -	closedir(dir);
  53.373 -	if (rmdir(path) != 0)
  53.374 -		corrupt(NULL, "Destroying directory %s", path);
  53.375 -	return 0;
  53.376 -}
  53.377 -
  53.378 -/* Create a self-destructing temporary path */
  53.379 -static char *temppath(const char *path)
  53.380 -{
  53.381 -	char *tmppath = talloc_asprintf(path, "%s.tmp", path);
  53.382 -	talloc_set_destructor(tmppath, destroy_path);
  53.383 -	return tmppath;
  53.384 -}
  53.385 -
  53.386 -/* Create a self-destructing temporary file */
  53.387 -static char *tempfile(const char *path, void *contents, unsigned int len)
  53.388 -{
  53.389 -	int *fd;
  53.390 -	char *tmppath = temppath(path);
  53.391 -
  53.392 -	fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  53.393 -	if (!fd)
  53.394 -		return NULL;
  53.395 -	if (!xs_write_all(*fd, contents, len))
  53.396 -		return NULL;
  53.397 -
  53.398 -	return tmppath;
  53.399 -}
  53.400 -
  53.401 -static int destroy_opendir(void *_dir)
  53.402 -{
  53.403 -	DIR **dir = _dir;
  53.404 -	closedir(*dir);
  53.405 -	return 0;
  53.406 -}
  53.407 -
  53.408 -/* Return a pointer to a DIR*, self-closing and attached to this pathname. */
  53.409 -DIR **talloc_opendir(const char *pathname)
  53.410 -{
  53.411 -	DIR **dir;
  53.412 -
  53.413 -	dir = talloc(pathname, DIR *);
  53.414 -	*dir = opendir(pathname);
  53.415 -	if (!*dir) {
  53.416 -		int saved_errno = errno;
  53.417 -		talloc_free(dir);
  53.418 -		errno = saved_errno;
  53.419 -		return NULL;
  53.420 -	}
  53.421 -	talloc_set_destructor(dir, destroy_opendir);
  53.422 -	return dir;
  53.423 -}
  53.424 -
  53.425 -/* We assume rename() doesn't fail on moves in same dir. */
  53.426 -static void commit_tempfile(const char *path)
  53.427 -{
  53.428 -	char realname[strlen(path) + 1];
  53.429 -	unsigned int len = strrchr(path, '.') - path;
  53.430 -
  53.431 -	memcpy(realname, path, len);
  53.432 -	realname[len] = '\0';
  53.433 -	if (rename(path, realname) != 0)
  53.434 -		corrupt(NULL, "Committing %s", realname);
  53.435 -	talloc_set_destructor(path, NULL);
  53.436 -}
  53.437 -
  53.438 -static bool set_perms(struct transaction *transaction,
  53.439 -		      const char *node,
  53.440 -		      struct xs_permissions *perms, unsigned int num)
  53.441 -{
  53.442 -	unsigned int len;
  53.443 -	char *permpath, *strings;
  53.444 -
  53.445 -	strings = perms_to_strings(node, perms, num, &len);
  53.446 -	if (!strings)
  53.447 -		return false;
  53.448 -
  53.449 -	/* Create then move. */
  53.450 -	permpath = tempfile(node_permfile(transaction, node), strings, len);
  53.451 -	if (!permpath)
  53.452 -		return false;
  53.453 -
  53.454 -	commit_tempfile(permpath);
  53.455 -	return true;
  53.456 -}
  53.457 -
  53.458 -static char *get_parent(const char *node)
  53.459 -{
  53.460 -	char *slash = strrchr(node + 1, '/');
  53.461 -	if (!slash)
  53.462 -		return talloc_strdup(node, "/");
  53.463 -	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  53.464 -}
  53.465 -
  53.466 -static enum xs_perm_type perm_for_id(domid_t id,
  53.467 -				     struct xs_permissions *perms,
  53.468 -				     unsigned int num)
  53.469 -{
  53.470 -	unsigned int i;
  53.471 -
  53.472 -	/* Owners and tools get it all... */
  53.473 -	if (!id || perms[0].id == id)
  53.474 -		return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  53.475 -
  53.476 -	for (i = 1; i < num; i++)
  53.477 -		if (perms[i].id == id)
  53.478 -			return perms[i].perms;
  53.479 -
  53.480 -	return perms[0].perms;
  53.481 -}
  53.482 -
  53.483 -/* What do parents say? */
  53.484 -static enum xs_perm_type ask_parents(struct connection *conn,
  53.485 -				     const char *node)
  53.486 -{
  53.487 -	struct xs_permissions *perms;
  53.488 -	unsigned int num;
  53.489 -
  53.490 -	do {
  53.491 -		node = get_parent(node);
  53.492 -		perms = get_perms(node_dir(conn->transaction, node), &num);
  53.493 -		if (perms)
  53.494 -			break;
  53.495 -	} while (!streq(node, "/"));
  53.496 -
  53.497 -	/* No permission at root?  We're in trouble. */
  53.498 -	if (!perms)
  53.499 -		corrupt(conn, "No permissions file at root");
  53.500 -
  53.501 -	return perm_for_id(conn->id, perms, num);
  53.502 -}
  53.503 -
  53.504 -/* We have a weird permissions system.  You can allow someone into a
  53.505 - * specific node without allowing it in the parents.  If it's going to
  53.506 - * fail, however, we don't want the errno to indicate any information
  53.507 - * about the node. */
  53.508 -static int errno_from_parents(struct connection *conn, const char *node,
  53.509 -			      int errnum)
  53.510 -{
  53.511 -	/* We always tell them about memory failures. */
  53.512 -	if (errnum == ENOMEM)
  53.513 -		return errnum;
  53.514 -
  53.515 -	if (ask_parents(conn, node) & XS_PERM_READ)
  53.516 -		return errnum;
  53.517 -	return EACCES;
  53.518 -}
  53.519 -
  53.520  char *canonicalize(struct connection *conn, const char *node)
  53.521  {
  53.522  	const char *prefix;
  53.523 @@ -789,46 +693,6 @@ char *canonicalize(struct connection *co
  53.524  	return (char *)node;
  53.525  }
  53.526  
  53.527 -bool check_node_perms(struct connection *conn, const char *node,
  53.528 -		      enum xs_perm_type perm)
  53.529 -{
  53.530 -	struct xs_permissions *perms;
  53.531 -	unsigned int num;
  53.532 -
  53.533 -	if (!node || !is_valid_nodename(node)) {
  53.534 -		errno = EINVAL;
  53.535 -		return false;
  53.536 -	}
  53.537 -
  53.538 -	if (!conn->can_write && (perm & XS_PERM_WRITE)) {
  53.539 -		errno = EROFS;
  53.540 -		return false;
  53.541 -	}
  53.542 -
  53.543 -	perms = get_perms(node_dir(conn->transaction, node), &num);
  53.544 -
  53.545 -	if (perms) {
  53.546 -		if (perm_for_id(conn->id, perms, num) & perm)
  53.547 -			return true;
  53.548 -		errno = EACCES;
  53.549 -		return false;
  53.550 -	}
  53.551 -
  53.552 -	/* If it's OK not to exist, we consult parents. */
  53.553 -	if (errno == ENOENT && (perm & XS_PERM_ENOENT_OK)) {
  53.554 -		if (ask_parents(conn, node) & perm)
  53.555 -			return true;
  53.556 -		/* Parents say they should not know. */
  53.557 -		errno = EACCES;
  53.558 -		return false;
  53.559 -	}
  53.560 -
  53.561 -	/* They might not have permission to even *see* this node, in
  53.562 -	 * which case we return EACCES even if it's ENOENT or EIO. */
  53.563 -	errno = errno_from_parents(conn, node, errno);
  53.564 -	return false;
  53.565 -}
  53.566 -
  53.567  bool check_event_node(const char *node)
  53.568  {
  53.569  	if (!node || !strstarts(node, "@")) {
  53.570 @@ -838,142 +702,144 @@ bool check_event_node(const char *node)
  53.571  	return true;
  53.572  }
  53.573  
  53.574 -static void send_directory(struct connection *conn, const char *node)
  53.575 +static void send_directory(struct connection *conn, const char *name)
  53.576  {
  53.577 -	char *path, *reply;
  53.578 -	unsigned int reply_len = 0;
  53.579 -	DIR **dir;
  53.580 -	struct dirent *dirent;
  53.581 -
  53.582 -	node = canonicalize(conn, node);
  53.583 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  53.584 -		send_error(conn, errno);
  53.585 -		return;
  53.586 -	}
  53.587 -
  53.588 -	path = node_dir(conn->transaction, node);
  53.589 -	dir = talloc_opendir(path);
  53.590 -	if (!dir) {
  53.591 -		send_error(conn, errno);
  53.592 -		return;
  53.593 -	}
  53.594 -
  53.595 -	reply = talloc_strdup(node, "");
  53.596 -	while ((dirent = readdir(*dir)) != NULL) {
  53.597 -		int len = strlen(dirent->d_name) + 1;
  53.598 +	struct node *node;
  53.599  
  53.600 -		if (!valid_chars(dirent->d_name))
  53.601 -			continue;
  53.602 -
  53.603 -		reply = talloc_realloc(path, reply, char, reply_len + len);
  53.604 -		strcpy(reply + reply_len, dirent->d_name);
  53.605 -		reply_len += len;
  53.606 -	}
  53.607 -
  53.608 -	send_reply(conn, XS_DIRECTORY, reply, reply_len);
  53.609 -}
  53.610 -
  53.611 -static void do_read(struct connection *conn, const char *node)
  53.612 -{
  53.613 -	char *value;
  53.614 -	unsigned int size;
  53.615 -	int *fd;
  53.616 -
  53.617 -	node = canonicalize(conn, node);
  53.618 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  53.619 -		send_error(conn, errno);
  53.620 -		return;
  53.621 -	}
  53.622 -
  53.623 -	fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
  53.624 -	if (!fd) {
  53.625 -		/* Data file doesn't exist?  We call that a directory */
  53.626 -		if (errno == ENOENT)
  53.627 -			errno = EISDIR;
  53.628 +	name = canonicalize(conn, name);
  53.629 +	node = get_node(conn, name, XS_PERM_READ);
  53.630 +	if (!node) {
  53.631  		send_error(conn, errno);
  53.632  		return;
  53.633  	}
  53.634  
  53.635 -	value = read_all(fd, &size);
  53.636 -	if (!value)
  53.637 -		send_error(conn, errno);
  53.638 -	else
  53.639 -		send_reply(conn, XS_READ, value, size);
  53.640 -}
  53.641 -
  53.642 -/* Commit this directory, eg. comitting a/b.tmp/c causes a/b.tmp -> a.b */
  53.643 -static bool commit_dir(char *dir)
  53.644 -{
  53.645 -	char *dot, *slash, *dest;
  53.646 -
  53.647 -	dot = strrchr(dir, '.');
  53.648 -	slash = strchr(dot, '/');
  53.649 -	if (slash)
  53.650 -		*slash = '\0';
  53.651 -
  53.652 -	dest = talloc_asprintf(dir, "%.*s", (int)(dot - dir), dir);
  53.653 -	return rename(dir, dest) == 0;
  53.654 +	send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
  53.655  }
  53.656  
  53.657 -/* Create a temporary directory.  Put data in it (if data != NULL) */
  53.658 -static char *tempdir(struct connection *conn,
  53.659 -		     const char *node, void *data, unsigned int datalen)
  53.660 +static void do_read(struct connection *conn, const char *name)
  53.661  {
  53.662 -	struct xs_permissions *perms;
  53.663 -	char *permstr;
  53.664 -	unsigned int num, len;
  53.665 -	int *fd;
  53.666 -	char *dir;
  53.667 +	struct node *node;
  53.668  
  53.669 -	dir = temppath(node_dir(conn->transaction, node));
  53.670 -	if (mkdir(dir, 0750) != 0) {
  53.671 -		if (errno != ENOENT)
  53.672 -			return NULL;
  53.673 -
  53.674 -		dir = tempdir(conn, get_parent(node), NULL, 0);
  53.675 -		if (!dir)
  53.676 -			return NULL;
  53.677 -
  53.678 -		dir = talloc_asprintf(dir, "%s%s", dir, strrchr(node, '/'));
  53.679 -		if (mkdir(dir, 0750) != 0)
  53.680 -			return NULL;
  53.681 -		talloc_set_destructor(dir, destroy_path);
  53.682 +	name = canonicalize(conn, name);
  53.683 +	node = get_node(conn, name, XS_PERM_READ);
  53.684 +	if (!node) {
  53.685 +		send_error(conn, errno);
  53.686 +		return;
  53.687  	}
  53.688  
  53.689 -	perms = get_perms(get_parent(dir), &num);
  53.690 -	assert(perms);
  53.691 -	/* Domains own what they create. */
  53.692 +	send_reply(conn, XS_READ, node->data, node->datalen);
  53.693 +}
  53.694 +
  53.695 +static void delete_node_single(struct connection *conn, struct node *node)
  53.696 +{
  53.697 +	TDB_DATA key;
  53.698 +
  53.699 +	key.dptr = (void *)node->name;
  53.700 +	key.dsize = strlen(node->name);
  53.701 +
  53.702 +	if (tdb_delete(tdb_context(conn), key) != 0)
  53.703 +		corrupt(conn, "Could not delete '%s'", node->name);
  53.704 +}
  53.705 +
  53.706 +/* Must not be / */
  53.707 +static char *basename(const char *name)
  53.708 +{
  53.709 +	return strrchr(name, '/') + 1;
  53.710 +}
  53.711 +
  53.712 +static struct node *construct_node(struct connection *conn, const char *name)
  53.713 +{
  53.714 +	const char *base;
  53.715 +	unsigned int baselen;
  53.716 +	struct node *parent, *node;
  53.717 +	char *children, *parentname = get_parent(name);
  53.718 +
  53.719 +	/* If parent doesn't exist, create it. */
  53.720 +	parent = read_node(conn, parentname);
  53.721 +	if (!parent)
  53.722 +		parent = construct_node(conn, parentname);
  53.723 +	if (!parent)
  53.724 +		return NULL;
  53.725 +	
  53.726 +	/* Add child to parent. */
  53.727 +	base = basename(name);
  53.728 +	baselen = strlen(base) + 1;
  53.729 +	children = talloc_array(name, char, parent->childlen + baselen);
  53.730 +	memcpy(children, parent->children, parent->childlen);
  53.731 +	memcpy(children + parent->childlen, base, baselen);
  53.732 +	parent->children = children;
  53.733 +	parent->childlen += baselen;
  53.734 +
  53.735 +	/* Allocate node */
  53.736 +	node = talloc(name, struct node);
  53.737 +	node->tdb = tdb_context(conn);
  53.738 +	node->name = talloc_strdup(node, name);
  53.739 +
  53.740 +	/* Inherit permissions, except domains own what they create */
  53.741 +	node->num_perms = parent->num_perms;
  53.742 +	node->perms = talloc_memdup(node, parent->perms,
  53.743 +				    node->num_perms * sizeof(node->perms[0]));
  53.744  	if (conn->id)
  53.745 -		perms->id = conn->id;
  53.746 +		node->perms[0].id = conn->id;
  53.747  
  53.748 -	permstr = perms_to_strings(dir, perms, num, &len);
  53.749 -	fd = talloc_open(permfile(dir), O_WRONLY|O_CREAT|O_EXCL, 0640);
  53.750 -	if (!fd || !xs_write_all(*fd, permstr, len))
  53.751 +	/* No children, no data */
  53.752 +	node->children = node->data = NULL;
  53.753 +	node->childlen = node->datalen = 0;
  53.754 +	node->parent = parent;
  53.755 +	return node;
  53.756 +}
  53.757 +
  53.758 +static int destroy_node(void *_node)
  53.759 +{
  53.760 +	struct node *node = _node;
  53.761 +	TDB_DATA key;
  53.762 +
  53.763 +	if (streq(node->name, "/"))
  53.764 +		corrupt(NULL, "Destroying root node!");
  53.765 +
  53.766 +	key.dptr = (void *)node->name;
  53.767 +	key.dsize = strlen(node->name);
  53.768 +
  53.769 +	tdb_delete(node->tdb, key);
  53.770 +	return 0;
  53.771 +}
  53.772 +
  53.773 +/* Be careful: create hierarchy, put entry in existing parent *last*.
  53.774 + * This helps fsck if we die during this. */
  53.775 +static struct node *create_node(struct connection *conn, 
  53.776 +				const char *name,
  53.777 +				void *data, unsigned int datalen)
  53.778 +{
  53.779 +	struct node *node, *i;
  53.780 +
  53.781 +	node = construct_node(conn, name);
  53.782 +	if (!node)
  53.783  		return NULL;
  53.784  
  53.785 -	if (data) {
  53.786 -		char *datapath = datafile(dir);
  53.787 +	node->data = data;
  53.788 +	node->datalen = datalen;
  53.789  
  53.790 -		fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  53.791 -		if (!fd || !xs_write_all(*fd, data, datalen))
  53.792 +	/* We write out the nodes down, setting destructor in case
  53.793 +	 * something goes wrong. */
  53.794 +	for (i = node; i; i = i->parent) {
  53.795 +		if (!write_node(conn, i))
  53.796  			return NULL;
  53.797 +		talloc_set_destructor(i, destroy_node);
  53.798  	}
  53.799 -	return dir;
  53.800 -}
  53.801  
  53.802 -static bool node_exists(struct connection *conn, const char *node)
  53.803 -{
  53.804 -	struct stat st;
  53.805 -
  53.806 -	return lstat(node_dir(conn->transaction, node), &st) == 0;
  53.807 +	/* OK, now remove destructors so they stay around */
  53.808 +	for (i = node; i; i = i->parent)
  53.809 +		talloc_set_destructor(i, NULL);
  53.810 +	return node;
  53.811  }
  53.812  
  53.813  /* path, data... */
  53.814  static void do_write(struct connection *conn, struct buffered_data *in)
  53.815  {
  53.816  	unsigned int offset, datalen;
  53.817 +	struct node *node;
  53.818  	char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
  53.819 -	char *node, *tmppath;
  53.820 +	char *name;
  53.821  
  53.822  	/* Extra "strings" can be created by binary data. */
  53.823  	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) {
  53.824 @@ -981,99 +847,115 @@ static void do_write(struct connection *
  53.825  		return;
  53.826  	}
  53.827  
  53.828 -	node = canonicalize(conn, vec[0]);
  53.829 -	if (!within_transaction(conn->transaction, node)) {
  53.830 -		send_error(conn, EROFS);
  53.831 -		return;
  53.832 -	}
  53.833 -
  53.834 -	if (transaction_block(conn, node))
  53.835 -		return;
  53.836 -
  53.837  	offset = strlen(vec[0]) + 1;
  53.838  	datalen = in->used - offset;
  53.839  
  53.840 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  53.841 -		send_error(conn, errno);
  53.842 -		return;
  53.843 -	}
  53.844 -
  53.845 -	if (!node_exists(conn, node)) {
  53.846 -		char *dir;
  53.847 -
  53.848 -		/* Does not exist... */
  53.849 +	name = canonicalize(conn, vec[0]);
  53.850 +	node = get_node(conn, name, XS_PERM_WRITE);
  53.851 +	if (!node) {
  53.852 +		/* No permissions, invalid input? */
  53.853  		if (errno != ENOENT) {
  53.854  			send_error(conn, errno);
  53.855  			return;
  53.856  		}
  53.857 -
  53.858 -		dir = tempdir(conn, node, in->buffer + offset, datalen);
  53.859 -		if (!dir || !commit_dir(dir)) {
  53.860 +		node = create_node(conn, name, in->buffer + offset, datalen);
  53.861 +		if (!node) {
  53.862  			send_error(conn, errno);
  53.863  			return;
  53.864  		}
  53.865 -		
  53.866  	} else {
  53.867 -		/* Exists... */
  53.868 -		tmppath = tempfile(node_datafile(conn->transaction, node),
  53.869 -				   in->buffer + offset, datalen);
  53.870 -		if (!tmppath) {
  53.871 +		node->data = in->buffer + offset;
  53.872 +		node->datalen = datalen;
  53.873 +		if (!write_node(conn, node)){
  53.874  			send_error(conn, errno);
  53.875  			return;
  53.876  		}
  53.877 -
  53.878 -		commit_tempfile(tmppath);
  53.879  	}
  53.880  
  53.881 -	add_change_node(conn->transaction, node, false);
  53.882 -	fire_watches(conn, node, false);
  53.883 +	add_change_node(conn->transaction, name, false);
  53.884 +	fire_watches(conn, name, false);
  53.885  	send_ack(conn, XS_WRITE);
  53.886  }
  53.887  
  53.888 -static void do_mkdir(struct connection *conn, const char *node)
  53.889 +static void do_mkdir(struct connection *conn, const char *name)
  53.890  {
  53.891 -	char *dir;
  53.892 +	struct node *node;
  53.893  
  53.894 -	node = canonicalize(conn, node);
  53.895 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  53.896 -		send_error(conn, errno);
  53.897 -		return;
  53.898 -	}
  53.899 -
  53.900 -	if (!within_transaction(conn->transaction, node)) {
  53.901 -		send_error(conn, EROFS);
  53.902 -		return;
  53.903 -	}
  53.904 -
  53.905 -	if (transaction_block(conn, node))
  53.906 -		return;
  53.907 +	name = canonicalize(conn, name);
  53.908 +	node = get_node(conn, name, XS_PERM_WRITE);
  53.909  
  53.910  	/* If it already exists, fine. */
  53.911 -	if (node_exists(conn, node)) {
  53.912 -		send_ack(conn, XS_MKDIR);
  53.913 -		return;
  53.914 +	if (!node) {
  53.915 +		/* No permissions? */
  53.916 +		if (errno != ENOENT) {
  53.917 +			send_error(conn, errno);
  53.918 +			return;
  53.919 +		}
  53.920 +		node = create_node(conn, name, NULL, 0);
  53.921 +		if (!node) {
  53.922 +			send_error(conn, errno);
  53.923 +			return;
  53.924 +		}
  53.925 +		add_change_node(conn->transaction, name, false);
  53.926 +		fire_watches(conn, name, false);
  53.927  	}
  53.928 -
  53.929 -	dir = tempdir(conn, node, NULL, 0);
  53.930 -	if (!dir || !commit_dir(dir)) {
  53.931 -		send_error(conn, errno);
  53.932 -		return;
  53.933 -	}
  53.934 -
  53.935 -	add_change_node(conn->transaction, node, false);
  53.936 -	fire_watches(conn, node, false);
  53.937  	send_ack(conn, XS_MKDIR);
  53.938  }
  53.939  
  53.940 -static void do_rm(struct connection *conn, const char *node)
  53.941 +static void delete_node(struct connection *conn, struct node *node)
  53.942  {
  53.943 -	char *tmppath, *path;
  53.944 +	unsigned int i;
  53.945  
  53.946 -	node = canonicalize(conn, node);
  53.947 -	if (!check_node_perms(conn, node, XS_PERM_WRITE)) {
  53.948 +	/* Delete self, then delete children.  If something goes wrong,
  53.949 +	 * consistency check will clean up this way. */
  53.950 +	delete_node_single(conn, node);
  53.951 +
  53.952 +	/* Delete children, too: names are packed nul-terminated strings. */
  53.953 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  53.954 +		struct node *child;
  53.955 +
  53.956 +		child = read_node(conn, 
  53.957 +				  talloc_asprintf(node, "%s/%s", node->name,
  53.958 +						  node->children + i));
  53.959 +		if (!child)
  53.960 +			corrupt(conn, "No child '%s' found", node->children+i);
  53.961 +		delete_node(conn, child);
  53.962 +	}
  53.963 +}
  53.964 +
  53.965 +/* Delete memory using memmove. */
  53.966 +static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
  53.967 +{
  53.968 +	memmove(mem + off, mem + off + len, total - off - len);
  53.969 +}
  53.970 +
  53.971 +static bool delete_child(struct connection *conn,
  53.972 +			 struct node *node, const char *childname)
  53.973 +{
  53.974 +	unsigned int i;
  53.975 +
  53.976 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  53.977 +		if (streq(node->children+i, childname)) {
  53.978 +			memdel(node->children, i, strlen(childname) + 1,
  53.979 +			       node->childlen);
  53.980 +			node->childlen -= strlen(childname) + 1;
  53.981 +			return write_node(conn, node);
  53.982 +		}
  53.983 +	}
  53.984 +	corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
  53.985 +}
  53.986 +
  53.987 +static void do_rm(struct connection *conn, const char *name)
  53.988 +{
  53.989 +	struct node *node, *parent;
  53.990 +
  53.991 +	name = canonicalize(conn, name);
  53.992 +	node = get_node(conn, name, XS_PERM_WRITE);
  53.993 +	if (!node) {
  53.994  		/* Didn't exist already?  Fine, if parent exists. */
  53.995  		if (errno == ENOENT) {
  53.996 -			if (node_exists(conn, get_parent(node))) {
  53.997 +			node = read_node(conn, get_parent(name));
  53.998 +			if (node) {
  53.999  				send_ack(conn, XS_RM);
 53.1000  				return;
 53.1001  			}
 53.1002 @@ -1084,53 +966,43 @@ static void do_rm(struct connection *con
 53.1003  		return;
 53.1004  	}
 53.1005  
 53.1006 -	if (!within_transaction(conn->transaction, node)) {
 53.1007 -		send_error(conn, EROFS);
 53.1008 -		return;
 53.1009 -	}
 53.1010 -
 53.1011 -	if (transaction_block(conn, node))
 53.1012 -		return;
 53.1013 -
 53.1014 -	if (streq(node, "/")) {
 53.1015 +	if (streq(name, "/")) {
 53.1016  		send_error(conn, EINVAL);
 53.1017  		return;
 53.1018  	}
 53.1019  
 53.1020 -	/* We move the directory to temporary name, destructor cleans up. */
 53.1021 -	path = node_dir(conn->transaction, node);
 53.1022 -	tmppath = talloc_asprintf(node, "%s.tmp", path);
 53.1023 -	talloc_set_destructor(tmppath, destroy_path);
 53.1024 +	/* Delete from parent first, then if something explodes fsck cleans. */
 53.1025 +	parent = read_node(conn, get_parent(name));
 53.1026 +	if (!parent) {
 53.1027 +		send_error(conn, EINVAL);
 53.1028 +		return;
 53.1029 +	}
 53.1030  
 53.1031 -	if (rename(path, tmppath) != 0) {
 53.1032 +	if (!delete_child(conn, parent, basename(name))) {
 53.1033 +		send_error(conn, EINVAL);
 53.1034 +		return;
 53.1035 +	}
 53.1036 +
 53.1037 +	delete_node(conn, node);
 53.1038 +	add_change_node(conn->transaction, name, true);
 53.1039 +	fire_watches(conn, name, true);
 53.1040 +	send_ack(conn, XS_RM);
 53.1041 +}
 53.1042 +
 53.1043 +static void do_get_perms(struct connection *conn, const char *name)
 53.1044 +{
 53.1045 +	struct node *node;
 53.1046 +	char *strings;
 53.1047 +	unsigned int len;
 53.1048 +
 53.1049 +	name = canonicalize(conn, name);
 53.1050 +	node = get_node(conn, name, XS_PERM_READ);
 53.1051 +	if (!node) {
 53.1052  		send_error(conn, errno);
 53.1053  		return;
 53.1054  	}
 53.1055  
 53.1056 -	add_change_node(conn->transaction, node, true);
 53.1057 -	fire_watches(conn, node, true);
 53.1058 -	send_ack(conn, XS_RM);
 53.1059 -}
 53.1060 -
 53.1061 -static void do_get_perms(struct connection *conn, const char *node)
 53.1062 -{
 53.1063 -	struct xs_permissions *perms;
 53.1064 -	char *strings;
 53.1065 -	unsigned int len, num;
 53.1066 -
 53.1067 -	node = canonicalize(conn, node);
 53.1068 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
 53.1069 -		send_error(conn, errno);
 53.1070 -		return;
 53.1071 -	}
 53.1072 -
 53.1073 -	perms = get_perms(node_dir(conn->transaction, node), &num);
 53.1074 -	if (!perms) {
 53.1075 -		send_error(conn, errno);
 53.1076 -		return;
 53.1077 -	}
 53.1078 -
 53.1079 -	strings = perms_to_strings(node, perms, num, &len);
 53.1080 +	strings = perms_to_strings(node, node->perms, node->num_perms, &len);
 53.1081  	if (!strings)
 53.1082  		send_error(conn, errno);
 53.1083  	else
 53.1084 @@ -1140,8 +1012,8 @@ static void do_get_perms(struct connecti
 53.1085  static void do_set_perms(struct connection *conn, struct buffered_data *in)
 53.1086  {
 53.1087  	unsigned int num;
 53.1088 -	char *node, *permstr;
 53.1089 -	struct xs_permissions *perms;
 53.1090 +	char *name, *permstr;
 53.1091 +	struct node *node;
 53.1092  
 53.1093  	num = xs_count_strings(in->buffer, in->used);
 53.1094  	if (num < 2) {
 53.1095 @@ -1150,37 +1022,30 @@ static void do_set_perms(struct connecti
 53.1096  	}
 53.1097  
 53.1098  	/* First arg is node name. */
 53.1099 -	node = canonicalize(conn, in->buffer);
 53.1100 +	name = canonicalize(conn, in->buffer);
 53.1101  	permstr = in->buffer + strlen(in->buffer) + 1;
 53.1102  	num--;
 53.1103  
 53.1104 -	if (!within_transaction(conn->transaction, node)) {
 53.1105 -		send_error(conn, EROFS);
 53.1106 -		return;
 53.1107 -	}
 53.1108 -
 53.1109 -	if (transaction_block(conn, node))
 53.1110 -		return;
 53.1111 -
 53.1112  	/* We must own node to do this (tools can do this too). */
 53.1113 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER)) {
 53.1114 +	node = get_node(conn, name, XS_PERM_WRITE|XS_PERM_OWNER);
 53.1115 +	if (!node) {
 53.1116  		send_error(conn, errno);
 53.1117  		return;
 53.1118  	}
 53.1119  
 53.1120 -	perms = talloc_array(node, struct xs_permissions, num);
 53.1121 -	if (!xs_strings_to_perms(perms, num, permstr)) {
 53.1122 +	node->perms = talloc_array(node, struct xs_permissions, num);
 53.1123 +	node->num_perms = num;
 53.1124 +	if (!xs_strings_to_perms(node->perms, num, permstr)) {
 53.1125 +		send_error(conn, errno);
 53.1126 +		return;
 53.1127 +	}
 53.1128 +	if (!write_node(conn, node)) {
 53.1129  		send_error(conn, errno);
 53.1130  		return;
 53.1131  	}
 53.1132  
 53.1133 -	if (!set_perms(conn->transaction, node, perms, num)) {
 53.1134 -		send_error(conn, errno);
 53.1135 -		return;
 53.1136 -	}
 53.1137 -
 53.1138 -	add_change_node(conn->transaction, node, false);
 53.1139 -	fire_watches(conn, node, false);
 53.1140 +	add_change_node(conn->transaction, name, false);
 53.1141 +	fire_watches(conn, name, false);
 53.1142  	send_ack(conn, XS_SET_PERMS);
 53.1143  }
 53.1144  
 53.1145 @@ -1221,14 +1086,10 @@ static void process_message(struct conne
 53.1146  	case XS_SHUTDOWN:
 53.1147  		/* FIXME: Implement gentle shutdown too. */
 53.1148  		/* Only tools can do this. */
 53.1149 -		if (conn->id != 0) {
 53.1150 +		if (conn->id != 0 || !conn->can_write) {
 53.1151  			send_error(conn, EACCES);
 53.1152  			break;
 53.1153  		}
 53.1154 -		if (!conn->can_write) {
 53.1155 -			send_error(conn, EROFS);
 53.1156 -			break;
 53.1157 -		}
 53.1158  		send_ack(conn, XS_SHUTDOWN);
 53.1159  		/* Everything hangs off auto-free context, freed at exit. */
 53.1160  		exit(0);
 53.1161 @@ -1263,7 +1124,7 @@ static void process_message(struct conne
 53.1162  		break;
 53.1163  
 53.1164  	case XS_TRANSACTION_START:
 53.1165 -		do_transaction_start(conn, onearg(in));
 53.1166 +		do_transaction_start(conn, in);
 53.1167  		break;
 53.1168  
 53.1169  	case XS_TRANSACTION_END:
 53.1170 @@ -1309,6 +1170,8 @@ static void consider_message(struct conn
 53.1171  	/* For simplicity, we kill the connection on OOM. */
 53.1172  	talloc_set_fail_handler(out_of_mem, &talloc_fail);
 53.1173  	if (setjmp(talloc_fail)) {
 53.1174 +		/* Free in before conn, in case it needs something. */
 53.1175 +		talloc_free(in);
 53.1176  		talloc_free(conn);
 53.1177  		goto end;
 53.1178  	}
 53.1179 @@ -1330,16 +1193,8 @@ static void consider_message(struct conn
 53.1180  	conn->in = new_buffer(conn);
 53.1181  	process_message(conn, in);
 53.1182  
 53.1183 -	if (conn->state == BLOCKED) {
 53.1184 -		/* Blocked by transaction: queue for re-xmit. */
 53.1185 -		talloc_free(conn->in);
 53.1186 -		conn->in = in;
 53.1187 -		in = NULL;
 53.1188 -		trace_blocked(conn, conn->in);
 53.1189 -	}
 53.1190 -
 53.1191 +	talloc_free(in);
 53.1192  end:
 53.1193 -	talloc_free(in);
 53.1194  	talloc_set_fail_handler(NULL, NULL);
 53.1195  	if (talloc_total_blocks(NULL)
 53.1196  	    != talloc_total_blocks(talloc_autofree_context()) + 1) {
 53.1197 @@ -1350,7 +1205,7 @@ end:
 53.1198  
 53.1199  /* Errors in reading or allocating here mean we get out of sync, so we
 53.1200   * drop the whole client connection. */
 53.1201 -void handle_input(struct connection *conn)
 53.1202 +static void handle_input(struct connection *conn)
 53.1203  {
 53.1204  	int bytes;
 53.1205  	struct buffered_data *in;
 53.1206 @@ -1402,41 +1257,12 @@ bad_client:
 53.1207  	talloc_free(conn);
 53.1208  }
 53.1209  
 53.1210 -void handle_output(struct connection *conn)
 53.1211 +static void handle_output(struct connection *conn)
 53.1212  {
 53.1213  	if (!write_message(conn))
 53.1214  		talloc_free(conn);
 53.1215  }
 53.1216  
 53.1217 -/* If a transaction has ended, see if we can unblock any connections. */
 53.1218 -static void unblock_connections(void)
 53.1219 -{
 53.1220 -	struct connection *i, *tmp;
 53.1221 -
 53.1222 -	list_for_each_entry_safe(i, tmp, &connections, list) {
 53.1223 -		switch (i->state) {
 53.1224 -		case BLOCKED:
 53.1225 -			if (!transaction_covering_node(i->blocked_by)) {
 53.1226 -				talloc_free(i->blocked_by);
 53.1227 -				i->blocked_by = NULL;
 53.1228 -				i->state = OK;
 53.1229 -				consider_message(i);
 53.1230 -			}
 53.1231 -			break;
 53.1232 -		case BUSY:
 53.1233 -		case OK:
 53.1234 -			break;
 53.1235 -		}
 53.1236 -	}
 53.1237 -
 53.1238 -	/* To balance bias, move first entry to end. */
 53.1239 -	if (!list_empty(&connections)) {
 53.1240 -		i = list_top(&connections, struct connection, list);
 53.1241 -		list_del(&i->list);
 53.1242 -		list_add_tail(&i->list, &connections);
 53.1243 -	}
 53.1244 -}
 53.1245 -
 53.1246  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
 53.1247  {
 53.1248  	/*
 53.1249 @@ -1451,7 +1277,6 @@ struct connection *new_connection(connwr
 53.1250  		return NULL;
 53.1251  
 53.1252  	new->state = OK;
 53.1253 -	new->blocked_by = NULL;
 53.1254  	new->out = new->waiting_reply = NULL;
 53.1255  	new->waiting_for_ack = NULL;
 53.1256  	new->fd = -1;
 53.1257 @@ -1504,25 +1329,9 @@ static void accept_connection(int sock, 
 53.1258  		close(fd);
 53.1259  }
 53.1260  
 53.1261 -/* Calc timespan from now to absolute time. */
 53.1262 -static void time_relative_to_now(struct timeval *tv)
 53.1263 -{
 53.1264 -	struct timeval now;
 53.1265 -
 53.1266 -	gettimeofday(&now, NULL);
 53.1267 -	if (timercmp(&now, tv, >))
 53.1268 -		timerclear(tv);
 53.1269 -	else {
 53.1270 -		tv->tv_sec -= now.tv_sec;
 53.1271 -		if (now.tv_usec > tv->tv_usec) {
 53.1272 -			tv->tv_sec--;
 53.1273 -			tv->tv_usec += 1000000;
 53.1274 -		}
 53.1275 -		tv->tv_usec -= now.tv_usec;
 53.1276 -	}
 53.1277 -}
 53.1278 -
 53.1279  #ifdef TESTING
 53.1280 +/* Valgrind can check our writes better if we don't use mmap */
 53.1281 +#define TDB_FLAGS TDB_NOMMAP
 53.1282  /* Useful for running under debugger. */
 53.1283  void dump_connection(void)
 53.1284  {
 53.1285 @@ -1532,13 +1341,10 @@ void dump_connection(void)
 53.1286  		printf("Connection %p:\n", i);
 53.1287  		printf("    state = %s\n",
 53.1288  		       i->state == OK ? "OK"
 53.1289 -		       : i->state == BLOCKED ? "BLOCKED"
 53.1290  		       : i->state == BUSY ? "BUSY"
 53.1291  		       : "INVALID");
 53.1292  		if (i->id)
 53.1293  			printf("    id = %i\n", i->id);
 53.1294 -		if (i->blocked_by)
 53.1295 -			printf("    blocked on = %s\n", i->blocked_by);
 53.1296  		if (!i->in->inhdr || i->in->used)
 53.1297  			printf("    got %i bytes of %s\n",
 53.1298  			       i->in->used, i->in->inhdr ? "header" : "data");
 53.1299 @@ -1559,44 +1365,53 @@ void dump_connection(void)
 53.1300  		dump_watches(i);
 53.1301  	}
 53.1302  }
 53.1303 +#else
 53.1304 +#define TDB_FLAGS 0
 53.1305  #endif
 53.1306  
 53.1307 +/* We create initial nodes manually. */
 53.1308 +static void manual_node(const char *name, const char *child)
 53.1309 +{
 53.1310 +	struct node *node;
 53.1311 +	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 53.1312 +
 53.1313 +	node = talloc(NULL, struct node);
 53.1314 +	node->name = name;
 53.1315 +	node->perms = &perms;
 53.1316 +	node->num_perms = 1;
 53.1317 +	node->data = NULL;
 53.1318 +	node->datalen = 0;
 53.1319 +	node->children = (char *)child;
 53.1320 +	if (child)
 53.1321 +		node->childlen = strlen(child) + 1;
 53.1322 +	else
 53.1323 +		node->childlen = 0;
 53.1324 +
 53.1325 +	if (!write_node(NULL, node))
 53.1326 +		barf_perror("Could not create initial node %s", name);
 53.1327 +	talloc_free(node);
 53.1328 +}
 53.1329 +
 53.1330 +#
 53.1331 +
 53.1332  static void setup_structure(void)
 53.1333  {
 53.1334 -	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 53.1335 -	char *root, *dir, *permfile;
 53.1336 -
 53.1337 -	/* Create root directory, with permissions. */
 53.1338 -	if (mkdir(xs_daemon_store(), 0750) != 0) {
 53.1339 -		if (errno != EEXIST)
 53.1340 -			barf_perror("Could not create root %s",
 53.1341 -				    xs_daemon_store());
 53.1342 -		return;
 53.1343 -	}
 53.1344 -	root = talloc_strdup(talloc_autofree_context(), "/");
 53.1345 -	if (!set_perms(NULL, root, &perms, 1))
 53.1346 -		barf_perror("Could not create permissions in root");
 53.1347 +	char *tdbname;
 53.1348 +	tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
 53.1349 +	tdb_ctx = tdb_open(tdbname, 0, TDB_FLAGS, O_RDWR, 0);
 53.1350  
 53.1351 -	/* Create tool directory, with xenstored subdir. */
 53.1352 -	dir = talloc_asprintf(root, "%s/%s", xs_daemon_store(), "tool");
 53.1353 -	if (mkdir(dir, 0750) != 0)
 53.1354 -		barf_perror("Making dir %s", dir);
 53.1355 -	
 53.1356 -	permfile = talloc_strdup(root, "/tool");
 53.1357 -	if (!set_perms(NULL, permfile, &perms, 1))
 53.1358 -		barf_perror("Could not create permissions on %s", permfile);
 53.1359 +	if (!tdb_ctx) {
 53.1360 +		tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
 53.1361 +				   0640);
 53.1362 +		if (!tdb_ctx)
 53.1363 +			barf_perror("Could not create tdb file %s", tdbname);
 53.1364  
 53.1365 -	dir = talloc_asprintf(root, "%s/%s", dir, "xenstored");
 53.1366 -	if (mkdir(dir, 0750) != 0)
 53.1367 -		barf_perror("Making dir %s", dir);
 53.1368 -	
 53.1369 -	permfile = talloc_strdup(root, "/tool/xenstored");
 53.1370 -	if (!set_perms(NULL, permfile, &perms, 1))
 53.1371 -		barf_perror("Could not create permissions on %s", permfile);
 53.1372 -	talloc_free(root);
 53.1373 -	if (mkdir(xs_daemon_transactions(), 0750) != 0)
 53.1374 -		barf_perror("Could not create transaction dir %s",
 53.1375 -			    xs_daemon_transactions());
 53.1376 +		manual_node("/", "tool");
 53.1377 +		manual_node("/tool", "xenstored");
 53.1378 +		manual_node("/tool/xenstored", NULL);
 53.1379 +	}
 53.1380 +
 53.1381 +	/* FIXME: Fsck */
 53.1382  }
 53.1383  
 53.1384  static void write_pidfile(const char *pidfile)
 53.1385 @@ -1759,17 +1574,8 @@ int main(int argc, char *argv[])
 53.1386  	/* FIXME: Rewrite so noone can starve. */
 53.1387  	for (;;) {
 53.1388  		struct connection *i;
 53.1389 -		struct timeval *tvp = NULL, tv;
 53.1390  
 53.1391 -		timerclear(&tv);
 53.1392 -		shortest_transaction_timeout(&tv);
 53.1393 -		shortest_watch_ack_timeout(&tv);
 53.1394 -		if (timerisset(&tv)) {
 53.1395 -			time_relative_to_now(&tv);
 53.1396 -			tvp = &tv;
 53.1397 -		}
 53.1398 -
 53.1399 -		if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
 53.1400 +		if (select(max+1, &inset, &outset, NULL, NULL) < 0) {
 53.1401  			if (errno == EINTR)
 53.1402  				continue;
 53.1403  			barf_perror("Select failed");
 53.1404 @@ -1818,14 +1624,6 @@ int main(int argc, char *argv[])
 53.1405  			}
 53.1406  		}
 53.1407  
 53.1408 -		if (tvp) {
 53.1409 -			check_transaction_timeout();
 53.1410 -			check_watch_ack_timeout();
 53.1411 -		}
 53.1412 -
 53.1413 -		/* If transactions ended, we might be able to do more work. */
 53.1414 -		unblock_connections();
 53.1415 -
 53.1416  		max = initialize_set(&inset, &outset, *sock, *ro_sock,
 53.1417  				     event_fd);
 53.1418  	}
    54.1 --- a/tools/xenstore/xenstored_core.h	Fri Sep 23 15:41:28 2005 -0600
    54.2 +++ b/tools/xenstore/xenstored_core.h	Mon Sep 26 11:07:49 2005 -0600
    54.3 @@ -28,6 +28,7 @@
    54.4  #include "xs_lib.h"
    54.5  #include "xenstored.h"
    54.6  #include "list.h"
    54.7 +#include "tdb.h"
    54.8  
    54.9  struct buffered_data
   54.10  {
   54.11 @@ -49,8 +50,6 @@ typedef int connreadfn_t(struct connecti
   54.12  
   54.13  enum state
   54.14  {
   54.15 -	/* Blocked by transaction. */
   54.16 -	BLOCKED,
   54.17  	/* Doing action, not listening */
   54.18  	BUSY,
   54.19  	/* Completed */
   54.20 @@ -70,9 +69,6 @@ struct connection
   54.21  	/* Blocked on transaction?  Busy? */
   54.22  	enum state state;
   54.23  
   54.24 -	/* Node we are waiting for (if state == BLOCKED) */
   54.25 -	char *blocked_by;
   54.26 -
   54.27  	/* Is this a read-only connection? */
   54.28  	bool can_write;
   54.29  
   54.30 @@ -103,9 +99,27 @@ struct connection
   54.31  };
   54.32  extern struct list_head connections;
   54.33  
   54.34 -/* Return length of string (including nul) at this offset. */
   54.35 -unsigned int get_string(const struct buffered_data *data,
   54.36 -			unsigned int offset);
   54.37 +struct node {
   54.38 +	const char *name;
   54.39 +
   54.40 +	/* Database I came from */
   54.41 +	TDB_CONTEXT *tdb;
   54.42 +
   54.43 +	/* Parent (optional) */
   54.44 +	struct node *parent;
   54.45 +
   54.46 +	/* Permissions. */
   54.47 +	unsigned int num_perms;
   54.48 +	struct xs_permissions *perms;
   54.49 +
   54.50 +	/* Contents. */
   54.51 +	unsigned int datalen;
   54.52 +	void *data;
   54.53 +
   54.54 +	/* Children, each nul-terminated. */
   54.55 +	unsigned int childlen;
   54.56 +	char *children;
   54.57 +};
   54.58  
   54.59  /* Break input into vectors, return the number, fill in up to num of them. */
   54.60  unsigned int get_strings(struct buffered_data *data,
   54.61 @@ -114,9 +128,6 @@ unsigned int get_strings(struct buffered
   54.62  /* Is child node a child or equal to parent node? */
   54.63  bool is_child(const char *child, const char *parent);
   54.64  
   54.65 -/* Create a new buffer with lifetime of context. */
   54.66 -struct buffered_data *new_buffer(void *ctx);
   54.67 -
   54.68  void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
   54.69  		const void *data, unsigned int len);
   54.70  
   54.71 @@ -129,15 +140,22 @@ void send_error(struct connection *conn,
   54.72  /* Canonicalize this path if possible. */
   54.73  char *canonicalize(struct connection *conn, const char *node);
   54.74  
   54.75 -/* Check permissions on this node. */
   54.76 -bool check_node_perms(struct connection *conn, const char *node,
   54.77 -		      enum xs_perm_type perm);
   54.78 -
   54.79  /* Check if node is an event node. */
   54.80  bool check_event_node(const char *node);
   54.81  
   54.82 -/* Path to this node outside transaction. */
   54.83 -char *node_dir_outside_transaction(const char *node);
   54.84 +/* Get this node, checking we have permissions. */
   54.85 +struct node *get_node(struct connection *conn,
   54.86 +		      const char *name,
   54.87 +		      enum xs_perm_type perm);
   54.88 +
   54.89 +/* Get TDB context for this connection */
   54.90 +TDB_CONTEXT *tdb_context(struct connection *conn);
   54.91 +
   54.92 +/* Destructor for tdbs: required for transaction code */
   54.93 +int destroy_tdb(void *_tdb);
   54.94 +
   54.95 +/* Replace the tdb: required for transaction code */
   54.96 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
   54.97  
   54.98  /* Fail due to excessive corruption, capitalist pigdogs! */
   54.99  void __attribute__((noreturn)) corrupt(struct connection *conn,
  54.100 @@ -145,23 +163,9 @@ void __attribute__((noreturn)) corrupt(s
  54.101  
  54.102  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
  54.103  
  54.104 -void handle_input(struct connection *conn);
  54.105 -void handle_output(struct connection *conn);
  54.106 -
  54.107  /* Is this a valid node name? */
  54.108  bool is_valid_nodename(const char *node);
  54.109  
  54.110 -/* Return a pointer to an open dir, self-closig and attached to pathname. */
  54.111 -DIR **talloc_opendir(const char *pathname);
  54.112 -
  54.113 -/* Return a pointer to an fd, self-closing and attached to this pathname. */
  54.114 -int *talloc_open(const char *pathname, int flags, int mode);
  54.115 -
  54.116 -/* Convenient talloc-style destructor for paths. */
  54.117 -int destroy_path(void *path);
  54.118 -
  54.119 -/* Read entire contents of a talloced fd. */
  54.120 -void *read_all(int *fd, unsigned int *size);
  54.121  
  54.122  /* Tracing infrastructure. */
  54.123  void trace_create(const void *data, const char *type);
    55.1 --- a/tools/xenstore/xenstored_domain.c	Fri Sep 23 15:41:28 2005 -0600
    55.2 +++ b/tools/xenstore/xenstored_domain.c	Mon Sep 26 11:07:49 2005 -0600
    55.3 @@ -309,16 +309,11 @@ void do_introduce(struct connection *con
    55.4  		return;
    55.5  	}
    55.6  
    55.7 -	if (conn->id != 0) {
    55.8 +	if (conn->id != 0 || !conn->can_write) {
    55.9  		send_error(conn, EACCES);
   55.10  		return;
   55.11  	}
   55.12  
   55.13 -	if (!conn->can_write) {
   55.14 -		send_error(conn, EROFS);
   55.15 -		return;
   55.16 -	}
   55.17 -
   55.18  	/* Sanity check args. */
   55.19  	if ((atoi(vec[2]) <= 0) || !is_valid_nodename(vec[3])) {
   55.20  		send_error(conn, EINVAL);
   55.21 @@ -386,7 +381,7 @@ void do_release(struct connection *conn,
   55.22  
   55.23  	talloc_free(domain->conn);
   55.24  
   55.25 -	fire_watches(NULL, "@releaseDomain", false);
   55.26 +	fire_watches(conn, "@releaseDomain", false);
   55.27  
   55.28  	send_ack(conn, XS_RELEASE);
   55.29  }
    56.1 --- a/tools/xenstore/xenstored_transaction.c	Fri Sep 23 15:41:28 2005 -0600
    56.2 +++ b/tools/xenstore/xenstored_transaction.c	Mon Sep 26 11:07:49 2005 -0600
    56.3 @@ -26,6 +26,7 @@
    56.4  #include <stdarg.h>
    56.5  #include <stdlib.h>
    56.6  #include <fcntl.h>
    56.7 +#include <unistd.h>
    56.8  #include "talloc.h"
    56.9  #include "list.h"
   56.10  #include "xenstored_transaction.h"
   56.11 @@ -51,74 +52,26 @@ struct transaction
   56.12  	/* Global list of transactions. */
   56.13  	struct list_head list;
   56.14  
   56.15 +	/* Generation when transaction started. */
   56.16 +	unsigned int generation;
   56.17 +
   56.18  	/* My owner (conn->transaction == me). */
   56.19  	struct connection *conn;
   56.20  
   56.21 -	/* Subtree this transaction covers */
   56.22 -	char *node;
   56.23 -
   56.24 -	/* Base for this transaction. */
   56.25 -	char *divert;
   56.26 +	/* TDB to work on, and filename */
   56.27 +	TDB_CONTEXT *tdb;
   56.28 +	char *tdb_name;
   56.29  
   56.30  	/* List of changed nodes. */
   56.31  	struct list_head changes;
   56.32 -
   56.33 -	/* Someone's waiting: time limit. */
   56.34 -	struct timeval timeout;
   56.35 -
   56.36 -	/* We've timed out. */
   56.37 -	bool destined_to_fail;
   56.38  };
   56.39  static LIST_HEAD(transactions);
   56.40 -
   56.41 -bool within_transaction(struct transaction *trans, const char *node)
   56.42 -{
   56.43 -	if (!trans)
   56.44 -		return true;
   56.45 -	return is_child(node, trans->node);
   56.46 -}
   56.47 -
   56.48 -/* You are on notice: this transaction is blocking someone. */
   56.49 -static void start_transaction_timeout(struct transaction *trans)
   56.50 -{
   56.51 -	if (timerisset(&trans->timeout))
   56.52 -		return;
   56.53 -
   56.54 -	/* One second timeout. */
   56.55 -	gettimeofday(&trans->timeout, NULL);
   56.56 -	trans->timeout.tv_sec += 1;
   56.57 -}
   56.58 -
   56.59 -struct transaction *transaction_covering_node(const char *node)
   56.60 -{
   56.61 -	struct transaction *i;
   56.62 +static unsigned int generation;
   56.63  
   56.64 -	list_for_each_entry(i, &transactions, list) {
   56.65 -		if (i->destined_to_fail)
   56.66 -			continue;
   56.67 -		if (is_child(i->node, node) || is_child(node, i->node))
   56.68 -			return i;
   56.69 -	}
   56.70 -	return NULL;
   56.71 -}
   56.72 -
   56.73 -bool transaction_block(struct connection *conn, const char *node)
   56.74 +/* Return tdb context to use for this connection. */
   56.75 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans)
   56.76  {
   56.77 -	struct transaction *trans;
   56.78 -
   56.79 -	/* Transactions don't overlap, so we can't be blocked by
   56.80 -	 * others if we're in one. */
   56.81 -	if (conn->transaction)
   56.82 -		return false;
   56.83 -
   56.84 -	trans = transaction_covering_node(node);
   56.85 -	if (trans) {
   56.86 -		start_transaction_timeout(trans);
   56.87 -		conn->state = BLOCKED;
   56.88 -		conn->blocked_by = talloc_strdup(conn, node);
   56.89 -		return true;
   56.90 -	}
   56.91 -	return false;
   56.92 +	return trans->tdb;
   56.93  }
   56.94  
   56.95  /* Callers get a change node (which can fail) and only commit after they've
   56.96 @@ -127,8 +80,11 @@ void add_change_node(struct transaction 
   56.97  {
   56.98  	struct changed_node *i;
   56.99  
  56.100 -	if (!trans)
  56.101 +	if (!trans) {
  56.102 +		/* They're changing the global database. */
  56.103 +		generation++;
  56.104  		return;
  56.105 +	}
  56.106  
  56.107  	list_for_each_entry(i, &trans->changes, list)
  56.108  		if (streq(i->node, node))
  56.109 @@ -140,167 +96,47 @@ void add_change_node(struct transaction 
  56.110  	list_add_tail(&i->list, &trans->changes);
  56.111  }
  56.112  
  56.113 -char *node_dir_inside_transaction(struct transaction *trans, const char *node)
  56.114 -{
  56.115 -	return talloc_asprintf(node, "%s/%s", trans->divert,
  56.116 -			       node + strlen(trans->node));
  56.117 -}
  56.118 -
  56.119 -void shortest_transaction_timeout(struct timeval *tv)
  56.120 -{
  56.121 -	struct transaction *i;
  56.122 -
  56.123 -	list_for_each_entry(i, &transactions, list) {
  56.124 -		if (!timerisset(&i->timeout))
  56.125 -			continue;
  56.126 -
  56.127 -		if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
  56.128 -			*tv = i->timeout;
  56.129 -	}
  56.130 -}	
  56.131 -
  56.132 -void check_transaction_timeout(void)
  56.133 -{
  56.134 -	struct transaction *i;
  56.135 -	struct timeval now;
  56.136 -
  56.137 -	gettimeofday(&now, NULL);
  56.138 -
  56.139 -	list_for_each_entry(i, &transactions, list) {
  56.140 -		if (!timerisset(&i->timeout))
  56.141 -			continue;
  56.142 -
  56.143 -		if (timercmp(&i->timeout, &now, <))
  56.144 -			i->destined_to_fail = true;
  56.145 -	}
  56.146 -}
  56.147 -
  56.148  static int destroy_transaction(void *_transaction)
  56.149  {
  56.150  	struct transaction *trans = _transaction;
  56.151  
  56.152  	list_del(&trans->list);
  56.153  	trace_destroy(trans, "transaction");
  56.154 -	return destroy_path(trans->divert);
  56.155 -}
  56.156 -
  56.157 -static bool copy_file(const char *src, const char *dst)
  56.158 -{
  56.159 -	int *infd, *outfd;
  56.160 -	void *data;
  56.161 -	unsigned int size;
  56.162 -
  56.163 -	infd = talloc_open(src, O_RDONLY, 0);
  56.164 -	if (!infd)
  56.165 -		return false;
  56.166 -	outfd = talloc_open(dst, O_WRONLY|O_CREAT|O_EXCL, 0640);
  56.167 -	if (!outfd)
  56.168 -		return false;
  56.169 -	data = read_all(infd, &size);
  56.170 -	if (!data)
  56.171 -		return false;
  56.172 -	return xs_write_all(*outfd, data, size);
  56.173 +	if (trans->tdb)
  56.174 +		tdb_close(trans->tdb);
  56.175 +	unlink(trans->tdb_name);
  56.176 +	return 0;
  56.177  }
  56.178  
  56.179 -static bool copy_dir(const char *src, const char *dst)
  56.180 +void do_transaction_start(struct connection *conn, struct buffered_data *in)
  56.181  {
  56.182 -	DIR **dir;
  56.183 -	struct dirent *dirent;
  56.184 -
  56.185 -	if (mkdir(dst, 0750) != 0)
  56.186 -		return false;
  56.187 -
  56.188 -	dir = talloc_opendir(src);
  56.189 -	if (!dir)
  56.190 -		return false;
  56.191 -
  56.192 -	while ((dirent = readdir(*dir)) != NULL) {
  56.193 -		struct stat st;
  56.194 -		char *newsrc, *newdst;
  56.195 -
  56.196 -		if (streq(dirent->d_name, ".") || streq(dirent->d_name, ".."))
  56.197 -			continue;
  56.198 -
  56.199 -		newsrc = talloc_asprintf(src, "%s/%s", src, dirent->d_name);
  56.200 -		newdst = talloc_asprintf(src, "%s/%s", dst, dirent->d_name);
  56.201 -		if (stat(newsrc, &st) != 0)
  56.202 -			return false;
  56.203 -		
  56.204 -		if (S_ISDIR(st.st_mode)) {
  56.205 -			if (!copy_dir(newsrc, newdst))
  56.206 -				return false;
  56.207 -		} else {
  56.208 -			if (!copy_file(newsrc, newdst))
  56.209 -				return false;
  56.210 -		}
  56.211 -		/* Free now so we don't run out of file descriptors */
  56.212 -		talloc_free(newsrc);
  56.213 -		talloc_free(newdst);
  56.214 -	}
  56.215 -	return true;
  56.216 -}
  56.217 -
  56.218 -void do_transaction_start(struct connection *conn, const char *node)
  56.219 -{
  56.220 -	struct transaction *transaction;
  56.221 -	char *dir;
  56.222 +	struct transaction *trans;
  56.223  
  56.224  	if (conn->transaction) {
  56.225  		send_error(conn, EBUSY);
  56.226  		return;
  56.227  	}
  56.228  
  56.229 -	node = canonicalize(conn, node);
  56.230 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  56.231 +	/* Attach transaction to input for autofree until it's complete */
  56.232 +	trans = talloc(in, struct transaction);
  56.233 +	INIT_LIST_HEAD(&trans->changes);
  56.234 +	trans->conn = conn;
  56.235 +	trans->generation = generation;
  56.236 +	trans->tdb_name = talloc_asprintf(trans, "%s.%p",
  56.237 +					  xs_daemon_tdb(), trans);
  56.238 +	trans->tdb = tdb_copy(tdb_context(conn), trans->tdb_name);
  56.239 +	if (!trans->tdb) {
  56.240  		send_error(conn, errno);
  56.241  		return;
  56.242  	}
  56.243 -
  56.244 -	if (transaction_block(conn, node))
  56.245 -		return;
  56.246 -
  56.247 -	dir = node_dir_outside_transaction(node);
  56.248 -
  56.249 -	/* Attach transaction to node for autofree until it's complete */
  56.250 -	transaction = talloc(node, struct transaction);
  56.251 -	transaction->node = talloc_strdup(transaction, node);
  56.252 -	transaction->divert = talloc_asprintf(transaction, "%s/%p", 
  56.253 -					      xs_daemon_transactions(),
  56.254 -					      transaction);
  56.255 -	INIT_LIST_HEAD(&transaction->changes);
  56.256 -	transaction->conn = conn;
  56.257 -	timerclear(&transaction->timeout);
  56.258 -	transaction->destined_to_fail = false;
  56.259 -	list_add_tail(&transaction->list, &transactions);
  56.260 -	talloc_set_destructor(transaction, destroy_transaction);
  56.261 -	trace_create(transaction, "transaction");
  56.262 +	/* Make it close if we go away. */
  56.263 +	talloc_steal(trans, trans->tdb);
  56.264  
  56.265 -	if (!copy_dir(dir, transaction->divert)) {
  56.266 -		send_error(conn, errno);
  56.267 -		return;
  56.268 -	}
  56.269 -
  56.270 -	talloc_steal(conn, transaction);
  56.271 -	conn->transaction = transaction;
  56.272 -	send_ack(transaction->conn, XS_TRANSACTION_START);
  56.273 -}
  56.274 -
  56.275 -static bool commit_transaction(struct transaction *trans)
  56.276 -{
  56.277 -	char *tmp, *dir;
  56.278 -
  56.279 -	/* Move: orig -> .old, repl -> orig.  Cleanup deletes .old. */
  56.280 -	dir = node_dir_outside_transaction(trans->node);
  56.281 -	tmp = talloc_asprintf(trans, "%s.old", dir);
  56.282 -
  56.283 -	if (rename(dir, tmp) != 0)
  56.284 -		return false;
  56.285 -	if (rename(trans->divert, dir) != 0)
  56.286 -		corrupt(trans->conn, "Failed rename %s to %s",
  56.287 -			trans->divert, dir);
  56.288 -
  56.289 -	trans->divert = tmp;
  56.290 -	return true;
  56.291 +	/* Now we own it. */
  56.292 +	conn->transaction = talloc_steal(conn, trans);
  56.293 +	list_add_tail(&trans->list, &transactions);
  56.294 +	talloc_set_destructor(trans, destroy_transaction);
  56.295 +	send_ack(conn, XS_TRANSACTION_START);
  56.296  }
  56.297  
  56.298  void do_transaction_end(struct connection *conn, const char *arg)
  56.299 @@ -318,25 +154,29 @@ void do_transaction_end(struct connectio
  56.300  		return;
  56.301  	}
  56.302  
  56.303 -	/* Set to NULL so fire_watches sends events. */
  56.304 +	/* Set to NULL so fire_watches sends events, tdb_context works. */
  56.305  	trans = conn->transaction;
  56.306  	conn->transaction = NULL;
  56.307  	/* Attach transaction to arg for auto-cleanup */
  56.308  	talloc_steal(arg, trans);
  56.309  
  56.310  	if (streq(arg, "T")) {
  56.311 -		if (trans->destined_to_fail) {
  56.312 -			send_error(conn, ETIMEDOUT);
  56.313 +		/* FIXME: Merge, rather than failing on any change. */
  56.314 +		if (trans->generation != generation) {
  56.315 +			send_error(conn, EAGAIN);
  56.316  			return;
  56.317  		}
  56.318 -		if (!commit_transaction(trans)) {
  56.319 +		if (!replace_tdb(trans->tdb_name, trans->tdb)) {
  56.320  			send_error(conn, errno);
  56.321  			return;
  56.322  		}
  56.323 +		/* Don't close this: we won! */
  56.324 +		trans->tdb = NULL;
  56.325  
  56.326  		/* Fire off the watches for everything that changed. */
  56.327  		list_for_each_entry(i, &trans->changes, list)
  56.328  			fire_watches(conn, i->node, i->recurse);
  56.329 +		generation++;
  56.330  	}
  56.331  	send_ack(conn, XS_TRANSACTION_END);
  56.332  }
    57.1 --- a/tools/xenstore/xenstored_transaction.h	Fri Sep 23 15:41:28 2005 -0600
    57.2 +++ b/tools/xenstore/xenstored_transaction.h	Mon Sep 26 11:07:49 2005 -0600
    57.3 @@ -22,29 +22,14 @@
    57.4  
    57.5  struct transaction;
    57.6  
    57.7 -void do_transaction_start(struct connection *conn, const char *node);
    57.8 +void do_transaction_start(struct connection *conn, struct buffered_data *node);
    57.9  void do_transaction_end(struct connection *conn, const char *arg);
   57.10  
   57.11 -/* Is node covered by this transaction? */
   57.12 -bool within_transaction(struct transaction *trans, const char *node);
   57.13 -
   57.14 -/* If a write op on this node blocked by another connections' transaction,
   57.15 - * mark conn, setup transaction timeout and return true.
   57.16 - */
   57.17 -bool transaction_block(struct connection *conn, const char *node);
   57.18 -
   57.19 -/* Return transaction which covers this node. */
   57.20 -struct transaction *transaction_covering_node(const char *node);
   57.21 -
   57.22 -/* Return directory of node within transaction t. */
   57.23 -char *node_dir_inside_transaction(struct transaction *t, const char *node);
   57.24 +bool transaction_block(struct connection *conn);
   57.25  
   57.26  /* This node was changed: can fail and longjmp. */
   57.27  void add_change_node(struct transaction *trans, const char *node, bool recurse);
   57.28  
   57.29 -/* Get shortest timeout: leave tv unset if none. */
   57.30 -void shortest_transaction_timeout(struct timeval *tv);
   57.31 -
   57.32 -/* Have any transactions timed out yet? */
   57.33 -void check_transaction_timeout(void);
   57.34 +/* Return tdb context to use for this connection. */
   57.35 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans);
   57.36  #endif /* _XENSTORED_TRANSACTION_H */
    58.1 --- a/tools/xenstore/xenstored_watch.c	Fri Sep 23 15:41:28 2005 -0600
    58.2 +++ b/tools/xenstore/xenstored_watch.c	Mon Sep 26 11:07:49 2005 -0600
    58.3 @@ -96,36 +96,38 @@ static int destroy_watch_event(void *_ev
    58.4  }
    58.5  
    58.6  static void add_event(struct connection *conn,
    58.7 -		      struct watch *watch, const char *node)
    58.8 +		      struct watch *watch,
    58.9 +		      const char *name)
   58.10  {
   58.11  	struct watch_event *event;
   58.12  
   58.13 -	/* Check read permission: no permission, no watch event.
   58.14 -	 * If it doesn't exist, we need permission to read parent.
   58.15 -	 */
   58.16 -	if (!check_node_perms(conn, node, XS_PERM_READ|XS_PERM_ENOENT_OK) &&
   58.17 -	    !check_event_node(node)) {
   58.18 -		return;
   58.19 +	if (!check_event_node(name)) {
   58.20 +		/* Can this conn load node, or see that it doesn't exist? */
   58.21 +		struct node *node;
   58.22 +
   58.23 +		node = get_node(conn, name, XS_PERM_READ);
   58.24 +		if (!node && errno != ENOENT)
   58.25 +			return;
   58.26  	}
   58.27  
   58.28  	if (watch->relative_path) {
   58.29 -		node += strlen(watch->relative_path);
   58.30 -		if (*node == '/') /* Could be "" */
   58.31 -			node++;
   58.32 +		name += strlen(watch->relative_path);
   58.33 +		if (*name == '/') /* Could be "" */
   58.34 +			name++;
   58.35  	}
   58.36  
   58.37  	event = talloc(watch, struct watch_event);
   58.38 -	event->len = strlen(node) + 1 + strlen(watch->token) + 1;
   58.39 +	event->len = strlen(name) + 1 + strlen(watch->token) + 1;
   58.40  	event->data = talloc_array(event, char, event->len);
   58.41 -	strcpy(event->data, node);
   58.42 -	strcpy(event->data + strlen(node) + 1, watch->token);
   58.43 +	strcpy(event->data, name);
   58.44 +	strcpy(event->data + strlen(name) + 1, watch->token);
   58.45  	talloc_set_destructor(event, destroy_watch_event);
   58.46  	list_add_tail(&event->list, &watch->events);
   58.47  	trace_create(event, "watch_event");
   58.48  }
   58.49  
   58.50  /* FIXME: we fail to fire on out of memory.  Should drop connections. */
   58.51 -void fire_watches(struct connection *conn, const char *node, bool recurse)
   58.52 +void fire_watches(struct connection *conn, const char *name, bool recurse)
   58.53  {
   58.54  	struct connection *i;
   58.55  	struct watch *watch;
   58.56 @@ -137,9 +139,9 @@ void fire_watches(struct connection *con
   58.57  	/* Create an event for each watch. */
   58.58  	list_for_each_entry(i, &connections, list) {
   58.59  		list_for_each_entry(watch, &i->watches, list) {
   58.60 -			if (is_child(node, watch->node))
   58.61 -				add_event(i, watch, node);
   58.62 -			else if (recurse && is_child(watch->node, node))
   58.63 +			if (is_child(name, watch->node))
   58.64 +				add_event(i, watch, name);
   58.65 +			else if (recurse && is_child(watch->node, name))
   58.66  				add_event(i, watch, watch->node);
   58.67  			else
   58.68  				continue;
   58.69 @@ -156,49 +158,6 @@ static int destroy_watch(void *_watch)
   58.70  	return 0;
   58.71  }
   58.72  
   58.73 -void shortest_watch_ack_timeout(struct timeval *tv)
   58.74 -{
   58.75 -	(void)tv;
   58.76 -#if 0 /* FIXME */
   58.77 -	struct watch *watch;
   58.78 -
   58.79 -	list_for_each_entry(watch, &watches, list) {
   58.80 -		struct watch_event *i;
   58.81 -		list_for_each_entry(i, &watch->events, list) {
   58.82 -			if (!timerisset(&i->timeout))
   58.83 -				continue;
   58.84 -			if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
   58.85 -				*tv = i->timeout;
   58.86 -		}
   58.87 -	}
   58.88 -#endif
   58.89 -}	
   58.90 -
   58.91 -void check_watch_ack_timeout(void)
   58.92 -{
   58.93 -#if 0
   58.94 -	struct watch *watch;
   58.95 -	struct timeval now;
   58.96 -
   58.97 -	gettimeofday(&now, NULL);
   58.98 -	list_for_each_entry(watch, &watches, list) {
   58.99 -		struct watch_event *i, *tmp;
  58.100 -		list_for_each_entry_safe(i, tmp, &watch->events, list) {
  58.101 -			if (!timerisset(&i->timeout))
  58.102 -				continue;
  58.103 -			if (timercmp(&i->timeout, &now, <)) {
  58.104 -				xprintf("Warning: timeout on watch event %s"
  58.105 -					" token %s\n",
  58.106 -					i->node, watch->token);
  58.107 -				trace_watch_timeout(watch->conn, i->node,
  58.108 -						    watch->token);
  58.109 -				timerclear(&i->timeout);
  58.110 -			}
  58.111 -		}
  58.112 -	}
  58.113 -#endif
  58.114 -}
  58.115 -
  58.116  void do_watch(struct connection *conn, struct buffered_data *in)
  58.117  {
  58.118  	struct watch *watch;
    59.1 --- a/tools/xenstore/xenstored_watch.h	Fri Sep 23 15:41:28 2005 -0600
    59.2 +++ b/tools/xenstore/xenstored_watch.h	Mon Sep 26 11:07:49 2005 -0600
    59.3 @@ -32,15 +32,9 @@ bool is_watch_event(struct connection *c
    59.4  /* Look through our watches: if any of them have an event, queue it. */
    59.5  void queue_next_event(struct connection *conn);
    59.6  
    59.7 -/* Fire all watches: recurse means all the children are effected (ie. rm).
    59.8 +/* Fire all watches: recurse means all the children are affected (ie. rm).
    59.9   */
   59.10 -void fire_watches(struct connection *conn, const char *node, bool recurse);
   59.11 -
   59.12 -/* Find shortest timeout: if any, reduce tv (may already be set). */
   59.13 -void shortest_watch_ack_timeout(struct timeval *tv);
   59.14 -
   59.15 -/* Check for watches which may have timed out. */
   59.16 -void check_watch_ack_timeout(void);
   59.17 +void fire_watches(struct connection *conn, const char *name, bool recurse);
   59.18  
   59.19  void dump_watches(struct connection *conn);
   59.20  
    60.1 --- a/tools/xenstore/xs.c	Fri Sep 23 15:41:28 2005 -0600
    60.2 +++ b/tools/xenstore/xs.c	Mon Sep 26 11:07:49 2005 -0600
    60.3 @@ -497,13 +497,12 @@ bool xs_unwatch(struct xs_handle *h, con
    60.4  
    60.5  /* Start a transaction: changes by others will not be seen during this
    60.6   * transaction, and changes will not be visible to others until end.
    60.7 - * Transaction only applies to the given subtree.
    60.8   * You can only have one transaction at any time.
    60.9   * Returns false on failure.
   60.10   */
   60.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree)
   60.12 +bool xs_transaction_start(struct xs_handle *h)
   60.13  {
   60.14 -	return xs_bool(xs_single(h, XS_TRANSACTION_START, subtree, NULL));
   60.15 +	return xs_bool(xs_single(h, XS_TRANSACTION_START, "", NULL));
   60.16  }
   60.17  
   60.18  /* End a transaction.
    61.1 --- a/tools/xenstore/xs.h	Fri Sep 23 15:41:28 2005 -0600
    61.2 +++ b/tools/xenstore/xs.h	Mon Sep 26 11:07:49 2005 -0600
    61.3 @@ -109,16 +109,15 @@ bool xs_unwatch(struct xs_handle *h, con
    61.4  
    61.5  /* Start a transaction: changes by others will not be seen during this
    61.6   * transaction, and changes will not be visible to others until end.
    61.7 - * Transaction only applies to the given subtree.
    61.8   * You can only have one transaction at any time.
    61.9   * Returns false on failure.
   61.10   */
   61.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree);
   61.12 +bool xs_transaction_start(struct xs_handle *h);
   61.13  
   61.14  /* End a transaction.
   61.15   * If abort is true, transaction is discarded instead of committed.
   61.16 - * Returns false on failure, which indicates an error: transactions will
   61.17 - * not fail spuriously.
   61.18 + * Returns false on failure: if errno == EAGAIN, you have to restart
   61.19 + * transaction.
   61.20   */
   61.21  bool xs_transaction_end(struct xs_handle *h, bool abort);
   61.22  
    62.1 --- a/tools/xenstore/xs_lib.c	Fri Sep 23 15:41:28 2005 -0600
    62.2 +++ b/tools/xenstore/xs_lib.c	Mon Sep 26 11:07:49 2005 -0600
    62.3 @@ -50,6 +50,13 @@ static const char *xs_daemon_path(void)
    62.4  	return buf;
    62.5  }
    62.6  
    62.7 +const char *xs_daemon_tdb(void)
    62.8 +{
    62.9 +	static char buf[PATH_MAX];
   62.10 +	sprintf(buf, "%s/tdb", xs_daemon_rootdir());
   62.11 +	return buf;
   62.12 +}
   62.13 +
   62.14  const char *xs_daemon_socket(void)
   62.15  {
   62.16  	return xs_daemon_path();
   62.17 @@ -66,24 +73,6 @@ const char *xs_daemon_socket_ro(void)
   62.18  	return buf;
   62.19  }
   62.20  
   62.21 -const char *xs_daemon_store(void)
   62.22 -{
   62.23 -	static char buf[PATH_MAX];
   62.24 -	if (snprintf(buf, PATH_MAX, "%s/store",
   62.25 -		     xs_daemon_rootdir()) >= PATH_MAX)
   62.26 -		return NULL;
   62.27 -	return buf;
   62.28 -}
   62.29 -
   62.30 -const char *xs_daemon_transactions(void)
   62.31 -{
   62.32 -	static char buf[PATH_MAX];
   62.33 -	if (snprintf(buf, PATH_MAX, "%s/transactions",
   62.34 -		     xs_daemon_rootdir()) >= PATH_MAX)
   62.35 -		return NULL;
   62.36 -	return buf;
   62.37 -}
   62.38 -
   62.39  const char *xs_domain_dev(void)
   62.40  {
   62.41  	char *s = getenv("XENSTORED_PATH");
    63.1 --- a/tools/xenstore/xs_lib.h	Fri Sep 23 15:41:28 2005 -0600
    63.2 +++ b/tools/xenstore/xs_lib.h	Mon Sep 26 11:07:49 2005 -0600
    63.3 @@ -36,7 +36,7 @@ enum xs_perm_type {
    63.4  
    63.5  struct xs_permissions
    63.6  {
    63.7 -	domid_t id;
    63.8 +	unsigned int id;
    63.9  	enum xs_perm_type perms;
   63.10  };
   63.11  
   63.12 @@ -46,9 +46,8 @@ struct xs_permissions
   63.13  /* Path for various daemon things: env vars can override. */
   63.14  const char *xs_daemon_socket(void);
   63.15  const char *xs_daemon_socket_ro(void);
   63.16 -const char *xs_daemon_store(void);
   63.17 -const char *xs_daemon_transactions(void);
   63.18  const char *xs_domain_dev(void);
   63.19 +const char *xs_daemon_tdb(void);
   63.20  
   63.21  /* Simple write function: loops for you. */
   63.22  bool xs_write_all(int fd, const void *data, unsigned int len);
    64.1 --- a/tools/xenstore/xs_random.c	Fri Sep 23 15:41:28 2005 -0600
    64.2 +++ b/tools/xenstore/xs_random.c	Mon Sep 26 11:07:49 2005 -0600
    64.3 @@ -41,7 +41,7 @@ struct ops
    64.4  			  struct xs_permissions *perms,
    64.5  			  unsigned int num);
    64.6  
    64.7 -	bool (*transaction_start)(void *h, const char *subtree);
    64.8 +	bool (*transaction_start)(void *h);
    64.9  	bool (*transaction_end)(void *h, bool abort);
   64.10  
   64.11  	/* Create and destroy a new handle. */
   64.12 @@ -53,7 +53,6 @@ struct file_ops_info
   64.13  {
   64.14  	const char *base;
   64.15  	char *transact_base;
   64.16 -	char *transact;
   64.17  };
   64.18  
   64.19  static void convert_to_dir(const char *dirname)
   64.20 @@ -96,31 +95,6 @@ static char *path_to_name(struct file_op
   64.21  	return filename;
   64.22  }
   64.23  
   64.24 -/* Is child a subnode of parent, or equal? */
   64.25 -static bool is_child(const char *child, const char *parent)
   64.26 -{
   64.27 -	unsigned int len = strlen(parent);
   64.28 -
   64.29 -	/* / should really be "" for this algorithm to work, but that's a
   64.30 -	 * usability nightmare. */
   64.31 -	if (streq(parent, "/"))
   64.32 -		return true;
   64.33 -
   64.34 -	if (strncmp(child, parent, len) != 0)
   64.35 -		return false;
   64.36 -
   64.37 -	return child[len] == '/' || child[len] == '\0';
   64.38 -}
   64.39 -
   64.40 -static bool write_ok(struct file_ops_info *info, const char *path)
   64.41 -{
   64.42 -	if (info->transact && !is_child(path, info->transact)) {
   64.43 -		errno = EROFS;
   64.44 -		return false;
   64.45 -	}
   64.46 -	return true;
   64.47 -}	
   64.48 -
   64.49  static char **file_directory(struct file_ops_info *info,
   64.50  			     const char *path, unsigned int *num)
   64.51  {
   64.52 @@ -184,8 +158,10 @@ static void *file_read(struct file_ops_i
   64.53  
   64.54  	ret = grab_file(filename, &size);
   64.55  	/* Directory exists, .DATA doesn't. */
   64.56 -	if (!ret && errno == ENOENT && strends(filename, ".DATA"))
   64.57 -		errno = EISDIR;
   64.58 +	if (!ret && errno == ENOENT && strends(filename, ".DATA")) {
   64.59 +		ret = strdup("");
   64.60 +		size = 0;
   64.61 +	}
   64.62  	*len = size;
   64.63  	return ret;
   64.64  }
   64.65 @@ -270,9 +246,6 @@ static bool file_set_perms(struct file_o
   64.66  		return false;
   64.67  	}
   64.68  
   64.69 -	if (!write_ok(info, path))
   64.70 -		return false;
   64.71 -
   64.72  	/* Check non-perm file exists. */
   64.73  	if (lstat(filename, &st) != 0)
   64.74  		return false;
   64.75 @@ -338,9 +311,6 @@ static bool file_write(struct file_ops_i
   64.76  	char *filename = filename_to_data(path_to_name(info, path));
   64.77  	int fd;
   64.78  
   64.79 -	if (!write_ok(info, path))
   64.80 -		return false;
   64.81 -
   64.82  	make_dirs(parent_filename(filename));
   64.83  	fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0600);
   64.84  	if (fd < 0)
   64.85 @@ -358,9 +328,6 @@ static bool file_mkdir(struct file_ops_i
   64.86  {
   64.87  	char *dirname = path_to_name(info, path);
   64.88  
   64.89 -	if (!write_ok(info, path))
   64.90 -		return false;
   64.91 -
   64.92  	make_dirs(parent_filename(dirname));
   64.93  	if (mkdir(dirname, 0700) != 0)
   64.94  		return (errno == EEXIST);
   64.95 @@ -374,20 +341,12 @@ static bool file_rm(struct file_ops_info
   64.96  	char *filename = path_to_name(info, path);
   64.97  	struct stat st;
   64.98  
   64.99 -	if (info->transact && streq(info->transact, path)) {
  64.100 -		errno = EINVAL;
  64.101 -		return false;
  64.102 -	}
  64.103 -
  64.104  	if (lstat(filename, &st) != 0) {
  64.105  		if (lstat(parent_filename(filename), &st) != 0)
  64.106  			return false;
  64.107  		return true;
  64.108  	}
  64.109  
  64.110 -	if (!write_ok(info, path))
  64.111 -		return false;
  64.112 -
  64.113  	if (streq(path, "/")) {
  64.114  		errno = EINVAL;
  64.115  		return false;
  64.116 @@ -398,28 +357,20 @@ static bool file_rm(struct file_ops_info
  64.117  	return true;
  64.118  }
  64.119  
  64.120 -static bool file_transaction_start(struct file_ops_info *info,
  64.121 -				   const char *subtree)
  64.122 +static bool file_transaction_start(struct file_ops_info *info)
  64.123  {
  64.124  	char *cmd;
  64.125 -	char *filename = path_to_name(info, subtree);
  64.126 -	struct stat st;
  64.127  
  64.128 -	if (info->transact) {
  64.129 +	if (info->transact_base) {
  64.130  		errno = EBUSY;
  64.131  		return false;
  64.132  	}
  64.133  
  64.134 -	if (lstat(filename, &st) != 0)
  64.135 -		return false;
  64.136 -
  64.137 -	cmd = talloc_asprintf(NULL, "cp -r %s %s.transact",
  64.138 -			      info->base, info->base);
  64.139 +	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  64.140 +	cmd = talloc_asprintf(NULL, "cp -r %s %s",
  64.141 +			      info->base, info->transact_base);
  64.142  	do_command(cmd);
  64.143  	talloc_free(cmd);
  64.144 -
  64.145 -	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  64.146 -	info->transact = talloc_strdup(NULL, subtree);
  64.147  	return true;
  64.148  }
  64.149  
  64.150 @@ -427,7 +378,7 @@ static bool file_transaction_end(struct 
  64.151  {
  64.152  	char *old, *cmd;
  64.153  
  64.154 -	if (!info->transact) {
  64.155 +	if (!info->transact_base) {
  64.156  		errno = ENOENT;
  64.157  		return false;
  64.158  	}
  64.159 @@ -448,9 +399,7 @@ static bool file_transaction_end(struct 
  64.160  
  64.161  success:
  64.162  	talloc_free(cmd);
  64.163 -	talloc_free(info->transact);
  64.164  	talloc_free(info->transact_base);
  64.165 -	info->transact = NULL;
  64.166  	info->transact_base = NULL;
  64.167  	return true;
  64.168  }
  64.169 @@ -461,7 +410,6 @@ static struct file_ops_info *file_handle
  64.170  
  64.171  	info->base = dir;
  64.172  	info->transact_base = NULL;
  64.173 -	info->transact = NULL;
  64.174  	return info;
  64.175  }
  64.176  
  64.177 @@ -898,11 +846,10 @@ static char *do_next_op(struct ops *ops,
  64.178  	case 7: {
  64.179  		if (verbose)
  64.180  			printf("START %s\n", name);
  64.181 -		ret = bool_to_errstring(ops->transaction_start(h, name));
  64.182 +		ret = bool_to_errstring(ops->transaction_start(h));
  64.183  		if (streq(ret, "OK")) {
  64.184  			talloc_free(ret);
  64.185 -			ret = talloc_asprintf(NULL, "OK:START-TRANSACT:%s",
  64.186 -					      name);
  64.187 +			ret = talloc_asprintf(NULL, "OK:START-TRANSACT");
  64.188  		}
  64.189  
  64.190  		break;
  64.191 @@ -978,6 +925,8 @@ static void setup_file_ops(const char *d
  64.192  		barf_perror("Creating directory %s/tool", dir);
  64.193  	if (!file_set_perms(h, talloc_strdup(h, "/"), &perm, 1))
  64.194  		barf_perror("Setting root perms in %s", dir);
  64.195 +	if (!file_set_perms(h, talloc_strdup(h, "/tool"), &perm, 1))
  64.196 +		barf_perror("Setting root perms in %s/tool", dir);
  64.197  	file_close(h);
  64.198  }
  64.199  
  64.200 @@ -1071,7 +1020,7 @@ static unsigned int try_simple(const boo
  64.201  			goto out;
  64.202  
  64.203  		if (!data->fast) {
  64.204 -			if (strstarts(ret, "OK:START-TRANSACT:")) {
  64.205 +			if (streq(ret, "OK:START-TRANSACT")) {
  64.206  				void *pre = data->ops->handle(data->dir);
  64.207  
  64.208  				snapshot = dump(data->ops, pre);
  64.209 @@ -1303,7 +1252,7 @@ static unsigned int try_diff(const bool 
  64.210  			     void *_data)
  64.211  {
  64.212  	void *fileh, *xsh;
  64.213 -	char *transact = NULL;
  64.214 +	bool transact = false;
  64.215  	struct ops *fail;
  64.216  	struct diff_data *data = _data;
  64.217  	unsigned int i, print;
  64.218 @@ -1348,13 +1297,9 @@ static unsigned int try_diff(const bool 
  64.219  			goto out;
  64.220  
  64.221  		if (strstarts(file, "OK:START-TRANSACT:"))
  64.222 -			transact = talloc_strdup(NULL,
  64.223 -						 file +
  64.224 -						 strlen("OK:START-TRANSACT:"));
  64.225 -		else if (streq(file, "OK:STOP-TRANSACT")) {
  64.226 -			talloc_free(transact);
  64.227 -			transact = NULL;
  64.228 -		}
  64.229 +			transact = true;
  64.230 +		else if (streq(file, "OK:STOP-TRANSACT"))
  64.231 +			transact = false;
  64.232  
  64.233  		talloc_free(file);
  64.234  		talloc_free(xs);
  64.235 @@ -1379,7 +1324,7 @@ static unsigned int try_diff(const bool 
  64.236  
  64.237  			fail = NULL;
  64.238  			if (!ops_equal(&xs_ops, xsh_pre, &file_ops, fileh_pre,
  64.239 -				       transact, &fail)) {
  64.240 +				       "/", &fail)) {
  64.241  				if (fail)
  64.242  					barf("%s failed during transact\n",
  64.243  					     fail->name);
  64.244 @@ -1456,9 +1401,6 @@ static unsigned int try_fail(const bool 
  64.245  	fileh = file_handle(data->dir);
  64.246  	xsh = xs_handle(data->dir);
  64.247  
  64.248 -	sprintf(seed, "%i", data->seed);
  64.249 -	free(xs_debug_command(xsh, "failtest", seed, strlen(seed)+1));
  64.250 -
  64.251  	print = number / 76;
  64.252  	if (!print)
  64.253  		print = 1;
  64.254 @@ -1491,8 +1433,12 @@ static unsigned int try_fail(const bool 
  64.255  		if (trymap && !trymap[i])
  64.256  			continue;
  64.257  
  64.258 +		/* Turn on failure. */
  64.259 +		sprintf(seed, "%i", data->seed + i);
  64.260 +		free(xs_debug_command(xsh, "failtest",seed,strlen(seed)+1));
  64.261 +
  64.262  		if (verbose)
  64.263 -			printf("(%i) ", i);
  64.264 +			printf("(%i) seed %s ", i, seed);
  64.265  		ret = do_next_op(&xs_ops, xsh, i + data->seed, verbose);
  64.266  		if (streq(ret, "FAILED:Connection reset by peer")
  64.267  		    || streq(ret, "FAILED:Bad file descriptor")
  64.268 @@ -1549,8 +1495,6 @@ static unsigned int try_fail(const bool 
  64.269  		fail = NULL;
  64.270  		if (!ops_equal(&xs_ops, tmpxsh, &file_ops, tmpfileh, "/",
  64.271  			       &fail)) {
  64.272 -			xs_close(tmpxsh);
  64.273 -			file_close(tmpfileh);
  64.274  			if (fail) {
  64.275  				if (verbose)
  64.276  					printf("%s failed\n", fail->name);
  64.277 @@ -1561,10 +1505,16 @@ static unsigned int try_fail(const bool 
  64.278  				failed = 0;
  64.279  				if (verbose)
  64.280  					printf("(Looks like it succeeded)\n");
  64.281 +				xs_close(tmpxsh);
  64.282 +				file_close(tmpfileh);
  64.283  				goto try_applying;
  64.284  			}
  64.285  			if (verbose)
  64.286 -				printf("Two backends not equal\n");
  64.287 +				printf("Trees differ:\nXS:%s\nFILE:%s\n",
  64.288 +				       dump(&xs_ops, tmpxsh),
  64.289 +				       dump(&file_ops, tmpfileh));
  64.290 +			xs_close(tmpxsh);
  64.291 +			file_close(tmpfileh);
  64.292  			goto out;
  64.293  		}
  64.294  
  64.295 @@ -1572,8 +1522,6 @@ static unsigned int try_fail(const bool 
  64.296  		if (!xsh)
  64.297  			file_transaction_end(fileh, true);
  64.298  
  64.299 -		/* Turn failures back on. */
  64.300 -		free(xs_debug_command(tmpxsh, "failtest",  NULL, 0));
  64.301  		xs_close(tmpxsh);
  64.302  		file_close(tmpfileh);
  64.303  	}
    65.1 --- a/tools/xenstore/xs_stress.c	Fri Sep 23 15:41:28 2005 -0600
    65.2 +++ b/tools/xenstore/xs_stress.c	Mon Sep 26 11:07:49 2005 -0600
    65.3 @@ -8,6 +8,7 @@
    65.4  #include <sys/stat.h>
    65.5  #include <fcntl.h>
    65.6  #include <unistd.h>
    65.7 +#include <errno.h>
    65.8  
    65.9  #define NUM_HANDLES 2
   65.10  #define DIR_FANOUT 3
   65.11 @@ -36,24 +37,18 @@ static void work(unsigned int cycles, un
   65.12  
   65.13  	srandom(childnum);
   65.14  	for (i = 0; i < cycles; i++) {
   65.15 -		unsigned int lockdepth, j, len;
   65.16 -		char file[100] = "", lockdir[100];
   65.17 +		unsigned int j, len;
   65.18 +		char file[100] = "";
   65.19  		char *contents, tmp[100];
   65.20  		struct xs_handle *h = handles[random() % NUM_HANDLES];
   65.21  
   65.22 -		lockdepth = random() % DIR_DEPTH;
   65.23 -		for (j = 0; j < DIR_DEPTH; j++) {
   65.24 -			if (j == lockdepth)
   65.25 -				strcpy(lockdir, file);
   65.26 +		for (j = 0; j < DIR_DEPTH; j++)
   65.27  			sprintf(file + strlen(file), "/%li",
   65.28  				random()%DIR_FANOUT);
   65.29 -		}
   65.30 -		if (streq(lockdir, ""))
   65.31 -			strcpy(lockdir, "/");
   65.32  
   65.33 -		if (!xs_transaction_start(h, lockdir))
   65.34 -			barf_perror("%i: starting transaction %i on %s",
   65.35 -				    childnum, i, lockdir);
   65.36 +		if (!xs_transaction_start(h))
   65.37 +			barf_perror("%i: starting transaction %i",
   65.38 +				    childnum, i);
   65.39  
   65.40  		sprintf(file + strlen(file), "/count");
   65.41  		contents = xs_read(h, file, &len);
   65.42 @@ -68,18 +63,23 @@ static void work(unsigned int cycles, un
   65.43  		/* Abandon 1 in 10 */
   65.44  		if (random() % 10 == 0) {
   65.45  			if (!xs_transaction_end(h, true))
   65.46 -				barf_perror("%i: can't abort transact %s",
   65.47 -					    childnum, lockdir);
   65.48 +				barf_perror("%i: can't abort transact",
   65.49 +					    childnum);
   65.50  			i--;
   65.51  		} else {
   65.52 -			if (!xs_transaction_end(h, false))
   65.53 -				barf_perror("%i: can't commit transact %s",
   65.54 -					    childnum, lockdir);
   65.55 -
   65.56 -			/* Offset when we print . so kids don't all
   65.57 -			 * print at once. */
   65.58 -			if ((i + print/(childnum+1)) % print == 0)
   65.59 -				write(STDOUT_FILENO, &id, 1);
   65.60 +			if (!xs_transaction_end(h, false)) {
   65.61 +				if (errno == EAGAIN) {
   65.62 +					write(STDOUT_FILENO, "!", 1);
   65.63 +					i--;
   65.64 +				} else
   65.65 +					barf_perror("%i: can't commit trans",
   65.66 +						    childnum);
   65.67 +			} else {
   65.68 +				/* Offset when we print . so kids don't all
   65.69 +				 * print at once. */
   65.70 +				if ((i + print/(childnum+1)) % print == 0)
   65.71 +					write(STDOUT_FILENO, &id, 1);
   65.72 +			}
   65.73  		}
   65.74  	}
   65.75  }
   65.76 @@ -201,7 +201,7 @@ int main(int argc, char *argv[])
   65.77  	printf("\nCounting results...\n");
   65.78  	i = tally_counts();
   65.79  	if (i != (unsigned)atoi(argv[1]))
   65.80 -		barf("Total counts %i not %s", i, atoi(argv[1]));
   65.81 +		barf("Total counts %i not %s", i, argv[1]);
   65.82  	printf("Success!\n");
   65.83  	exit(0);
   65.84  }
    66.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    66.2 +++ b/tools/xenstore/xs_tdb_dump.c	Mon Sep 26 11:07:49 2005 -0600
    66.3 @@ -0,0 +1,82 @@
    66.4 +/* Simple program to dump out all records of TDB */
    66.5 +#include <stdint.h>
    66.6 +#include <stdlib.h>
    66.7 +#include <fcntl.h>
    66.8 +#include <stdio.h>
    66.9 +#include <stdarg.h>
   66.10 +
   66.11 +#include "xs_lib.h"
   66.12 +#include "tdb.h"
   66.13 +#include "talloc.h"
   66.14 +#include "utils.h"
   66.15 +
   66.16 +struct record_hdr {
   66.17 +	u32 num_perms;
   66.18 +	u32 datalen;
   66.19 +	u32 childlen;
   66.20 +	struct xs_permissions perms[0];
   66.21 +};
   66.22 +
   66.23 +static u32 total_size(struct record_hdr *hdr)
   66.24 +{
   66.25 +	return sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) 
   66.26 +		+ hdr->datalen + hdr->childlen;
   66.27 +}
   66.28 +
   66.29 +static char perm_to_char(enum xs_perm_type perm)
   66.30 +{
   66.31 +	return perm == XS_PERM_READ ? 'r' :
   66.32 +		perm == XS_PERM_WRITE ? 'w' :
   66.33 +		perm == XS_PERM_NONE ? '-' :
   66.34 +		perm == (XS_PERM_READ|XS_PERM_WRITE) ? 'b' :
   66.35 +		'?';
   66.36 +}
   66.37 +
   66.38 +int main(int argc, char *argv[])
   66.39 +{
   66.40 +	TDB_DATA key;
   66.41 +	TDB_CONTEXT *tdb;
   66.42 +
   66.43 +	if (argc != 2)
   66.44 +		barf("Usage: xs_tdb_dump <tdbfile>");
   66.45 +
   66.46 +	tdb = tdb_open(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0);
   66.47 +	if (!tdb)
   66.48 +		barf_perror("Could not open %s", argv[1]);
   66.49 +
   66.50 +	key = tdb_firstkey(tdb);
   66.51 +	while (key.dptr) {
   66.52 +		TDB_DATA data;
   66.53 +		struct record_hdr *hdr;
   66.54 +
   66.55 +		data = tdb_fetch(tdb, key);
   66.56 +		hdr = (void *)data.dptr;
   66.57 +		if (data.dsize < sizeof(*hdr))
   66.58 +			fprintf(stderr, "%.*s: BAD truncated\n",
   66.59 +				key.dsize, key.dptr);
   66.60 +		else if (data.dsize != total_size(hdr))
   66.61 +			fprintf(stderr, "%.*s: BAD length %i for %i/%i/%i (%i)\n",
   66.62 +				key.dsize, key.dptr, data.dsize,
   66.63 +				hdr->num_perms, hdr->datalen,
   66.64 +				hdr->childlen, total_size(hdr));
   66.65 +		else {
   66.66 +			unsigned int i;
   66.67 +			char *p;
   66.68 +
   66.69 +			printf("%.*s: ", key.dsize, key.dptr);
   66.70 +			for (i = 0; i < hdr->num_perms; i++)
   66.71 +				printf("%s%c%i",
   66.72 +				       i == 0 ? "" : ",",
   66.73 +				       perm_to_char(hdr->perms[i].perms),
   66.74 +				       hdr->perms[i].id);
   66.75 +			p = (void *)&hdr->perms[hdr->num_perms];
   66.76 +			printf(" %.*s\n", hdr->datalen, p);
   66.77 +			p += hdr->datalen;
   66.78 +			for (i = 0; i < hdr->childlen; i += strlen(p+i)+1)
   66.79 +				printf("\t-> %s\n", p+i);
   66.80 +		}
   66.81 +		key = tdb_nextkey(tdb, key);
   66.82 +	}
   66.83 +	return 0;
   66.84 +}
   66.85 +
    67.1 --- a/tools/xenstore/xs_test.c	Fri Sep 23 15:41:28 2005 -0600
    67.2 +++ b/tools/xenstore/xs_test.c	Mon Sep 26 11:07:49 2005 -0600
    67.3 @@ -562,9 +562,9 @@ static void do_unwatch(unsigned int hand
    67.4  		failed(handle);
    67.5  }
    67.6  
    67.7 -static void do_start(unsigned int handle, const char *node)
    67.8 +static void do_start(unsigned int handle)
    67.9  {
   67.10 -	if (!xs_transaction_start(handles[handle], node))
   67.11 +	if (!xs_transaction_start(handles[handle]))
   67.12  		failed(handle);
   67.13  }
   67.14  
   67.15 @@ -791,7 +791,7 @@ static void do_command(unsigned int defa
   67.16  		xs_daemon_close(handles[handle]);
   67.17  		handles[handle] = NULL;
   67.18  	} else if (streq(command, "start"))
   67.19 -		do_start(handle, arg(line, 1));
   67.20 +		do_start(handle);
   67.21  	else if (streq(command, "commit"))
   67.22  		do_end(handle, false);
   67.23  	else if (streq(command, "abort"))
    68.1 --- a/xen/arch/x86/mm.c	Fri Sep 23 15:41:28 2005 -0600
    68.2 +++ b/xen/arch/x86/mm.c	Mon Sep 26 11:07:49 2005 -0600
    68.3 @@ -2273,8 +2273,7 @@ int do_mmu_update(
    68.4  
    68.5  
    68.6  int update_grant_pte_mapping(
    68.7 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
    68.8 -    struct domain *d, struct vcpu *v)
    68.9 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v)
   68.10  {
   68.11      int rc = GNTST_okay;
   68.12      void *va;
   68.13 @@ -2282,6 +2281,7 @@ int update_grant_pte_mapping(
   68.14      struct pfn_info *page;
   68.15      u32 type_info;
   68.16      l1_pgentry_t ol1e;
   68.17 +    struct domain *d = v->domain;
   68.18  
   68.19      ASSERT(spin_is_locked(&d->big_lock));
   68.20      ASSERT(!shadow_mode_refcounts(d));
   68.21 @@ -2319,8 +2319,6 @@ int update_grant_pte_mapping(
   68.22  
   68.23      put_page_from_l1e(ol1e, d);
   68.24  
   68.25 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_all : GNTST_okay;
   68.26 -
   68.27      if ( unlikely(shadow_mode_enabled(d)) )
   68.28      {
   68.29          struct domain_mmap_cache sh_mapcache;
   68.30 @@ -2415,10 +2413,10 @@ int clear_grant_pte_mapping(
   68.31  
   68.32  
   68.33  int update_grant_va_mapping(
   68.34 -    unsigned long va, l1_pgentry_t _nl1e, struct domain *d, struct vcpu *v)
   68.35 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v)
   68.36  {
   68.37 -    int rc = GNTST_okay;
   68.38      l1_pgentry_t *pl1e, ol1e;
   68.39 +    struct domain *d = v->domain;
   68.40      
   68.41      ASSERT(spin_is_locked(&d->big_lock));
   68.42      ASSERT(!shadow_mode_refcounts(d));
   68.43 @@ -2439,12 +2437,10 @@ int update_grant_va_mapping(
   68.44  
   68.45      put_page_from_l1e(ol1e, d);
   68.46  
   68.47 -    rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_one : GNTST_okay;
   68.48 -
   68.49      if ( unlikely(shadow_mode_enabled(d)) )
   68.50          shadow_do_update_va_mapping(va, _nl1e, v);
   68.51  
   68.52 -    return rc;
   68.53 +    return GNTST_okay;
   68.54  }
   68.55  
   68.56  int clear_grant_va_mapping(unsigned long addr, unsigned long frame)
    69.1 --- a/xen/arch/x86/vmx_vmcs.c	Fri Sep 23 15:41:28 2005 -0600
    69.2 +++ b/xen/arch/x86/vmx_vmcs.c	Mon Sep 26 11:07:49 2005 -0600
    69.3 @@ -37,19 +37,19 @@
    69.4  #endif
    69.5  #ifdef CONFIG_VMX
    69.6  
    69.7 -struct vmcs_struct *alloc_vmcs(void) 
    69.8 +struct vmcs_struct *alloc_vmcs(void)
    69.9  {
   69.10      struct vmcs_struct *vmcs;
   69.11      u32 vmx_msr_low, vmx_msr_high;
   69.12  
   69.13      rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
   69.14      vmcs_size = vmx_msr_high & 0x1fff;
   69.15 -    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size)); 
   69.16 +    vmcs = alloc_xenheap_pages(get_order_from_bytes(vmcs_size));
   69.17      memset((char *)vmcs, 0, vmcs_size); /* don't remove this */
   69.18  
   69.19      vmcs->vmcs_revision_id = vmx_msr_low;
   69.20      return vmcs;
   69.21 -} 
   69.22 +}
   69.23  
   69.24  void free_vmcs(struct vmcs_struct *vmcs)
   69.25  {
   69.26 @@ -65,7 +65,7 @@ static inline int construct_vmcs_control
   69.27      void *io_bitmap_a;
   69.28      void *io_bitmap_b;
   69.29  
   69.30 -    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, 
   69.31 +    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
   69.32                         MONITOR_PIN_BASED_EXEC_CONTROLS);
   69.33  
   69.34      error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
   69.35 @@ -73,8 +73,8 @@ static inline int construct_vmcs_control
   69.36      error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
   69.37  
   69.38      /* need to use 0x1000 instead of PAGE_SIZE */
   69.39 -    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   69.40 -    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000)); 
   69.41 +    io_bitmap_a = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   69.42 +    io_bitmap_b = (void*) alloc_xenheap_pages(get_order_from_bytes(0x1000));
   69.43      memset(io_bitmap_a, 0xff, 0x1000);
   69.44      /* don't bother debug port access */
   69.45      clear_bit(PC_DEBUG_PORT, io_bitmap_a);
   69.46 @@ -89,8 +89,10 @@ static inline int construct_vmcs_control
   69.47      return error;
   69.48  }
   69.49  
   69.50 -#define GUEST_SEGMENT_LIMIT     0xffffffff      
   69.51 -#define HOST_SEGMENT_LIMIT      0xffffffff      
   69.52 +#define GUEST_LAUNCH_DS         0x08
   69.53 +#define GUEST_LAUNCH_CS         0x10
   69.54 +#define GUEST_SEGMENT_LIMIT     0xffffffff
   69.55 +#define HOST_SEGMENT_LIMIT      0xffffffff
   69.56  
   69.57  struct host_execution_env {
   69.58      /* selectors */
   69.59 @@ -110,72 +112,76 @@ struct host_execution_env {
   69.60      unsigned long tr_base;
   69.61      unsigned long ds_base;
   69.62      unsigned long cs_base;
   69.63 -#ifdef __x86_64__ 
   69.64 -    unsigned long fs_base; 
   69.65 -    unsigned long gs_base; 
   69.66 -#endif 
   69.67 +#ifdef __x86_64__
   69.68 +    unsigned long fs_base;
   69.69 +    unsigned long gs_base;
   69.70 +#endif
   69.71  };
   69.72  
   69.73 -#define round_pgdown(_p) ((_p)&PAGE_MASK) /* coped from domain.c */
   69.74 -
   69.75 -int vmx_setup_platform(struct vcpu *d, struct cpu_user_regs *regs)
   69.76 +static void vmx_setup_platform(struct vcpu *v, struct cpu_user_regs *regs)
   69.77  {
   69.78      int i;
   69.79 -    unsigned int n;
   69.80 -    unsigned long *p, mpfn, offset, addr;
   69.81 -    struct e820entry *e820p;
   69.82 +    unsigned char e820_map_nr;
   69.83 +    struct e820entry *e820entry;
   69.84 +    unsigned char *p;
   69.85 +    unsigned long mpfn;
   69.86      unsigned long gpfn = 0;
   69.87  
   69.88      local_flush_tlb_pge();
   69.89 -    regs->ebx = 0;   /* Linux expects ebx to be 0 for boot proc */
   69.90  
   69.91 -    n = regs->ecx;
   69.92 -    if (n > 32) {
   69.93 -        VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d", n);
   69.94 -        return -1;
   69.95 +    mpfn = get_mfn_from_pfn(E820_MAP_PAGE >> PAGE_SHIFT);
   69.96 +    if (mpfn == INVALID_MFN) {
   69.97 +        printk("Can not find E820 memory map page for VMX domain.\n");
   69.98 +        domain_crash();
   69.99      }
  69.100  
  69.101 -    addr = regs->edi;
  69.102 -    offset = (addr & ~PAGE_MASK);
  69.103 -    addr = round_pgdown(addr);
  69.104 -
  69.105 -    mpfn = get_mfn_from_pfn(addr >> PAGE_SHIFT);
  69.106      p = map_domain_page(mpfn);
  69.107 -
  69.108 -    e820p = (struct e820entry *) ((unsigned long) p + offset); 
  69.109 +    if (p == NULL) {
  69.110 +        printk("Can not map E820 memory map page for VMX domain.\n");
  69.111 +        domain_crash();
  69.112 +    }
  69.113  
  69.114 -#ifndef NDEBUG
  69.115 -    print_e820_memory_map(e820p, n);
  69.116 -#endif
  69.117 +    e820_map_nr = *(p + E820_MAP_NR_OFFSET);
  69.118 +    e820entry = (struct e820entry *)(p + E820_MAP_OFFSET);
  69.119  
  69.120 -    for ( i = 0; i < n; i++ )
  69.121 +    for ( i = 0; i < e820_map_nr; i++ )
  69.122      {
  69.123 -        if ( e820p[i].type == E820_SHARED_PAGE )
  69.124 +        if (e820entry[i].type == E820_SHARED_PAGE)
  69.125          {
  69.126 -            gpfn = (e820p[i].addr >> PAGE_SHIFT);
  69.127 +            gpfn = (e820entry[i].addr >> PAGE_SHIFT);
  69.128              break;
  69.129          }
  69.130      }
  69.131  
  69.132 -    if ( gpfn == 0 )
  69.133 -    {
  69.134 -        unmap_domain_page(p);        
  69.135 -        return -1;
  69.136 -    }   
  69.137 +    if ( gpfn == 0 ) {
  69.138 +        printk("Can not get io request shared page"
  69.139 +               " from E820 memory map for VMX domain.\n");
  69.140 +        unmap_domain_page(p);
  69.141 +        domain_crash();
  69.142 +    }
  69.143 +    unmap_domain_page(p);
  69.144  
  69.145 -    unmap_domain_page(p);        
  69.146 +    if (v->vcpu_id)
  69.147 +        return;
  69.148  
  69.149      /* Initialise shared page */
  69.150      mpfn = get_mfn_from_pfn(gpfn);
  69.151 -    p = map_domain_page(mpfn);
  69.152 -    d->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  69.153 +    if (mpfn == INVALID_MFN) {
  69.154 +        printk("Can not find io request shared page for VMX domain.\n");
  69.155 +        domain_crash();
  69.156 +    }
  69.157  
  69.158 -    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(d->domain));
  69.159 +    p = map_domain_page(mpfn);
  69.160 +    if (p == NULL) {
  69.161 +        printk("Can not map io request shared page for VMX domain.\n");
  69.162 +        domain_crash();
  69.163 +    }
  69.164 +    v->domain->arch.vmx_platform.shared_page_va = (unsigned long)p;
  69.165  
  69.166 -    clear_bit(iopacket_port(d->domain), 
  69.167 -              &d->domain->shared_info->evtchn_mask[0]);
  69.168 +    VMX_DBG_LOG(DBG_LEVEL_1, "eport: %x\n", iopacket_port(v->domain));
  69.169  
  69.170 -    return 0;
  69.171 +    clear_bit(iopacket_port(v->domain),
  69.172 +              &v->domain->shared_info->evtchn_mask[0]);
  69.173  }
  69.174  
  69.175  void vmx_set_host_env(struct vcpu *v)
  69.176 @@ -203,7 +209,7 @@ void vmx_set_host_env(struct vcpu *v)
  69.177      error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
  69.178  }
  69.179  
  69.180 -void vmx_do_launch(struct vcpu *v) 
  69.181 +void vmx_do_launch(struct vcpu *v)
  69.182  {
  69.183  /* Update CR3, GDT, LDT, TR */
  69.184      unsigned int  error = 0;
  69.185 @@ -217,7 +223,7 @@ void vmx_do_launch(struct vcpu *v)
  69.186      error |= __vmwrite(GUEST_CR0, cr0);
  69.187      cr0 &= ~X86_CR0_PG;
  69.188      error |= __vmwrite(CR0_READ_SHADOW, cr0);
  69.189 -    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
  69.190 +    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL,
  69.191                         MONITOR_CPU_BASED_EXEC_CONTROLS);
  69.192  
  69.193      __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (cr4) : );
  69.194 @@ -247,7 +253,7 @@ void vmx_do_launch(struct vcpu *v)
  69.195      error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
  69.196      error |= __vmwrite(GUEST_LDTR_BASE, 0);
  69.197      error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
  69.198 -        
  69.199 +
  69.200      error |= __vmwrite(GUEST_TR_BASE, 0);
  69.201      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
  69.202  
  69.203 @@ -261,10 +267,8 @@ void vmx_do_launch(struct vcpu *v)
  69.204  /*
  69.205   * Initially set the same environement as host.
  69.206   */
  69.207 -static inline int 
  69.208 -construct_init_vmcs_guest(struct cpu_user_regs *regs, 
  69.209 -                          struct vcpu_guest_context *ctxt,
  69.210 -                          struct host_execution_env *host_env)
  69.211 +static inline int
  69.212 +construct_init_vmcs_guest(struct cpu_user_regs *regs)
  69.213  {
  69.214      int error = 0;
  69.215      union vmcs_arbytes arbytes;
  69.216 @@ -292,31 +296,37 @@ construct_init_vmcs_guest(struct cpu_use
  69.217      error |= __vmwrite(CR3_TARGET_COUNT, 0);
  69.218  
  69.219      /* Guest Selectors */
  69.220 -    error |= __vmwrite(GUEST_CS_SELECTOR, regs->cs);
  69.221 -    error |= __vmwrite(GUEST_ES_SELECTOR, regs->es);
  69.222 -    error |= __vmwrite(GUEST_SS_SELECTOR, regs->ss);
  69.223 -    error |= __vmwrite(GUEST_DS_SELECTOR, regs->ds);
  69.224 -    error |= __vmwrite(GUEST_FS_SELECTOR, regs->fs);
  69.225 -    error |= __vmwrite(GUEST_GS_SELECTOR, regs->gs);
  69.226 +    error |= __vmwrite(GUEST_ES_SELECTOR, GUEST_LAUNCH_DS);
  69.227 +    error |= __vmwrite(GUEST_SS_SELECTOR, GUEST_LAUNCH_DS);
  69.228 +    error |= __vmwrite(GUEST_DS_SELECTOR, GUEST_LAUNCH_DS);
  69.229 +    error |= __vmwrite(GUEST_FS_SELECTOR, GUEST_LAUNCH_DS);
  69.230 +    error |= __vmwrite(GUEST_GS_SELECTOR, GUEST_LAUNCH_DS);
  69.231 +    error |= __vmwrite(GUEST_CS_SELECTOR, GUEST_LAUNCH_CS);
  69.232 +
  69.233 +    /* Guest segment bases */
  69.234 +    error |= __vmwrite(GUEST_ES_BASE, 0);
  69.235 +    error |= __vmwrite(GUEST_SS_BASE, 0);
  69.236 +    error |= __vmwrite(GUEST_DS_BASE, 0);
  69.237 +    error |= __vmwrite(GUEST_FS_BASE, 0);
  69.238 +    error |= __vmwrite(GUEST_GS_BASE, 0);
  69.239 +    error |= __vmwrite(GUEST_CS_BASE, 0);
  69.240  
  69.241      /* Guest segment Limits */
  69.242 -    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.243      error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT);
  69.244      error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.245      error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.246      error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.247      error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.248 +    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
  69.249  
  69.250 -    error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit);
  69.251 -
  69.252 -    /* AR bytes */
  69.253 +    /* Guest segment AR bytes */
  69.254      arbytes.bytes = 0;
  69.255      arbytes.fields.seg_type = 0x3;          /* type = 3 */
  69.256      arbytes.fields.s = 1;                   /* code or data, i.e. not system */
  69.257      arbytes.fields.dpl = 0;                 /* DPL = 3 */
  69.258      arbytes.fields.p = 1;                   /* segment present */
  69.259      arbytes.fields.default_ops_size = 1;    /* 32-bit */
  69.260 -    arbytes.fields.g = 1;   
  69.261 +    arbytes.fields.g = 1;
  69.262      arbytes.fields.null_bit = 0;            /* not null */
  69.263  
  69.264      error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
  69.265 @@ -328,35 +338,31 @@ construct_init_vmcs_guest(struct cpu_use
  69.266      arbytes.fields.seg_type = 0xb;          /* type = 0xb */
  69.267      error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
  69.268  
  69.269 -    error |= __vmwrite(GUEST_GDTR_BASE, regs->edx);
  69.270 -    regs->edx = 0;
  69.271 -    error |= __vmwrite(GUEST_GDTR_LIMIT, regs->eax);
  69.272 -    regs->eax = 0;
  69.273 +    /* Guest GDT */
  69.274 +    error |= __vmwrite(GUEST_GDTR_BASE, 0);
  69.275 +    error |= __vmwrite(GUEST_GDTR_LIMIT, 0);
  69.276  
  69.277 +    /* Guest IDT */
  69.278 +    error |= __vmwrite(GUEST_IDTR_BASE, 0);
  69.279 +    error |= __vmwrite(GUEST_IDTR_LIMIT, 0);
  69.280 +
  69.281 +    /* Guest LDT & TSS */
  69.282      arbytes.fields.s = 0;                   /* not code or data segement */
  69.283      arbytes.fields.seg_type = 0x2;          /* LTD */
  69.284      arbytes.fields.default_ops_size = 0;    /* 16-bit */
  69.285 -    arbytes.fields.g = 0;   
  69.286 +    arbytes.fields.g = 0;
  69.287      error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
  69.288  
  69.289      arbytes.fields.seg_type = 0xb;          /* 32-bit TSS (busy) */
  69.290      error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
  69.291      /* CR3 is set in vmx_final_setup_guest */
  69.292  
  69.293 -    error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
  69.294 -    error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
  69.295 -    error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
  69.296 -    error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
  69.297 -    error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
  69.298 -    error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
  69.299 -    error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
  69.300 -
  69.301 -    error |= __vmwrite(GUEST_RSP, regs->esp);
  69.302 +    error |= __vmwrite(GUEST_RSP, 0);
  69.303      error |= __vmwrite(GUEST_RIP, regs->eip);
  69.304  
  69.305 +    /* Guest EFLAGS */
  69.306      eflags = regs->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */
  69.307      eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */
  69.308 -
  69.309      error |= __vmwrite(GUEST_RFLAGS, eflags);
  69.310  
  69.311      error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
  69.312 @@ -381,14 +387,14 @@ static inline int construct_vmcs_host(st
  69.313  #if defined (__i386__)
  69.314      error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector);
  69.315      error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector);
  69.316 -    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base); 
  69.317 -    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base); 
  69.318 +    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base);
  69.319 +    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base);
  69.320  
  69.321  #else
  69.322 -    rdmsrl(MSR_FS_BASE, host_env->fs_base); 
  69.323 -    rdmsrl(MSR_GS_BASE, host_env->gs_base); 
  69.324 -    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base); 
  69.325 -    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base); 
  69.326 +    rdmsrl(MSR_FS_BASE, host_env->fs_base);
  69.327 +    rdmsrl(MSR_GS_BASE, host_env->gs_base);
  69.328 +    error |= __vmwrite(HOST_FS_BASE, host_env->fs_base);
  69.329 +    error |= __vmwrite(HOST_GS_BASE, host_env->gs_base);
  69.330  
  69.331  #endif
  69.332      host_env->cs_selector = __HYPERVISOR_CS;
  69.333 @@ -401,16 +407,16 @@ static inline int construct_vmcs_host(st
  69.334      error |= __vmwrite(HOST_CR0, crn); /* same CR0 */
  69.335  
  69.336      /* CR3 is set in vmx_final_setup_hostos */
  69.337 -    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : ); 
  69.338 +    __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : );
  69.339      error |= __vmwrite(HOST_CR4, crn);
  69.340  
  69.341      error |= __vmwrite(HOST_RIP, (unsigned long) vmx_asm_vmexit_handler);
  69.342 -#ifdef __x86_64__ 
  69.343 -    /* TBD: support cr8 for 64-bit guest */ 
  69.344 -    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0); 
  69.345 -    __vmwrite(TPR_THRESHOLD, 0); 
  69.346 -    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0); 
  69.347 -#endif 
  69.348 +#ifdef __x86_64__
  69.349 +    /* TBD: support cr8 for 64-bit guest */
  69.350 +    __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
  69.351 +    __vmwrite(TPR_THRESHOLD, 0);
  69.352 +    __vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
  69.353 +#endif
  69.354  
  69.355      return error;
  69.356  }
  69.357 @@ -440,37 +446,37 @@ int construct_vmcs(struct arch_vmx_struc
  69.358  
  69.359      if ((error = __vmpclear (vmcs_phys_ptr))) {
  69.360          printk("construct_vmcs: VMCLEAR failed\n");
  69.361 -        return -EINVAL;         
  69.362 +        return -EINVAL;
  69.363      }
  69.364      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  69.365          printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n",
  69.366                 (unsigned long) vmcs_phys_ptr);
  69.367 -        return -EINVAL; 
  69.368 +        return -EINVAL;
  69.369      }
  69.370      if ((error = construct_vmcs_controls(arch_vmx))) {
  69.371          printk("construct_vmcs: construct_vmcs_controls failed\n");
  69.372 -        return -EINVAL;         
  69.373 +        return -EINVAL;
  69.374      }
  69.375      /* host selectors */
  69.376      if ((error = construct_vmcs_host(&host_env))) {
  69.377          printk("construct_vmcs: construct_vmcs_host failed\n");
  69.378 -        return -EINVAL;         
  69.379 +        return -EINVAL;
  69.380      }
  69.381      /* guest selectors */
  69.382 -    if ((error = construct_init_vmcs_guest(regs, ctxt, &host_env))) {
  69.383 +    if ((error = construct_init_vmcs_guest(regs))) {
  69.384          printk("construct_vmcs: construct_vmcs_guest failed\n");
  69.385 -        return -EINVAL;         
  69.386 -    }       
  69.387 +        return -EINVAL;
  69.388 +    }
  69.389  
  69.390 -    if ((error |= __vmwrite(EXCEPTION_BITMAP, 
  69.391 +    if ((error |= __vmwrite(EXCEPTION_BITMAP,
  69.392                              MONITOR_DEFAULT_EXCEPTION_BITMAP))) {
  69.393          printk("construct_vmcs: setting Exception bitmap failed\n");
  69.394 -        return -EINVAL;         
  69.395 +        return -EINVAL;
  69.396      }
  69.397  
  69.398      if (regs->eflags & EF_TF)
  69.399          __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  69.400 -    else 
  69.401 +    else
  69.402          __vm_clear_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_DB);
  69.403  
  69.404      return 0;
  69.405 @@ -491,7 +497,7 @@ int modify_vmcs(struct arch_vmx_struct *
  69.406      if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
  69.407          printk("modify_vmcs: load_vmcs failed: VMCS = %lx\n",
  69.408                 (unsigned long) vmcs_phys_ptr);
  69.409 -        return -EINVAL; 
  69.410 +        return -EINVAL;
  69.411      }
  69.412      load_cpu_user_regs(regs);
  69.413  
  69.414 @@ -500,23 +506,23 @@ int modify_vmcs(struct arch_vmx_struct *
  69.415      return 0;
  69.416  }
  69.417  
  69.418 -int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  69.419 +int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  69.420  {
  69.421      int error;
  69.422  
  69.423      if ((error = __vmptrld(phys_ptr))) {
  69.424 -        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  69.425 +        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  69.426          return error;
  69.427      }
  69.428 -    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  69.429 +    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  69.430      return 0;
  69.431  }
  69.432  
  69.433 -int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
  69.434 +int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr)
  69.435  {
  69.436      /* take the current VMCS */
  69.437      __vmptrst(phys_ptr);
  69.438 -    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
  69.439 +    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags);
  69.440      return 0;
  69.441  }
  69.442  
  69.443 @@ -536,7 +542,7 @@ void vm_resume_fail(unsigned long eflags
  69.444      __vmx_bug(guest_cpu_user_regs());
  69.445  }
  69.446  
  69.447 -void arch_vmx_do_resume(struct vcpu *v) 
  69.448 +void arch_vmx_do_resume(struct vcpu *v)
  69.449  {
  69.450      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  69.451  
  69.452 @@ -545,7 +551,7 @@ void arch_vmx_do_resume(struct vcpu *v)
  69.453      reset_stack_and_jump(vmx_asm_do_resume);
  69.454  }
  69.455  
  69.456 -void arch_vmx_do_launch(struct vcpu *v) 
  69.457 +void arch_vmx_do_launch(struct vcpu *v)
  69.458  {
  69.459      u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs);
  69.460  
    70.1 --- a/xen/common/grant_table.c	Fri Sep 23 15:41:28 2005 -0600
    70.2 +++ b/xen/common/grant_table.c	Mon Sep 26 11:07:49 2005 -0600
    70.3 @@ -24,10 +24,6 @@
    70.4   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    70.5   */
    70.6  
    70.7 -#define GRANT_DEBUG 0
    70.8 -#define GRANT_DEBUG_VERBOSE 0
    70.9 -
   70.10 -#include <xen/config.h>
   70.11  #include <xen/lib.h>
   70.12  #include <xen/sched.h>
   70.13  #include <xen/shadow.h>
   70.14 @@ -68,39 +64,32 @@ put_maptrack_handle(
   70.15      t->map_count--;
   70.16  }
   70.17  
   70.18 +/*
   70.19 + * Returns 0 if TLB flush / invalidate required by caller.
   70.20 + * va will indicate the address to be invalidated.
   70.21 + * 
   70.22 + * addr is _either_ a host virtual address, or the address of the pte to
   70.23 + * update, as indicated by the GNTMAP_contains_pte flag.
   70.24 + */
   70.25  static int
   70.26 -__gnttab_activate_grant_ref(
   70.27 -    struct domain   *mapping_d,          /* IN */
   70.28 -    struct vcpu     *mapping_ed,
   70.29 -    struct domain   *granting_d,
   70.30 -    grant_ref_t      ref,
   70.31 -    u16              dev_hst_ro_flags,
   70.32 -    u64              addr,
   70.33 -    unsigned long   *pframe )            /* OUT */
   70.34 +__gnttab_map_grant_ref(
   70.35 +    gnttab_map_grant_ref_t *uop)
   70.36  {
   70.37 -    domid_t               sdom;
   70.38 -    u16                   sflags;
   70.39 +    domid_t        dom;
   70.40 +    grant_ref_t    ref;
   70.41 +    struct domain *ld, *rd;
   70.42 +    struct vcpu   *led;
   70.43 +    u16            dev_hst_ro_flags;
   70.44 +    int            handle;
   70.45 +    u64            addr;
   70.46 +    unsigned long  frame = 0;
   70.47 +    int            rc = GNTST_okay;
   70.48      active_grant_entry_t *act;
   70.49 -    grant_entry_t        *sha;
   70.50 -    s16                   rc = 1;
   70.51 -    unsigned long         frame = 0;
   70.52 -    int                   retries = 0;
   70.53  
   70.54 -    /*
   70.55 -     * Objectives of this function:
   70.56 -     * . Make the record ( granting_d, ref ) active, if not already.
   70.57 -     * . Update shared grant entry of owner, indicating frame is mapped.
   70.58 -     * . Increment the owner act->pin reference counts.
   70.59 -     * . get_page on shared frame if new mapping.
   70.60 -     * . get_page_type if this is first RW mapping of frame.
   70.61 -     * . Add PTE to virtual address space of mapping_d, if necessary.
   70.62 -     * Returns:
   70.63 -     * .  -ve: error
   70.64 -     * .    1: ok
   70.65 -     * .    0: ok and TLB invalidate of host_addr needed.
   70.66 -     *
   70.67 -     * On success, *pframe contains mfn.
   70.68 -     */
   70.69 +    /* Entry details from @rd's shared grant table. */
   70.70 +    grant_entry_t *sha;
   70.71 +    domid_t        sdom;
   70.72 +    u16            sflags;
   70.73  
   70.74      /*
   70.75       * We bound the number of times we retry CMPXCHG on memory locations that
   70.76 @@ -110,11 +99,88 @@ static int
   70.77       * the guest to race our updates (e.g., to change the GTF_readonly flag),
   70.78       * so we allow a few retries before failing.
   70.79       */
   70.80 +    int retries = 0;
   70.81  
   70.82 -    act = &granting_d->grant_table->active[ref];
   70.83 -    sha = &granting_d->grant_table->shared[ref];
   70.84 +    led = current;
   70.85 +    ld = led->domain;
   70.86  
   70.87 -    spin_lock(&granting_d->grant_table->lock);
   70.88 +    /* Bitwise-OR avoids short-circuiting which screws control flow. */
   70.89 +    if ( unlikely(__get_user(dom, &uop->dom) |
   70.90 +                  __get_user(ref, &uop->ref) |
   70.91 +                  __get_user(addr, &uop->host_addr) |
   70.92 +                  __get_user(dev_hst_ro_flags, &uop->flags)) )
   70.93 +    {
   70.94 +        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
   70.95 +        return -EFAULT; /* don't set status */
   70.96 +    }
   70.97 +
   70.98 +    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
   70.99 +         unlikely((dev_hst_ro_flags &
  70.100 +                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  70.101 +    {
  70.102 +        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  70.103 +        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  70.104 +        return GNTST_bad_gntref;
  70.105 +    }
  70.106 +
  70.107 +    if ( acm_pre_grant_map_ref(dom) )
  70.108 +    {
  70.109 +        (void)__put_user(GNTST_permission_denied, &uop->handle);
  70.110 +        return GNTST_permission_denied;
  70.111 +    }
  70.112 +
  70.113 +    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  70.114 +         unlikely(ld == rd) )
  70.115 +    {
  70.116 +        if ( rd != NULL )
  70.117 +            put_domain(rd);
  70.118 +        DPRINTK("Could not find domain %d\n", dom);
  70.119 +        (void)__put_user(GNTST_bad_domain, &uop->handle);
  70.120 +        return GNTST_bad_domain;
  70.121 +    }
  70.122 +
  70.123 +    /* Get a maptrack handle. */
  70.124 +    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  70.125 +    {
  70.126 +        int              i;
  70.127 +        grant_mapping_t *new_mt;
  70.128 +        grant_table_t   *lgt = ld->grant_table;
  70.129 +
  70.130 +        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  70.131 +        {
  70.132 +            put_domain(rd);
  70.133 +            DPRINTK("Maptrack table is at maximum size.\n");
  70.134 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  70.135 +            return GNTST_no_device_space;
  70.136 +        }
  70.137 +
  70.138 +        /* Grow the maptrack table. */
  70.139 +        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  70.140 +        if ( new_mt == NULL )
  70.141 +        {
  70.142 +            put_domain(rd);
  70.143 +            DPRINTK("No more map handles available.\n");
  70.144 +            (void)__put_user(GNTST_no_device_space, &uop->handle);
  70.145 +            return GNTST_no_device_space;
  70.146 +        }
  70.147 +
  70.148 +        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  70.149 +        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  70.150 +            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  70.151 +
  70.152 +        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  70.153 +        lgt->maptrack          = new_mt;
  70.154 +        lgt->maptrack_order   += 1;
  70.155 +        lgt->maptrack_limit  <<= 1;
  70.156 +
  70.157 +        DPRINTK("Doubled maptrack size\n");
  70.158 +        handle = get_maptrack_handle(ld->grant_table);
  70.159 +    }
  70.160 +
  70.161 +    act = &rd->grant_table->active[ref];
  70.162 +    sha = &rd->grant_table->shared[ref];
  70.163 +
  70.164 +    spin_lock(&rd->grant_table->lock);
  70.165  
  70.166      if ( act->pin == 0 )
  70.167      {
  70.168 @@ -132,10 +198,10 @@ static int
  70.169              u32 scombo, prev_scombo, new_scombo;
  70.170  
  70.171              if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) ||
  70.172 -                 unlikely(sdom != mapping_d->domain_id) )
  70.173 +                 unlikely(sdom != led->domain->domain_id) )
  70.174                  PIN_FAIL(unlock_out, GNTST_general_error,
  70.175                           "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
  70.176 -                        sflags, sdom, mapping_d->domain_id);
  70.177 +                        sflags, sdom, led->domain->domain_id);
  70.178  
  70.179              /* Merge two 16-bit values into a 32-bit combined update. */
  70.180              /* NB. Endianness! */
  70.181 @@ -173,12 +239,12 @@ static int
  70.182  
  70.183          /* rmb(); */ /* not on x86 */
  70.184  
  70.185 -        frame = __gpfn_to_mfn_foreign(granting_d, sha->frame);
  70.186 +        frame = __gpfn_to_mfn_foreign(rd, sha->frame);
  70.187  
  70.188          if ( unlikely(!pfn_valid(frame)) ||
  70.189               unlikely(!((dev_hst_ro_flags & GNTMAP_readonly) ?
  70.190 -                        get_page(&frame_table[frame], granting_d) :
  70.191 -                        get_page_and_type(&frame_table[frame], granting_d,
  70.192 +                        get_page(&frame_table[frame], rd) :
  70.193 +                        get_page_and_type(&frame_table[frame], rd,
  70.194                                            PGT_writable_page))) )
  70.195          {
  70.196              clear_bit(_GTF_writing, &sha->flags);
  70.197 @@ -208,10 +274,11 @@ static int
  70.198              PIN_FAIL(unlock_out, ENOSPC,
  70.199                       "Risk of counter overflow %08x\n", act->pin);
  70.200  
  70.201 -        frame = act->frame;
  70.202 +        sflags = sha->flags;
  70.203 +        frame  = act->frame;
  70.204  
  70.205 -        if ( !(dev_hst_ro_flags & GNTMAP_readonly) && 
  70.206 -             !((sflags = sha->flags) & GTF_writing) )
  70.207 +        if ( !(dev_hst_ro_flags & GNTMAP_readonly) &&
  70.208 +             !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) )
  70.209          {
  70.210              for ( ; ; )
  70.211              {
  70.212 @@ -264,9 +331,9 @@ static int
  70.213       * frame contains the mfn.
  70.214       */
  70.215  
  70.216 -    spin_unlock(&granting_d->grant_table->lock);
  70.217 +    spin_unlock(&rd->grant_table->lock);
  70.218  
  70.219 -    if ( (addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) )
  70.220 +    if ( dev_hst_ro_flags & GNTMAP_host_map )
  70.221      {
  70.222          /* Write update into the pagetable. */
  70.223          l1_pgentry_t pte;
  70.224 @@ -278,18 +345,15 @@ static int
  70.225              l1e_add_flags(pte,_PAGE_RW);
  70.226  
  70.227          if ( dev_hst_ro_flags & GNTMAP_contains_pte )
  70.228 -            rc = update_grant_pte_mapping(addr, pte, mapping_d, mapping_ed);
  70.229 +            rc = update_grant_pte_mapping(addr, pte, led);
  70.230          else
  70.231 -            rc = update_grant_va_mapping(addr, pte, mapping_d, mapping_ed);
  70.232 +            rc = update_grant_va_mapping(addr, pte, led);
  70.233  
  70.234 -        /* IMPORTANT: rc indicates the degree of TLB flush that is required.
  70.235 -         * GNTST_flush_one (1) or GNTST_flush_all (2). This is done in the 
  70.236 -         * outer gnttab_map_grant_ref. */
  70.237          if ( rc < 0 )
  70.238          {
  70.239              /* Failure: undo and abort. */
  70.240  
  70.241 -            spin_lock(&granting_d->grant_table->lock);
  70.242 +            spin_lock(&rd->grant_table->lock);
  70.243  
  70.244              if ( dev_hst_ro_flags & GNTMAP_readonly )
  70.245              {
  70.246 @@ -311,186 +375,44 @@ static int
  70.247                  put_page(&frame_table[frame]);
  70.248              }
  70.249  
  70.250 -            spin_unlock(&granting_d->grant_table->lock);
  70.251 +            spin_unlock(&rd->grant_table->lock);
  70.252          }
  70.253 -
  70.254 -    }
  70.255 -
  70.256 -    *pframe = frame;
  70.257 -    return rc;
  70.258 -
  70.259 - unlock_out:
  70.260 -    spin_unlock(&granting_d->grant_table->lock);
  70.261 -    return rc;
  70.262 -}
  70.263 -
  70.264 -/*
  70.265 - * Returns 0 if TLB flush / invalidate required by caller.
  70.266 - * va will indicate the address to be invalidated.
  70.267 - * 
  70.268 - * addr is _either_ a host virtual address, or the address of the pte to
  70.269 - * update, as indicated by the GNTMAP_contains_pte flag.
  70.270 - */
  70.271 -static int
  70.272 -__gnttab_map_grant_ref(
  70.273 -    gnttab_map_grant_ref_t *uop,
  70.274 -    unsigned long *va)
  70.275 -{
  70.276 -    domid_t        dom;
  70.277 -    grant_ref_t    ref;
  70.278 -    struct domain *ld, *rd;
  70.279 -    struct vcpu   *led;
  70.280 -    u16            dev_hst_ro_flags;
  70.281 -    int            handle;
  70.282 -    u64            addr;
  70.283 -    unsigned long  frame = 0;
  70.284 -    int            rc;
  70.285 -
  70.286 -    led = current;
  70.287 -    ld = led->domain;
  70.288 -
  70.289 -    /* Bitwise-OR avoids short-circuiting which screws control flow. */
  70.290 -    if ( unlikely(__get_user(dom, &uop->dom) |
  70.291 -                  __get_user(ref, &uop->ref) |
  70.292 -                  __get_user(addr, &uop->host_addr) |
  70.293 -                  __get_user(dev_hst_ro_flags, &uop->flags)) )
  70.294 -    {
  70.295 -        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
  70.296 -        return -EFAULT; /* don't set status */
  70.297 -    }
  70.298 -
  70.299 -    if ( (dev_hst_ro_flags & GNTMAP_host_map) &&
  70.300 -         ( (addr == 0) ||
  70.301 -           (!(dev_hst_ro_flags & GNTMAP_contains_pte) && 
  70.302 -            unlikely(!__addr_ok(addr))) ) )
  70.303 -    {
  70.304 -        DPRINTK("Bad virtual address (%"PRIx64") or flags (%"PRIx16").\n",
  70.305 -                addr, dev_hst_ro_flags);
  70.306 -        (void)__put_user(GNTST_bad_virt_addr, &uop->handle);
  70.307 -        return GNTST_bad_gntref;
  70.308 -    }
  70.309 -
  70.310 -    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
  70.311 -         unlikely((dev_hst_ro_flags &
  70.312 -                   (GNTMAP_device_map|GNTMAP_host_map)) == 0) )
  70.313 -    {
  70.314 -        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
  70.315 -        (void)__put_user(GNTST_bad_gntref, &uop->handle);
  70.316 -        return GNTST_bad_gntref;
  70.317 -    }
  70.318 -
  70.319 -    if (acm_pre_grant_map_ref(dom)) {
  70.320 -        (void)__put_user(GNTST_permission_denied, &uop->handle);
  70.321 -        return GNTST_permission_denied;
  70.322      }
  70.323  
  70.324 -    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
  70.325 -         unlikely(ld == rd) )
  70.326 -    {
  70.327 -        if ( rd != NULL )
  70.328 -            put_domain(rd);
  70.329 -        DPRINTK("Could not find domain %d\n", dom);
  70.330 -        (void)__put_user(GNTST_bad_domain, &uop->handle);
  70.331 -        return GNTST_bad_domain;
  70.332 -    }
  70.333 -
  70.334 -    /* Get a maptrack handle. */
  70.335 -    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
  70.336 -    {
  70.337 -        int              i;
  70.338 -        grant_mapping_t *new_mt;
  70.339 -        grant_table_t   *lgt = ld->grant_table;
  70.340 -
  70.341 -        if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES )
  70.342 -        {
  70.343 -            put_domain(rd);
  70.344 -            DPRINTK("Maptrack table is at maximum size.\n");
  70.345 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  70.346 -            return GNTST_no_device_space;
  70.347 -        }
  70.348 -
  70.349 -        /* Grow the maptrack table. */
  70.350 -        new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1);
  70.351 -        if ( new_mt == NULL )
  70.352 -        {
  70.353 -            put_domain(rd);
  70.354 -            DPRINTK("No more map handles available.\n");
  70.355 -            (void)__put_user(GNTST_no_device_space, &uop->handle);
  70.356 -            return GNTST_no_device_space;
  70.357 -        }
  70.358 -
  70.359 -        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
  70.360 -        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
  70.361 -            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;
  70.362 +    ld->grant_table->maptrack[handle].domid         = dom;
  70.363 +    ld->grant_table->maptrack[handle].ref_and_flags =
  70.364 +        (ref << MAPTRACK_REF_SHIFT) |
  70.365 +        (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  70.366  
  70.367 -        free_xenheap_pages(lgt->maptrack, lgt->maptrack_order);
  70.368 -        lgt->maptrack          = new_mt;
  70.369 -        lgt->maptrack_order   += 1;
  70.370 -        lgt->maptrack_limit  <<= 1;
  70.371 -
  70.372 -        DPRINTK("Doubled maptrack size\n");
  70.373 -        handle = get_maptrack_handle(ld->grant_table);
  70.374 -    }
  70.375 -
  70.376 -#if GRANT_DEBUG_VERBOSE
  70.377 -    DPRINTK("Mapping grant ref (%hu) for domain (%hu) with flags (%x)\n",
  70.378 -            ref, dom, dev_hst_ro_flags);
  70.379 -#endif
  70.380 -
  70.381 -    if ( (rc = __gnttab_activate_grant_ref(ld, led, rd, ref, dev_hst_ro_flags,
  70.382 -                                           addr, &frame)) >= 0 )
  70.383 -    {
  70.384 -        /*
  70.385 -         * Only make the maptrack live _after_ writing the pte, in case we 
  70.386 -         * overwrite the same frame number, causing a maptrack walk to find it
  70.387 -         */
  70.388 -        ld->grant_table->maptrack[handle].domid = dom;
  70.389 -
  70.390 -        ld->grant_table->maptrack[handle].ref_and_flags
  70.391 -            = (ref << MAPTRACK_REF_SHIFT) |
  70.392 -              (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);
  70.393 -
  70.394 -        (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  70.395 -
  70.396 -        if ( ( dev_hst_ro_flags & GNTMAP_host_map ) &&
  70.397 -             !( dev_hst_ro_flags & GNTMAP_contains_pte) )
  70.398 -            *va = addr;
  70.399 -
  70.400 -        (void)__put_user(handle, &uop->handle);
  70.401 -    }
  70.402 -    else
  70.403 -    {
  70.404 -        (void)__put_user(rc, &uop->handle);
  70.405 -        put_maptrack_handle(ld->grant_table, handle);
  70.406 -    }
  70.407 +    (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr);
  70.408 +    (void)__put_user(handle, &uop->handle);
  70.409  
  70.410      put_domain(rd);
  70.411      return rc;
  70.412 +
  70.413 +
  70.414 + unlock_out:
  70.415 +    spin_unlock(&rd->grant_table->lock);
  70.416 +    (void)__put_user(rc, &uop->handle);
  70.417 +    put_maptrack_handle(ld->grant_table, handle);
  70.418 +    return rc;
  70.419  }
  70.420  
  70.421  static long
  70.422  gnttab_map_grant_ref(
  70.423      gnttab_map_grant_ref_t *uop, unsigned int count)
  70.424  {
  70.425 -    int i, rc, flush = 0;
  70.426 -    unsigned long va = 0;
  70.427 +    int i;
  70.428  
  70.429      for ( i = 0; i < count; i++ )
  70.430 -        if ( (rc =__gnttab_map_grant_ref(&uop[i], &va)) >= 0 )
  70.431 -            flush += rc;
  70.432 -
  70.433 -    if ( flush == 1 )
  70.434 -        flush_tlb_one_mask(current->domain->cpumask, va);
  70.435 -    else if ( flush != 0 ) 
  70.436 -        flush_tlb_mask(current->domain->cpumask);
  70.437 +        (void)__gnttab_map_grant_ref(&uop[i]);
  70.438  
  70.439      return 0;
  70.440  }
  70.441  
  70.442  static int
  70.443  __gnttab_unmap_grant_ref(
  70.444 -    gnttab_unmap_grant_ref_t *uop,
  70.445 -    unsigned long *va)
  70.446 +    gnttab_unmap_grant_ref_t *uop)
  70.447  {
  70.448      domid_t          dom;
  70.449      grant_ref_t      ref;
  70.450 @@ -500,7 +422,7 @@ static int
  70.451      grant_entry_t   *sha;
  70.452      grant_mapping_t *map;
  70.453      u16              flags;
  70.454 -    s16              rc = 1;
  70.455 +    s16              rc = 0;
  70.456      u64              addr, dev_bus_addr;
  70.457      unsigned long    frame;
  70.458  
  70.459 @@ -541,11 +463,6 @@ static int
  70.460          return GNTST_bad_domain;
  70.461      }
  70.462  
  70.463 -#if GRANT_DEBUG_VERBOSE
  70.464 -    DPRINTK("Unmapping grant ref (%hu) for domain (%hu) with handle (%hu)\n",
  70.465 -            ref, dom, handle);
  70.466 -#endif
  70.467 -
  70.468      act = &rd->grant_table->active[ref];
  70.469      sha = &rd->grant_table->shared[ref];
  70.470  
  70.471 @@ -566,8 +483,6 @@ static int
  70.472  
  70.473          map->ref_and_flags &= ~GNTMAP_device_map;
  70.474          (void)__put_user(0, &uop->dev_bus_addr);
  70.475 -
  70.476 -        /* Frame is now unmapped for device access. */
  70.477      }
  70.478  
  70.479      if ( (addr != 0) &&
  70.480 @@ -589,10 +504,6 @@ static int
  70.481  
  70.482          act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc
  70.483                                                : GNTPIN_hstw_inc;
  70.484 -
  70.485 -        rc = 0;
  70.486 -        if ( !( flags & GNTMAP_contains_pte) )
  70.487 -            *va = addr;
  70.488      }
  70.489  
  70.490      if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0)
  70.491 @@ -632,17 +543,12 @@ static long
  70.492  gnttab_unmap_grant_ref(
  70.493      gnttab_unmap_grant_ref_t *uop, unsigned int count)
  70.494  {
  70.495 -    int i, flush = 0;
  70.496 -    unsigned long va = 0;
  70.497 +    int i;
  70.498  
  70.499      for ( i = 0; i < count; i++ )
  70.500 -        if ( __gnttab_unmap_grant_ref(&uop[i], &va) == 0 )
  70.501 -            flush++;
  70.502 +        (void)__gnttab_unmap_grant_ref(&uop[i]);
  70.503  
  70.504 -    if ( flush == 1 )
  70.505 -        flush_tlb_one_mask(current->domain->cpumask, va);
  70.506 -    else if ( flush != 0 ) 
  70.507 -        flush_tlb_mask(current->domain->cpumask);
  70.508 +    flush_tlb_mask(current->domain->cpumask);
  70.509  
  70.510      return 0;
  70.511  }
  70.512 @@ -703,9 +609,9 @@ gnttab_setup_table(
  70.513      return 0;
  70.514  }
  70.515  
  70.516 -#if GRANT_DEBUG
  70.517  static int
  70.518 -gnttab_dump_table(gnttab_dump_table_t *uop)
  70.519 +gnttab_dump_table(
  70.520 +    gnttab_dump_table_t *uop)
  70.521  {
  70.522      grant_table_t        *gt;
  70.523      gnttab_dump_table_t   op;
  70.524 @@ -716,6 +622,8 @@ gnttab_dump_table(gnttab_dump_table_t *u
  70.525      grant_mapping_t      *maptrack;
  70.526      int                   i;
  70.527  
  70.528 +    if ( !IS_PRIV(current->domain) )
  70.529 +        return -EPERM;
  70.530  
  70.531      if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
  70.532      {
  70.533 @@ -724,9 +632,7 @@ gnttab_dump_table(gnttab_dump_table_t *u
  70.534      }
  70.535  
  70.536      if ( op.dom == DOMID_SELF )
  70.537 -    {
  70.538          op.dom = current->domain->domain_id;
  70.539 -    }
  70.540  
  70.541      if ( unlikely((d = find_domain_by_id(op.dom)) == NULL) )
  70.542      {
  70.543 @@ -750,14 +656,11 @@ gnttab_dump_table(gnttab_dump_table_t *u
  70.544  
  70.545      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  70.546      {
  70.547 -        sha_copy =  gt->shared[i];
  70.548 -
  70.549 +        sha_copy = gt->shared[i];
  70.550          if ( sha_copy.flags )
  70.551 -        {
  70.552              DPRINTK("Grant: dom (%hu) SHARED (%d) flags:(%hx) "
  70.553                      "dom:(%hu) frame:(%x)\n",
  70.554                      op.dom, i, sha_copy.flags, sha_copy.domid, sha_copy.frame);
  70.555 -        }
  70.556      }
  70.557  
  70.558      spin_lock(&gt->lock);
  70.559 @@ -765,28 +668,22 @@ gnttab_dump_table(gnttab_dump_table_t *u
  70.560      for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  70.561      {
  70.562          act = &gt->active[i];
  70.563 -
  70.564          if ( act->pin )
  70.565 -        {
  70.566              DPRINTK("Grant: dom (%hu) ACTIVE (%d) pin:(%x) "
  70.567                      "dom:(%hu) frame:(%lx)\n",
  70.568                      op.dom, i, act->pin, act->domid, act->frame);
  70.569 -        }
  70.570      }
  70.571  
  70.572      for ( i = 0; i < gt->maptrack_limit; i++ )
  70.573      {
  70.574          maptrack = &gt->maptrack[i];
  70.575 -
  70.576          if ( maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK )
  70.577 -        {
  70.578              DPRINTK("Grant: dom (%hu) MAP (%d) ref:(%hu) flags:(%x) "
  70.579                      "dom:(%hu)\n",
  70.580                      op.dom, i,
  70.581                      maptrack->ref_and_flags >> MAPTRACK_REF_SHIFT,
  70.582                      maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK,
  70.583                      maptrack->domid);
  70.584 -        }
  70.585      }
  70.586  
  70.587      spin_unlock(&gt->lock);
  70.588 @@ -794,10 +691,10 @@ gnttab_dump_table(gnttab_dump_table_t *u
  70.589      put_domain(d);
  70.590      return 0;
  70.591  }
  70.592 -#endif
  70.593  
  70.594  static long
  70.595 -gnttab_transfer(gnttab_transfer_t *uop, unsigned int count)
  70.596 +gnttab_transfer(
  70.597 +    gnttab_transfer_t *uop, unsigned int count)
  70.598  {
  70.599      struct domain *d = current->domain;
  70.600      struct domain *e;
  70.601 @@ -810,10 +707,7 @@ gnttab_transfer(gnttab_transfer_t *uop, 
  70.602      for ( i = 0; i < count; i++ )
  70.603      {
  70.604          gnttab_transfer_t *gop = &uop[i];
  70.605 -#if GRANT_DEBUG
  70.606 -        printk("gnttab_transfer: i=%d mfn=%lx domid=%d gref=%08x\n",
  70.607 -               i, gop->mfn, gop->domid, gop->handle);
  70.608 -#endif
  70.609 +
  70.610          page = &frame_table[gop->mfn];
  70.611          
  70.612          if ( unlikely(IS_XEN_HEAP_FRAME(page)))
  70.613 @@ -956,11 +850,9 @@ do_grant_table_op(
  70.614      case GNTTABOP_setup_table:
  70.615          rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count);
  70.616          break;
  70.617 -#if GRANT_DEBUG
  70.618      case GNTTABOP_dump_table:
  70.619          rc = gnttab_dump_table((gnttab_dump_table_t *)uop);
  70.620          break;
  70.621 -#endif
  70.622      case GNTTABOP_transfer:
  70.623          if (unlikely(!array_access_ok(
  70.624              uop, count, sizeof(gnttab_transfer_t))))
  70.625 @@ -1002,12 +894,6 @@ gnttab_check_unmap(
  70.626      
  70.627      lgt = ld->grant_table;
  70.628      
  70.629 -#if GRANT_DEBUG_VERBOSE
  70.630 -    if ( ld->domain_id != 0 )
  70.631 -        DPRINTK("Foreign unref rd(%d) ld(%d) frm(%lx) flgs(%x).\n",
  70.632 -                rd->domain_id, ld->domain_id, frame, readonly);
  70.633 -#endif
  70.634 -    
  70.635      /* Fast exit if we're not mapping anything using grant tables */
  70.636      if ( lgt->map_count == 0 )
  70.637          return 0;
  70.638 @@ -1098,11 +984,6 @@ gnttab_prepare_for_transfer(
  70.639      int            retries = 0;
  70.640      unsigned long  target_pfn;
  70.641  
  70.642 -#if GRANT_DEBUG_VERBOSE
  70.643 -    DPRINTK("gnttab_prepare_for_transfer rd(%hu) ld(%hu) ref(%hu).\n",
  70.644 -            rd->domain_id, ld->domain_id, ref);
  70.645 -#endif
  70.646 -
  70.647      if ( unlikely((rgt = rd->grant_table) == NULL) ||
  70.648           unlikely(ref >= NR_GRANT_ENTRIES) )
  70.649      {
    71.1 --- a/xen/include/asm-x86/e820.h	Fri Sep 23 15:41:28 2005 -0600
    71.2 +++ b/xen/include/asm-x86/e820.h	Mon Sep 26 11:07:49 2005 -0600
    71.3 @@ -11,6 +11,11 @@
    71.4  #define E820_NVS          4
    71.5  #define E820_IO          16
    71.6  #define E820_SHARED_PAGE 17
    71.7 +#define E820_XENSTORE    18
    71.8 +
    71.9 +#define E820_MAP_PAGE        0x00090000
   71.10 +#define E820_MAP_NR_OFFSET   0x000001E8
   71.11 +#define E820_MAP_OFFSET      0x000002D0
   71.12  
   71.13  #ifndef __ASSEMBLY__
   71.14  struct e820entry {
    72.1 --- a/xen/include/asm-x86/mm.h	Fri Sep 23 15:41:28 2005 -0600
    72.2 +++ b/xen/include/asm-x86/mm.h	Mon Sep 26 11:07:49 2005 -0600
    72.3 @@ -380,11 +380,9 @@ extern int __sync_lazy_execstate(void);
    72.4   * hold a reference to the page.
    72.5   */
    72.6  int update_grant_va_mapping(
    72.7 -    unsigned long va, l1_pgentry_t _nl1e, 
    72.8 -    struct domain *d, struct vcpu *v);
    72.9 +    unsigned long va, l1_pgentry_t _nl1e, struct vcpu *v);
   72.10  int update_grant_pte_mapping(
   72.11 -    unsigned long pte_addr, l1_pgentry_t _nl1e, 
   72.12 -    struct domain *d, struct vcpu *v);
   72.13 +    unsigned long pte_addr, l1_pgentry_t _nl1e, struct vcpu *v);
   72.14  int clear_grant_va_mapping(unsigned long addr, unsigned long frame);
   72.15  int clear_grant_pte_mapping(
   72.16      unsigned long addr, unsigned long frame, struct domain *d);
    73.1 --- a/xen/include/asm-x86/vmx_platform.h	Fri Sep 23 15:41:28 2005 -0600
    73.2 +++ b/xen/include/asm-x86/vmx_platform.h	Mon Sep 26 11:07:49 2005 -0600
    73.3 @@ -93,7 +93,6 @@ struct virtual_platform_def {
    73.4  
    73.5  extern void handle_mmio(unsigned long, unsigned long);
    73.6  extern void vmx_wait_io(void);
    73.7 -extern int vmx_setup_platform(struct vcpu *, struct cpu_user_regs *);
    73.8  extern void vmx_io_assist(struct vcpu *v);
    73.9  
   73.10  // XXX - think about this -- maybe use bit 30 of the mfn to signify an MMIO frame.
    74.1 --- a/xen/include/xen/grant_table.h	Fri Sep 23 15:41:28 2005 -0600
    74.2 +++ b/xen/include/xen/grant_table.h	Mon Sep 26 11:07:49 2005 -0600
    74.3 @@ -110,8 +110,4 @@ gnttab_prepare_for_transfer(
    74.4  void
    74.5  gnttab_release_dev_mappings(grant_table_t *gt);
    74.6  
    74.7 -/* Extra GNTST_ values, for internal use only. */
    74.8 -#define GNTST_flush_all        (2)  /* Success, need to flush entire TLB.    */
    74.9 -#define GNTST_flush_one        (1)  /* Success, need to flush a vaddr.       */
   74.10 -
   74.11  #endif /* __XEN_GRANT_H__ */