ia64/xen-unstable

changeset 4175:7a5ec83c604e

bitkeeper revision 1.1236.25.27 (4237c845Ia9bDdH7U9EUF2Xr-QIXqA)

Merge bk://xen.bkbits.net/xeno-unstable.bk
into bkbits.net:/repos/x/xen-ia64/xeno-unstable-ia64.bk
author xen-ia64.adm@bkbits.net
date Tue Mar 15 23:44:44 2005 +0000 (2005-03-15)
parents 52d3f3c5aed0 6a97e01e0c4e
children 5b5ebfc03e24 74080d40b2e9
files .rootkeys BitKeeper/etc/ignore BitKeeper/etc/logging_ok linux-2.4.29-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile linux-2.6.11-xen-sparse/arch/xen/i386/mm/pageattr.c linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6.11-xen-sparse/drivers/xen/blkfront/vbd.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h patches/linux-2.6.11/iomap.patch tools/blktap/Makefile tools/blktap/blockstore.c tools/blktap/blockstore.h tools/blktap/blockstored.c tools/blktap/bstest.c tools/misc/xend tools/tests/test_x86_emulator.c xen/arch/x86/mm.c xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_64/mm.c xen/arch/x86/x86_emulate.c xen/include/asm-x86/mm.h xen/include/asm-x86/x86_emulate.h xen/include/xen/irq_cpustat.h xen/include/xen/perfc_defn.h xen/include/xen/sched.h xen/include/xen/softirq.h
line diff
     1.1 --- a/.rootkeys	Wed Mar 16 05:46:44 2005 +0000
     1.2 +++ b/.rootkeys	Tue Mar 15 23:44:44 2005 +0000
     1.3 @@ -172,7 +172,6 @@ 4118cc35CbY8rfGVspF5O-7EkXBEAA linux-2.6
     1.4  40f562383SKvDStdtrvzr5fyCbW4rw linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
     1.5  40f56239xcNylAxuGsQHwi1AyMLV8w linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c
     1.6  41062ab7CjxC1UBaFhOMWWdhHkIUyg linux-2.6.11-xen-sparse/arch/xen/i386/mm/ioremap.c
     1.7 -413b5ab8LIowAnQrEmaOJSdmqm96jQ linux-2.6.11-xen-sparse/arch/xen/i386/mm/pageattr.c
     1.8  40f5623906UYHv1rsVUeRc0tFT0dWw linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c
     1.9  4107adf12ndy94MidCaivDibJ3pPAg linux-2.6.11-xen-sparse/arch/xen/i386/pci/Makefile
    1.10  4107adf1WcCgkhsdLTRGX52cOG1vJg linux-2.6.11-xen-sparse/arch/xen/i386/pci/direct.c
    1.11 @@ -316,6 +315,7 @@ 422e4430vKaHLOOGS7X-SUUe3EBCgw netbsd-2.
    1.12  422e4430-gOD358H8nGGnNWes08Nng netbsd-2.0-xen-sparse/sys/miscfs/kernfs/kernfs_vnops.c
    1.13  413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs
    1.14  413aa1d0oNP8HXLvfPuMe6cSroUfSA patches/linux-2.6.11/agpgart.patch
    1.15 +42372652KCUP-IOH9RN19YQmGhs4aA patches/linux-2.6.11/iomap.patch
    1.16  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
    1.17  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Rules.mk
    1.18  4209033eUwhDBJ_bxejiv5c6gjXS4A tools/blktap/Makefile
    1.19 @@ -341,6 +341,8 @@ 42090340rc2q1wmlGn6HtiJAkqhtNQ tools/blk
    1.20  42090340C-WkRPT7N3t-8Lzehzogdw tools/blktap/blktaplib.h
    1.21  42277b02WrfP1meTDPv1M5swFq8oHQ tools/blktap/blockstore.c
    1.22  42277b02P1C0FYj3gqwTZUD8sxKCug tools/blktap/blockstore.h
    1.23 +42371b8aL1JsxAXOd4bBhmZKDyjiJg tools/blktap/blockstored.c
    1.24 +42371b8aD_x3L9MKsXciMNqkuk58eQ tools/blktap/bstest.c
    1.25  42090340B3mDvcxvd9ehDHUkg46hvw tools/blktap/libgnbd/Makefile
    1.26  42090340ZWkc5Xhf9lpQmDON8HJXww tools/blktap/libgnbd/gnbdtest.c
    1.27  42090340ocMiUScJE3OpY7QNunvSbg tools/blktap/libgnbd/libgnbd.c
     2.1 --- a/BitKeeper/etc/ignore	Wed Mar 16 05:46:44 2005 +0000
     2.2 +++ b/BitKeeper/etc/ignore	Tue Mar 15 23:44:44 2005 +0000
     2.3 @@ -124,3 +124,5 @@ tools/blktap/vdi_validate
     2.4  tools/blktap/xen/*
     2.5  tools/cmdline/*
     2.6  tools/tests/test_x86_emulator
     2.7 +tools/blktap/blockstored
     2.8 +tools/blktap/bstest
     3.1 --- a/BitKeeper/etc/logging_ok	Wed Mar 16 05:46:44 2005 +0000
     3.2 +++ b/BitKeeper/etc/logging_ok	Tue Mar 15 23:44:44 2005 +0000
     3.3 @@ -35,6 +35,7 @@ iap10@pb001.cl.cam.ac.uk
     3.4  iap10@pb007.cl.cam.ac.uk
     3.5  iap10@striker.cl.cam.ac.uk
     3.6  iap10@tetris.cl.cam.ac.uk
     3.7 +jrb44@plym.cl.cam.ac.uk
     3.8  jws22@gauntlet.cl.cam.ac.uk
     3.9  jws@cairnwell.research
    3.10  kaf24@camelot.eng.3leafnetworks.com
    3.11 @@ -83,5 +84,6 @@ tlh20@labyrinth.cl.cam.ac.uk
    3.12  tw275@labyrinth.cl.cam.ac.uk
    3.13  tw275@striker.cl.cam.ac.uk
    3.14  vh249@airwolf.cl.cam.ac.uk
    3.15 +vh249@arcadians.cl.cam.ac.uk
    3.16  xen-ia64.adm@bkbits.net
    3.17  xenbk@gandalf.hpl.hp.com
     4.1 --- a/linux-2.4.29-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Wed Mar 16 05:46:44 2005 +0000
     4.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c	Tue Mar 15 23:44:44 2005 +0000
     4.3 @@ -442,11 +442,11 @@ void xlvbd_update_vbds(void)
     4.4      old_nr   = nr_vbds;
     4.5  
     4.6      new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
     4.7 +    if (!new_info)
     4.8 +        return;
     4.9 +
    4.10      if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 )
    4.11 -    {
    4.12 -        kfree(new_info);
    4.13 -        return;
    4.14 -    }
    4.15 +        goto out;
    4.16  
    4.17      /*
    4.18       * Final list maximum size is old list + new list. This occurs only when
    4.19 @@ -454,6 +454,8 @@ void xlvbd_update_vbds(void)
    4.20       * VBDs in the old list because the usage counts are busy.
    4.21       */
    4.22      merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL);
    4.23 +    if (!merged_info)
    4.24 +        goto out;
    4.25  
    4.26      /* @i tracks old list; @j tracks new list; @k tracks merged list. */
    4.27      i = j = k = 0;
    4.28 @@ -500,6 +502,7 @@ void xlvbd_update_vbds(void)
    4.29      nr_vbds  = k;
    4.30  
    4.31      kfree(old_info);
    4.32 +out:
    4.33      kfree(new_info);
    4.34  }
    4.35  
    4.36 @@ -543,6 +546,9 @@ int xlvbd_init(void)
    4.37      }
    4.38  
    4.39      vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
    4.40 +    if (!vbd_info)
    4.41 +        return -ENOMEM;
    4.42 +
    4.43      nr_vbds  = xlvbd_get_vbd_info(vbd_info);
    4.44  
    4.45      if ( nr_vbds < 0 )
     5.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile	Wed Mar 16 05:46:44 2005 +0000
     5.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile	Tue Mar 15 23:44:44 2005 +0000
     5.3 @@ -6,8 +6,8 @@ XENARCH	:= $(subst ",,$(CONFIG_XENARCH))
     5.4  
     5.5  CFLAGS	+= -Iarch/$(XENARCH)/mm
     5.6  
     5.7 -obj-y	:= init.o pgtable.o fault.o ioremap.o pageattr.o hypervisor.o
     5.8 -c-obj-y	:= extable.o mmap.o
     5.9 +obj-y	:= init.o pgtable.o fault.o ioremap.o hypervisor.o
    5.10 +c-obj-y	:= extable.o mmap.o pageattr.o
    5.11  
    5.12  c-obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
    5.13  c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
     6.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pageattr.c	Wed Mar 16 05:46:44 2005 +0000
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,226 +0,0 @@
     6.4 -/* 
     6.5 - * Copyright 2002 Andi Kleen, SuSE Labs. 
     6.6 - * Thanks to Ben LaHaise for precious feedback.
     6.7 - */ 
     6.8 -
     6.9 -#include <linux/config.h>
    6.10 -#include <linux/mm.h>
    6.11 -#include <linux/sched.h>
    6.12 -#include <linux/highmem.h>
    6.13 -#include <linux/module.h>
    6.14 -#include <linux/slab.h>
    6.15 -#include <asm/uaccess.h>
    6.16 -#include <asm/processor.h>
    6.17 -#include <asm/tlbflush.h>
    6.18 -
    6.19 -static DEFINE_SPINLOCK(cpa_lock);
    6.20 -static struct list_head df_list = LIST_HEAD_INIT(df_list);
    6.21 -
    6.22 -
    6.23 -pte_t *lookup_address(unsigned long address) 
    6.24 -{ 
    6.25 -	pgd_t *pgd = pgd_offset_k(address);
    6.26 -	pud_t *pud;
    6.27 -	pmd_t *pmd;
    6.28 -	if (pgd_none(*pgd))
    6.29 -		return NULL;
    6.30 -	pud = pud_offset(pgd, address);
    6.31 -	if (pud_none(*pud))
    6.32 -		return NULL;
    6.33 -	pmd = pmd_offset(pud, address);
    6.34 -	if (pmd_none(*pmd))
    6.35 -		return NULL;
    6.36 -	if (pmd_large(*pmd))
    6.37 -		return (pte_t *)pmd;
    6.38 -        return pte_offset_kernel(pmd, address);
    6.39 -} 
    6.40 -
    6.41 -static struct page *split_large_page(unsigned long address, pgprot_t prot)
    6.42 -{ 
    6.43 -	int i; 
    6.44 -	unsigned long addr;
    6.45 -	struct page *base;
    6.46 -	pte_t *pbase;
    6.47 -
    6.48 -	spin_unlock_irq(&cpa_lock);
    6.49 -	base = alloc_pages(GFP_KERNEL, 0);
    6.50 -	spin_lock_irq(&cpa_lock);
    6.51 -	if (!base) 
    6.52 -		return NULL;
    6.53 -
    6.54 -	address = __pa(address);
    6.55 -	addr = address & LARGE_PAGE_MASK; 
    6.56 -	pbase = (pte_t *)page_address(base);
    6.57 -	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
    6.58 -		pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
    6.59 -				   addr == address ? prot : PAGE_KERNEL);
    6.60 -	}
    6.61 -	return base;
    6.62 -} 
    6.63 -
    6.64 -static void flush_kernel_map(void *dummy) 
    6.65 -{ 
    6.66 -	/* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */
    6.67 -	if (boot_cpu_data.x86_model >= 4) 
    6.68 -		wbinvd();
    6.69 -	/* Flush all to work around Errata in early athlons regarding 
    6.70 -	 * large page flushing. 
    6.71 -	 */
    6.72 -	__flush_tlb_all(); 	
    6.73 -}
    6.74 -
    6.75 -static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
    6.76 -{ 
    6.77 -	struct page *page;
    6.78 -	unsigned long flags;
    6.79 -
    6.80 -	set_pte_atomic(kpte, pte); 	/* change init_mm */
    6.81 -	if (PTRS_PER_PMD > 1)
    6.82 -		return;
    6.83 -
    6.84 -	spin_lock_irqsave(&pgd_lock, flags);
    6.85 -	for (page = pgd_list; page; page = (struct page *)page->index) {
    6.86 -		pgd_t *pgd;
    6.87 -		pud_t *pud;
    6.88 -		pmd_t *pmd;
    6.89 -		pgd = (pgd_t *)page_address(page) + pgd_index(address);
    6.90 -		pud = pud_offset(pgd, address);
    6.91 -		pmd = pmd_offset(pud, address);
    6.92 -		set_pte_atomic((pte_t *)pmd, pte);
    6.93 -	}
    6.94 -	spin_unlock_irqrestore(&pgd_lock, flags);
    6.95 -}
    6.96 -
    6.97 -/* 
    6.98 - * No more special protections in this 2/4MB area - revert to a
    6.99 - * large page again. 
   6.100 - */
   6.101 -static inline void revert_page(struct page *kpte_page, unsigned long address)
   6.102 -{
   6.103 -	pte_t *linear = (pte_t *) 
   6.104 -		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
   6.105 -	set_pmd_pte(linear,  address,
   6.106 -		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
   6.107 -			    PAGE_KERNEL_LARGE));
   6.108 -}
   6.109 -
   6.110 -static int
   6.111 -__change_page_attr(struct page *page, pgprot_t prot)
   6.112 -{ 
   6.113 -	pte_t *kpte; 
   6.114 -	unsigned long address;
   6.115 -	struct page *kpte_page;
   6.116 -
   6.117 -	BUG_ON(PageHighMem(page));
   6.118 -	address = (unsigned long)page_address(page);
   6.119 -
   6.120 -	kpte = lookup_address(address);
   6.121 -	if (!kpte)
   6.122 -		return -EINVAL;
   6.123 -	kpte_page = virt_to_page(kpte);
   6.124 -	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
   6.125 -		if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 
   6.126 -			set_pte_batched(kpte, mk_pte(page, prot)); 
   6.127 -		} else {
   6.128 -			struct page *split = split_large_page(address, prot); 
   6.129 -			if (!split)
   6.130 -				return -ENOMEM;
   6.131 -			set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
   6.132 -			kpte_page = split;
   6.133 -		}	
   6.134 -		get_page(kpte_page);
   6.135 -	} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 
   6.136 -		set_pte_batched(kpte, mk_pte(page, PAGE_KERNEL));
   6.137 -		__put_page(kpte_page);
   6.138 -	} else
   6.139 -		BUG();
   6.140 -
   6.141 -	/*
   6.142 -	 * If the pte was reserved, it means it was created at boot
   6.143 -	 * time (not via split_large_page) and in turn we must not
   6.144 -	 * replace it with a largepage.
   6.145 -	 */
   6.146 -	if (!PageReserved(kpte_page)) {
   6.147 -		/* memleak and potential failed 2M page regeneration */
   6.148 -		BUG_ON(!page_count(kpte_page));
   6.149 -
   6.150 -		if (cpu_has_pse && (page_count(kpte_page) == 1)) {
   6.151 -			list_add(&kpte_page->lru, &df_list);
   6.152 -			revert_page(kpte_page, address);
   6.153 -		}
   6.154 -	}
   6.155 -	return 0;
   6.156 -} 
   6.157 -
   6.158 -static inline void flush_map(void)
   6.159 -{
   6.160 -	on_each_cpu(flush_kernel_map, NULL, 1, 1);
   6.161 -}
   6.162 -
   6.163 -/*
   6.164 - * Change the page attributes of an page in the linear mapping.
   6.165 - *
   6.166 - * This should be used when a page is mapped with a different caching policy
   6.167 - * than write-back somewhere - some CPUs do not like it when mappings with
   6.168 - * different caching policies exist. This changes the page attributes of the
   6.169 - * in kernel linear mapping too.
   6.170 - * 
   6.171 - * The caller needs to ensure that there are no conflicting mappings elsewhere.
   6.172 - * This function only deals with the kernel linear map.
   6.173 - * 
   6.174 - * Caller must call global_flush_tlb() after this.
   6.175 - */
   6.176 -int change_page_attr(struct page *page, int numpages, pgprot_t prot)
   6.177 -{
   6.178 -	int err = 0; 
   6.179 -	int i; 
   6.180 -	unsigned long flags;
   6.181 -
   6.182 -	spin_lock_irqsave(&cpa_lock, flags);
   6.183 -	for (i = 0; i < numpages; i++, page++) { 
   6.184 -		err = __change_page_attr(page, prot);
   6.185 -		if (err) 
   6.186 -			break; 
   6.187 -	} 	
   6.188 -	flush_page_update_queue();
   6.189 -	spin_unlock_irqrestore(&cpa_lock, flags);
   6.190 -	return err;
   6.191 -}
   6.192 -
   6.193 -void global_flush_tlb(void)
   6.194 -{ 
   6.195 -	LIST_HEAD(l);
   6.196 -	struct list_head* n;
   6.197 -
   6.198 -	BUG_ON(irqs_disabled());
   6.199 -
   6.200 -	spin_lock_irq(&cpa_lock);
   6.201 -	list_splice_init(&df_list, &l);
   6.202 -	spin_unlock_irq(&cpa_lock);
   6.203 -	flush_map();
   6.204 -	n = l.next;
   6.205 -	while (n != &l) {
   6.206 -		struct page *pg = list_entry(n, struct page, lru);
   6.207 -		n = n->next;
   6.208 -		__free_page(pg);
   6.209 -	}
   6.210 -} 
   6.211 -
   6.212 -#ifdef CONFIG_DEBUG_PAGEALLOC
   6.213 -void kernel_map_pages(struct page *page, int numpages, int enable)
   6.214 -{
   6.215 -	if (PageHighMem(page))
   6.216 -		return;
   6.217 -	/* the return value is ignored - the calls cannot fail,
   6.218 -	 * large pages are disabled at boot time.
   6.219 -	 */
   6.220 -	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
   6.221 -	/* we should perform an IPI and flush all tlbs,
   6.222 -	 * but that can deadlock->flush only current cpu.
   6.223 -	 */
   6.224 -	__flush_tlb_all();
   6.225 -}
   6.226 -#endif
   6.227 -
   6.228 -EXPORT_SYMBOL(change_page_attr);
   6.229 -EXPORT_SYMBOL(global_flush_tlb);
     7.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c	Wed Mar 16 05:46:44 2005 +0000
     7.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c	Tue Mar 15 23:44:44 2005 +0000
     7.3 @@ -165,8 +165,15 @@ static inline void flush_requests(void)
     7.4  module_init(xlblk_init);
     7.5  
     7.6  #if ENABLE_VBD_UPDATE
     7.7 +static void update_vbds_task(void *unused)
     7.8 +{ 
     7.9 +    xlvbd_update_vbds();
    7.10 +}
    7.11 +
    7.12  static void vbd_update(void)
    7.13  {
    7.14 +    static DECLARE_WORK(update_tq, update_vbds_task, NULL);
    7.15 +    schedule_work(&update_tq);
    7.16  }
    7.17  #endif /* ENABLE_VBD_UPDATE */
    7.18  
     8.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/vbd.c	Wed Mar 16 05:46:44 2005 +0000
     8.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/blkfront/vbd.c	Tue Mar 15 23:44:44 2005 +0000
     8.3 @@ -354,7 +354,6 @@ static int xlvbd_init_device(vdisk_t *xd
     8.4      return err;
     8.5  }
     8.6  
     8.7 -#if 0
     8.8  /*
     8.9   * xlvbd_remove_device - remove a device node if possible
    8.10   * @device:       numeric device ID
    8.11 @@ -364,14 +363,16 @@ static int xlvbd_init_device(vdisk_t *xd
    8.12   * This is OK for now but in future, should perhaps consider where this should
    8.13   * deallocate gendisks / unregister devices.
    8.14   */
    8.15 -static int xlvbd_remove_device(int device)
    8.16 +static int xlvbd_remove_device(int dev16)
    8.17  {
    8.18 -    int i, rc = 0, minor = MINOR(device);
    8.19 +    int i, rc = 0, minor = MINOR(dev16);
    8.20      struct gendisk *gd;
    8.21      struct block_device *bd;
    8.22 -    xen_block_t *disk = NULL;
    8.23 +    struct xlbd_disk_info *di;
    8.24 +    dev_t device = MKDEV(MAJOR_XEN(dev16), MINOR_XEN(dev16));
    8.25  
    8.26 -    if ( (bd = bdget(device)) == NULL )
    8.27 +    bd = bdget(device);
    8.28 +    if (!bd)
    8.29          return -1;
    8.30  
    8.31      /*
    8.32 @@ -380,67 +381,25 @@ static int xlvbd_remove_device(int devic
    8.33       */
    8.34      down(&bd->bd_sem);
    8.35  
    8.36 -    if ( ((gd = get_gendisk(device)) == NULL) ||
    8.37 -         ((disk = xldev_to_xldisk(device)) == NULL) )
    8.38 -        BUG();
    8.39 +    gd = get_gendisk(device, &i);
    8.40 +    BUG_ON(gd == NULL);
    8.41 +    di = (struct xlbd_disk_info *) gd->private_data;
    8.42 +    BUG_ON(di == NULL);
    8.43  
    8.44 -    if ( disk->usage != 0 )
    8.45 +    if ( di->mi->usage != 0 )
    8.46      {
    8.47          printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device);
    8.48          rc = -1;
    8.49          goto out;
    8.50      }
    8.51 - 
    8.52 -    if ( (minor & (gd->max_p-1)) != 0 )
    8.53 -    {
    8.54 -        /* 1: The VBD is mapped to a partition rather than a whole unit. */
    8.55 -        invalidate_device(device, 1);
    8.56 -        gd->part[minor].start_sect = 0;
    8.57 -        gd->part[minor].nr_sects   = 0;
    8.58 -        gd->sizes[minor]           = 0;
    8.59 -
    8.60 -        /* Clear the consists-of-virtual-partitions flag if possible. */
    8.61 -        gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS;
    8.62 -        for ( i = 1; i < gd->max_p; i++ )
    8.63 -            if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 )
    8.64 -                gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS;
    8.65  
    8.66 -        /*
    8.67 -         * If all virtual partitions are now gone, and a 'whole unit' VBD is
    8.68 -         * present, then we can try to grok the unit's real partition table.
    8.69 -         */
    8.70 -        if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) &&
    8.71 -             (gd->sizes[minor & ~(gd->max_p-1)] != 0) &&
    8.72 -             !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) )
    8.73 -        {
    8.74 -            register_disk(gd,
    8.75 -                          device&~(gd->max_p-1), 
    8.76 -                          gd->max_p, 
    8.77 -                          &xlvbd_block_fops,
    8.78 -                          gd->part[minor&~(gd->max_p-1)].nr_sects);
    8.79 -        }
    8.80 -    }
    8.81 -    else
    8.82 -    {
    8.83 -        /*
    8.84 -         * 2: The VBD is mapped to an entire 'unit'. Clear all partitions.
    8.85 -         * NB. The partition entries are only cleared if there are no VBDs
    8.86 -         * mapped to individual partitions on this unit.
    8.87 -         */
    8.88 -        i = gd->max_p - 1; /* Default: clear subpartitions as well. */
    8.89 -        if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS )
    8.90 -            i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */
    8.91 -        while ( i >= 0 )
    8.92 -        {
    8.93 -            invalidate_device(device+i, 1);
    8.94 -            gd->part[minor+i].start_sect = 0;
    8.95 -            gd->part[minor+i].nr_sects   = 0;
    8.96 -            gd->sizes[minor+i]           = 0;
    8.97 -            i--;
    8.98 -        }
    8.99 -    }
   8.100 +    BUG_ON(minor != gd->first_minor);
   8.101 +    /* The VBD is mapped to an entire unit. */
   8.102 +    
   8.103 +    invalidate_partition(gd, 0);
   8.104 +    set_capacity(gd, 0);
   8.105  
   8.106 - out:
   8.107 +out:
   8.108      up(&bd->bd_sem);
   8.109      bdput(bd);
   8.110      return rc;
   8.111 @@ -460,11 +419,11 @@ void xlvbd_update_vbds(void)
   8.112      old_nr   = nr_vbds;
   8.113  
   8.114      new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
   8.115 +    if (!new_info)
   8.116 +        return;
   8.117 +
   8.118      if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 )
   8.119 -    {
   8.120 -        kfree(new_info);
   8.121 -        return;
   8.122 -    }
   8.123 +        goto out;
   8.124  
   8.125      /*
   8.126       * Final list maximum size is old list + new list. This occurs only when
   8.127 @@ -472,6 +431,8 @@ void xlvbd_update_vbds(void)
   8.128       * VBDs in the old list because the usage counts are busy.
   8.129       */
   8.130      merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL);
   8.131 +    if (!merged_info)
   8.132 +        goto out;
   8.133  
   8.134      /* @i tracks old list; @j tracks new list; @k tracks merged list. */
   8.135      i = j = k = 0;
   8.136 @@ -518,9 +479,9 @@ void xlvbd_update_vbds(void)
   8.137      nr_vbds  = k;
   8.138  
   8.139      kfree(old_info);
   8.140 +out:
   8.141      kfree(new_info);
   8.142  }
   8.143 -#endif
   8.144  
   8.145  /*
   8.146   * Set up all the linux device goop for the virtual block devices
     9.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Mar 16 05:46:44 2005 +0000
     9.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Mar 15 23:44:44 2005 +0000
     9.3 @@ -485,7 +485,10 @@ int __direct_remap_area_pages(struct mm_
     9.4  			      mmu_update_t *v);
     9.5  
     9.6  #define io_remap_page_range(vma,from,phys,size,prot) \
     9.7 -	direct_remap_area_pages(vma->vm_mm,from,phys,size,prot,DOMID_IO)
     9.8 +direct_remap_area_pages(vma->vm_mm,from,phys,size,prot,DOMID_IO)
     9.9 +
    9.10 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
    9.11 +direct_remap_area_pages(vma->vm_mm,from,pfn<<PAGE_SHIFT,size,prot,DOMID_IO)
    9.12  
    9.13  #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
    9.14  #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/patches/linux-2.6.11/iomap.patch	Tue Mar 15 23:44:44 2005 +0000
    10.3 @@ -0,0 +1,120 @@
    10.4 +diff -ur linux-2.6.11/drivers/char/agp/frontend.c linux-2.6.11-io/drivers/char/agp/frontend.c
    10.5 +--- linux-2.6.11/drivers/char/agp/frontend.c	2005-03-02 07:37:49.000000000 +0000
    10.6 ++++ linux-2.6.11-io/drivers/char/agp/frontend.c	2005-03-15 17:38:30.000000000 +0000
    10.7 +@@ -627,7 +627,7 @@
    10.8 + 		DBG("client vm_ops=%p", kerninfo.vm_ops);
    10.9 + 		if (kerninfo.vm_ops) {
   10.10 + 			vma->vm_ops = kerninfo.vm_ops;
   10.11 +-		} else if (remap_pfn_range(vma, vma->vm_start,
   10.12 ++		} else if (io_remap_pfn_range(vma, vma->vm_start,
   10.13 + 				(kerninfo.aper_base + offset) >> PAGE_SHIFT,
   10.14 + 					    size, vma->vm_page_prot)) {
   10.15 + 			goto out_again;
   10.16 +@@ -643,7 +643,7 @@
   10.17 + 		DBG("controller vm_ops=%p", kerninfo.vm_ops);
   10.18 + 		if (kerninfo.vm_ops) {
   10.19 + 			vma->vm_ops = kerninfo.vm_ops;
   10.20 +-		} else if (remap_pfn_range(vma, vma->vm_start,
   10.21 ++		} else if (io_remap_pfn_range(vma, vma->vm_start,
   10.22 + 					    kerninfo.aper_base >> PAGE_SHIFT,
   10.23 + 					    size, vma->vm_page_prot)) {
   10.24 + 			goto out_again;
   10.25 +diff -ur linux-2.6.11/drivers/char/drm/drm_vm.c linux-2.6.11-io/drivers/char/drm/drm_vm.c
   10.26 +--- linux-2.6.11/drivers/char/drm/drm_vm.c	2005-03-02 07:38:33.000000000 +0000
   10.27 ++++ linux-2.6.11-io/drivers/char/drm/drm_vm.c	2005-03-15 17:43:26.000000000 +0000
   10.28 +@@ -630,7 +630,7 @@
   10.29 + 					vma->vm_end - vma->vm_start,
   10.30 + 					vma->vm_page_prot, 0))
   10.31 + #else
   10.32 +-		if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start,
   10.33 ++		if (io_remap_pfn_range(vma, vma->vm_start,
   10.34 + 				     (VM_OFFSET(vma) + offset) >> PAGE_SHIFT,
   10.35 + 				     vma->vm_end - vma->vm_start,
   10.36 + 				     vma->vm_page_prot))
   10.37 +diff -ur linux-2.6.11/drivers/char/drm/i810_dma.c linux-2.6.11-io/drivers/char/drm/i810_dma.c
   10.38 +--- linux-2.6.11/drivers/char/drm/i810_dma.c	2005-03-02 07:37:55.000000000 +0000
   10.39 ++++ linux-2.6.11-io/drivers/char/drm/i810_dma.c	2005-03-15 17:53:36.000000000 +0000
   10.40 +@@ -139,7 +139,7 @@
   10.41 +    	buf_priv->currently_mapped = I810_BUF_MAPPED;
   10.42 + 	unlock_kernel();
   10.43 + 
   10.44 +-	if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start,
   10.45 ++	if (io_remap_pfn_range(vma, vma->vm_start,
   10.46 + 			     VM_OFFSET(vma) >> PAGE_SHIFT,
   10.47 + 			     vma->vm_end - vma->vm_start,
   10.48 + 			     vma->vm_page_prot)) return -EAGAIN;
   10.49 +diff -ur linux-2.6.11/drivers/char/drm/i830_dma.c linux-2.6.11-io/drivers/char/drm/i830_dma.c
   10.50 +--- linux-2.6.11/drivers/char/drm/i830_dma.c	2005-03-02 07:37:48.000000000 +0000
   10.51 ++++ linux-2.6.11-io/drivers/char/drm/i830_dma.c	2005-03-15 17:53:46.000000000 +0000
   10.52 +@@ -157,7 +157,7 @@
   10.53 +    	buf_priv->currently_mapped = I830_BUF_MAPPED;
   10.54 + 	unlock_kernel();
   10.55 + 
   10.56 +-	if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start,
   10.57 ++	if (io_remap_pfn_range(vma, vma->vm_start,
   10.58 + 			     VM_OFFSET(vma) >> PAGE_SHIFT,
   10.59 + 			     vma->vm_end - vma->vm_start,
   10.60 + 			     vma->vm_page_prot)) return -EAGAIN;
   10.61 +diff -ur linux-2.6.11/drivers/char/hpet.c linux-2.6.11-io/drivers/char/hpet.c
   10.62 +--- linux-2.6.11/drivers/char/hpet.c	2005-03-02 07:38:10.000000000 +0000
   10.63 ++++ linux-2.6.11-io/drivers/char/hpet.c	2005-03-15 17:37:22.000000000 +0000
   10.64 +@@ -76,6 +76,7 @@
   10.65 + struct hpets {
   10.66 + 	struct hpets *hp_next;
   10.67 + 	struct hpet __iomem *hp_hpet;
   10.68 ++	unsigned long hp_hpet_phys;
   10.69 + 	struct time_interpolator *hp_interpolator;
   10.70 + 	unsigned long hp_period;
   10.71 + 	unsigned long hp_delta;
   10.72 +@@ -265,7 +266,7 @@
   10.73 + 		return -EINVAL;
   10.74 + 
   10.75 + 	devp = file->private_data;
   10.76 +-	addr = (unsigned long)devp->hd_hpet;
   10.77 ++	addr = devp->hd_hpets->hp_hpet_phys;
   10.78 + 
   10.79 + 	if (addr & (PAGE_SIZE - 1))
   10.80 + 		return -ENOSYS;
   10.81 +@@ -274,7 +275,7 @@
   10.82 + 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
   10.83 + 	addr = __pa(addr);
   10.84 + 
   10.85 +-	if (remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT,
   10.86 ++	if (io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT,
   10.87 + 					PAGE_SIZE, vma->vm_page_prot)) {
   10.88 + 		printk(KERN_ERR "remap_pfn_range failed in hpet.c\n");
   10.89 + 		return -EAGAIN;
   10.90 +@@ -795,6 +796,7 @@
   10.91 + 
   10.92 + 	hpetp->hp_which = hpet_nhpet++;
   10.93 + 	hpetp->hp_hpet = hdp->hd_address;
   10.94 ++	hpetp->hp_hpet_phys = hdp->hd_phys_address;
   10.95 + 
   10.96 + 	hpetp->hp_ntimer = hdp->hd_nirqs;
   10.97 + 
   10.98 +diff -ur linux-2.6.11/drivers/sbus/char/flash.c linux-2.6.11-io/drivers/sbus/char/flash.c
   10.99 +--- linux-2.6.11/drivers/sbus/char/flash.c	2005-03-02 07:38:10.000000000 +0000
  10.100 ++++ linux-2.6.11-io/drivers/sbus/char/flash.c	2005-03-15 17:20:22.000000000 +0000
  10.101 +@@ -75,7 +75,7 @@
  10.102 + 	pgprot_val(vma->vm_page_prot) |= _PAGE_E;
  10.103 + 	vma->vm_flags |= (VM_SHM | VM_LOCKED);
  10.104 + 
  10.105 +-	if (remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot))
  10.106 ++	if (io_remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot))
  10.107 + 		return -EAGAIN;
  10.108 + 		
  10.109 + 	return 0;
  10.110 +diff -ur linux-2.6.11/include/linux/mm.h linux-2.6.11-io/include/linux/mm.h
  10.111 +--- linux-2.6.11/include/linux/mm.h	2005-03-02 07:37:47.000000000 +0000
  10.112 ++++ linux-2.6.11-io/include/linux/mm.h	2005-03-15 17:03:46.000000000 +0000
  10.113 +@@ -815,6 +815,10 @@
  10.114 + extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
  10.115 + int remap_pfn_range(struct vm_area_struct *, unsigned long,
  10.116 + 		unsigned long, unsigned long, pgprot_t);
  10.117 ++/* Allow arch override for mapping of device and I/O (non-RAM) pages. */
  10.118 ++#ifndef io_remap_pfn_range
  10.119 ++#define io_remap_pfn_range remap_pfn_range
  10.120 ++#endif
  10.121 + 
  10.122 + #ifdef CONFIG_PROC_FS
  10.123 + void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
    11.1 --- a/tools/blktap/Makefile	Wed Mar 16 05:46:44 2005 +0000
    11.2 +++ b/tools/blktap/Makefile	Tue Mar 15 23:44:44 2005 +0000
    11.3 @@ -141,6 +141,10 @@ vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
    11.4  vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
    11.5  	$(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS)
    11.6  
    11.7 +blockstored: blockstored.c
    11.8 +	$(CC) $(CFLAGS) -g3 -o blockstored blockstored.c
    11.9 +bstest: bstest.c blockstore.c
   11.10 +	$(CC) $(CFLAGS) -g3 -o bstest bstest.c blockstore.c
   11.11  
   11.12  rdx_cmp: $(LIB) rdx_cmp.c $(VDI_SRCS)
   11.13  	$(CC) $(CFLAGS) -g3 -o rdx_cmp rdx_cmp.c $(VDI_SRCS)
    12.1 --- a/tools/blktap/blockstore.c	Wed Mar 16 05:46:44 2005 +0000
    12.2 +++ b/tools/blktap/blockstore.c	Tue Mar 15 23:44:44 2005 +0000
    12.3 @@ -15,6 +15,408 @@
    12.4  #include <sys/stat.h>
    12.5  #include "blockstore.h"
    12.6  
    12.7 +#define BLOCKSTORE_REMOTE
    12.8 +
    12.9 +#ifdef BLOCKSTORE_REMOTE
   12.10 +
   12.11 +//#define BSDEBUG
   12.12 +
   12.13 +#include <sys/socket.h>
   12.14 +#include <sys/ioctl.h>
   12.15 +#include <netinet/in.h>
   12.16 +#include <netdb.h>
   12.17 +
   12.18 +#define ENTER_QUEUE_CR (void)0
   12.19 +#define LEAVE_QUEUE_CR (void)0
   12.20 +
   12.21 +bsserver_t bsservers[MAX_SERVERS];
   12.22 +bscluster_t bsclusters[MAX_CLUSTERS];
   12.23 +
   12.24 +struct sockaddr_in sin_local;
   12.25 +int bssock = 0;
   12.26 +
   12.27 +typedef struct bsq_t_struct {
   12.28 +    struct bsq_t_struct *prev;
   12.29 +    struct bsq_t_struct *next;
   12.30 +    int server;
   12.31 +    int length;
   12.32 +    struct msghdr msghdr;
   12.33 +    struct iovec iov[2];
   12.34 +    bshdr_t message;
   12.35 +    void *block;
   12.36 +} bsq_t;
   12.37 +
   12.38 +bsq_t *bs_head = NULL;
   12.39 +bsq_t *bs_tail = NULL;
   12.40 +
   12.41 +int send_message(bsq_t *qe) {
   12.42 +    int rc;
   12.43 +
   12.44 +    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
   12.45 +    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   12.46 +    qe->msghdr.msg_iov = qe->iov;
   12.47 +    if (qe->block)
   12.48 +        qe->msghdr.msg_iovlen = 2;
   12.49 +    else
   12.50 +        qe->msghdr.msg_iovlen = 1;
   12.51 +    qe->msghdr.msg_control = NULL;
   12.52 +    qe->msghdr.msg_controllen = 0;
   12.53 +    qe->msghdr.msg_flags = 0;
   12.54 +
   12.55 +    qe->iov[0].iov_base = (void *)&(qe->message);
   12.56 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   12.57 +
   12.58 +    if (qe->block) {
   12.59 +        qe->iov[1].iov_base = qe->block;
   12.60 +        qe->iov[1].iov_len = BLOCK_SIZE;
   12.61 +    }
   12.62 +
   12.63 +    rc = sendmsg(bssock, &(qe->msghdr), 0);
   12.64 +    //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
   12.65 +    //           (struct sockaddr *)&(bsservers[qe->server].sin),
   12.66 +    //           sizeof(struct sockaddr_in));
   12.67 +    if (rc < 0)
   12.68 +        return rc;
   12.69 +    
   12.70 +    ENTER_QUEUE_CR;
   12.71 +    
   12.72 +    LEAVE_QUEUE_CR;
   12.73 +
   12.74 +    return rc;
   12.75 +}
   12.76 +
   12.77 +int recv_message(bsq_t *qe) {
   12.78 +    struct sockaddr_in from;
   12.79 +    //int flen = sizeof(from);
   12.80 +    int rc;
   12.81 +
   12.82 +    qe->msghdr.msg_name = &from;
   12.83 +    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   12.84 +    qe->msghdr.msg_iov = qe->iov;
   12.85 +    if (qe->block)
   12.86 +        qe->msghdr.msg_iovlen = 2;
   12.87 +    else
   12.88 +        qe->msghdr.msg_iovlen = 1;
   12.89 +    qe->msghdr.msg_control = NULL;
   12.90 +    qe->msghdr.msg_controllen = 0;
   12.91 +    qe->msghdr.msg_flags = 0;
   12.92 +
   12.93 +    qe->iov[0].iov_base = (void *)&(qe->message);
   12.94 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   12.95 +    if (qe->block) {
   12.96 +        qe->iov[1].iov_base = qe->block;
   12.97 +        qe->iov[1].iov_len = BLOCK_SIZE;
   12.98 +    }
   12.99 +
  12.100 +    rc = recvmsg(bssock, &(qe->msghdr), 0);
  12.101 +
  12.102 +    //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
  12.103 +    //               (struct sockaddr *)&from, &flen);
  12.104 +    return rc;
  12.105 +}
  12.106 +
  12.107 +void *readblock_indiv(int server, u64 id) {
  12.108 +    void *block;
  12.109 +    bsq_t *qe;
  12.110 +    int len;
  12.111 +
  12.112 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  12.113 +    if (!qe) {
  12.114 +        perror("readblock qe malloc");
  12.115 +        return NULL;
  12.116 +    }
  12.117 +    qe->block = malloc(BLOCK_SIZE);
  12.118 +    if (!qe->block) {
  12.119 +        perror("readblock qe malloc");
  12.120 +        free((void *)qe);
  12.121 +        return NULL;
  12.122 +    }
  12.123 +
  12.124 +    qe->server = server;
  12.125 +
  12.126 +    qe->message.operation = BSOP_READBLOCK;
  12.127 +    qe->message.flags = 0;
  12.128 +    qe->message.id = id;
  12.129 +    qe->length = MSGBUFSIZE_ID;
  12.130 +
  12.131 +    if (send_message(qe) < 0) {
  12.132 +        perror("readblock sendto");
  12.133 +        goto err;
  12.134 +    }
  12.135 +    
  12.136 +    len = recv_message(qe);
  12.137 +    if (len < 0) {
  12.138 +        perror("readblock recv");
  12.139 +        goto err;
  12.140 +    }
  12.141 +    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
  12.142 +        fprintf(stderr, "readblock server error\n");
  12.143 +        goto err;
  12.144 +    }
  12.145 +    if (len < MSGBUFSIZE_BLOCK) {
  12.146 +        fprintf(stderr, "readblock recv short (%u)\n", len);
  12.147 +        goto err;
  12.148 +    }
  12.149 +    if ((block = malloc(BLOCK_SIZE)) == NULL) {
  12.150 +        perror("readblock malloc");
  12.151 +        goto err;
  12.152 +    }
  12.153 +    //memcpy(block, qe->message.block, BLOCK_SIZE);
  12.154 +    block = qe->block;
  12.155 +
  12.156 +    free((void *)qe);
  12.157 +    return block;
  12.158 +
  12.159 +    err:
  12.160 +    free(qe->block);
  12.161 +    free((void *)qe);
  12.162 +    return NULL;
  12.163 +}
  12.164 +
  12.165 +/**
  12.166 + * readblock: read a block from disk
  12.167 + *   @id: block id to read
  12.168 + *
  12.169 + *   @return: pointer to block, NULL on error
  12.170 + */
  12.171 +void *readblock(u64 id) {
  12.172 +    int map = (int)BSID_MAP(id);
  12.173 +    u64 xid;
  12.174 +    static int i = CLUSTER_MAX_REPLICAS - 1;
  12.175 +    void *block = NULL;
  12.176 +
  12.177 +    /* special case for the "superblock" just use the first block on the
  12.178 +     * first replica. (extend to blocks < 6 for vdi bug)
  12.179 +     */
  12.180 +    if (id < 6) {
  12.181 +        block = readblock_indiv(bsclusters[map].servers[0], id);
  12.182 +        goto out;
  12.183 +    }
  12.184 +
  12.185 +    i++;
  12.186 +    if (i >= CLUSTER_MAX_REPLICAS)
  12.187 +        i = 0;
  12.188 +    switch (i) {
  12.189 +    case 0:
  12.190 +        xid = BSID_REPLICA0(id);
  12.191 +        break;
  12.192 +    case 1:
  12.193 +        xid = BSID_REPLICA1(id);
  12.194 +        break;
  12.195 +    case 2:
  12.196 +        xid = BSID_REPLICA2(id);
  12.197 +        break;
  12.198 +    }
  12.199 +    
  12.200 +    block = readblock_indiv(bsclusters[map].servers[i], xid);
  12.201 +
  12.202 +    out:
  12.203 +#ifdef BSDEBUG
  12.204 +    if (block)
  12.205 +        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  12.206 +                id,
  12.207 +                (unsigned int)((unsigned char *)block)[0],
  12.208 +                (unsigned int)((unsigned char *)block)[1],
  12.209 +                (unsigned int)((unsigned char *)block)[2],
  12.210 +                (unsigned int)((unsigned char *)block)[3],
  12.211 +                (unsigned int)((unsigned char *)block)[4],
  12.212 +                (unsigned int)((unsigned char *)block)[5],
  12.213 +                (unsigned int)((unsigned char *)block)[6],
  12.214 +                (unsigned int)((unsigned char *)block)[7]);
  12.215 +    else
  12.216 +        fprintf(stderr, "READ:  %016llx NULL\n", id);
  12.217 +#endif
  12.218 +    return block;
  12.219 +}
  12.220 +
  12.221 +int writeblock_indiv(int server, u64 id, void *block) {
  12.222 +    bsq_t *qe;
  12.223 +    int len;
  12.224 +
  12.225 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  12.226 +    if (!qe) {
  12.227 +        perror("writeblock qe malloc");
  12.228 +        goto err;
  12.229 +    }
  12.230 +    qe->server = server;
  12.231 +
  12.232 +    qe->message.operation = BSOP_WRITEBLOCK;
  12.233 +    qe->message.flags = 0;
  12.234 +    qe->message.id = id;
  12.235 +    //memcpy(qe->message.block, block, BLOCK_SIZE);
  12.236 +    qe->block = block;
  12.237 +    qe->length = MSGBUFSIZE_BLOCK;
  12.238 +
  12.239 +    if (send_message(qe) < 0) {
  12.240 +        perror("writeblock sendto");
  12.241 +        goto err;
  12.242 +    }
  12.243 +    
  12.244 +    len = recv_message(qe);
  12.245 +    if (len < 0) {
  12.246 +        perror("writeblock recv");
  12.247 +        goto err;
  12.248 +    }
  12.249 +    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
  12.250 +        fprintf(stderr, "writeblock server error\n");
  12.251 +        goto err;
  12.252 +    }
  12.253 +    if (len < MSGBUFSIZE_ID) {
  12.254 +        fprintf(stderr, "writeblock recv short (%u)\n", len);
  12.255 +        goto err;
  12.256 +    }
  12.257 +
  12.258 +    free((void *)qe);
  12.259 +    return 0;
  12.260 +
  12.261 +    err:
  12.262 +    free((void *)qe);
  12.263 +    return -1;
  12.264 +}
  12.265 +
  12.266 +/**
  12.267 + * writeblock: write an existing block to disk
  12.268 + *   @id: block id
  12.269 + *   @block: pointer to block
  12.270 + *
  12.271 + *   @return: zero on success, -1 on failure
  12.272 + */
  12.273 +int writeblock(u64 id, void *block) {
  12.274 +    int map = (int)BSID_MAP(id);
  12.275 +    
  12.276 +    int rep0 = bsclusters[map].servers[0];
  12.277 +    int rep1 = bsclusters[map].servers[1];
  12.278 +    int rep2 = bsclusters[map].servers[2];
  12.279 +
  12.280 +#ifdef BSDEBUG
  12.281 +    fprintf(stderr,
  12.282 +            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  12.283 +            id,
  12.284 +            (unsigned int)((unsigned char *)block)[0],
  12.285 +            (unsigned int)((unsigned char *)block)[1],
  12.286 +            (unsigned int)((unsigned char *)block)[2],
  12.287 +            (unsigned int)((unsigned char *)block)[3],
  12.288 +            (unsigned int)((unsigned char *)block)[4],
  12.289 +            (unsigned int)((unsigned char *)block)[5],
  12.290 +            (unsigned int)((unsigned char *)block)[6],
  12.291 +            (unsigned int)((unsigned char *)block)[7]);
  12.292 +#endif
  12.293 +
  12.294 +/* special case for the "superblock" just use the first block on the
  12.295 +     * first replica. (extend to blocks < 6 for vdi bug)
  12.296 +     */
  12.297 +    if (id < 6) {
  12.298 +        return writeblock_indiv(rep0, id, block);
  12.299 +    }
  12.300 +
  12.301 +    if (writeblock_indiv(rep0, BSID_REPLICA0(id), block) < 0)
  12.302 +        return -1;
  12.303 +    if (writeblock_indiv(rep1, BSID_REPLICA1(id), block) < 0)
  12.304 +        return -1;
  12.305 +    if (writeblock_indiv(rep2, BSID_REPLICA2(id), block) < 0)
  12.306 +        return -1;
  12.307 +    return 0;
  12.308 +}
  12.309 +
  12.310 +/**
  12.311 + * allocblock: write a new block to disk
  12.312 + *   @block: pointer to block
  12.313 + *
  12.314 + *   @return: new id of block on disk
  12.315 + */
  12.316 +u64 allocblock(void *block) {
  12.317 +    return allocblock_hint(block, 0);
  12.318 +}
  12.319 +
  12.320 +u64 allocblock_hint_indiv(int server, void *block, u64 hint) {
  12.321 +    bsq_t *qe;
  12.322 +    int len;
  12.323 +
  12.324 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  12.325 +    if (!qe) {
  12.326 +        perror("allocblock_hint qe malloc");
  12.327 +        goto err;
  12.328 +    }
  12.329 +    qe->server = server;
  12.330 +
  12.331 +    qe->message.operation = BSOP_ALLOCBLOCK;
  12.332 +    qe->message.flags = 0;
  12.333 +    qe->message.id = hint;
  12.334 +    //memcpy(qe->message.block, block, BLOCK_SIZE);
  12.335 +    qe->block = block;
  12.336 +    qe->length = MSGBUFSIZE_BLOCK;
  12.337 +
  12.338 +    if (send_message(qe) < 0) {
  12.339 +        perror("allocblock_hint sendto");
  12.340 +        goto err;
  12.341 +    }
  12.342 +    
  12.343 +    len = recv_message(qe);
  12.344 +    if (len < 0) {
  12.345 +        perror("allocblock_hint recv");
  12.346 +        goto err;
  12.347 +    }
  12.348 +    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
  12.349 +        fprintf(stderr, "allocblock_hint server error\n");
  12.350 +        goto err;
  12.351 +    }
  12.352 +    if (len < MSGBUFSIZE_ID) {
  12.353 +        fprintf(stderr, "allocblock_hint recv short (%u)\n", len);
  12.354 +        goto err;
  12.355 +    }
  12.356 +
  12.357 +    free((void *)qe);
  12.358 +    return qe->message.id;
  12.359 +
  12.360 +    err:
  12.361 +    free((void *)qe);
  12.362 +    return 0;
  12.363 +}
  12.364 +
  12.365 +/**
  12.366 + * allocblock_hint: write a new block to disk
  12.367 + *   @block: pointer to block
  12.368 + *   @hint: allocation hint
  12.369 + *
  12.370 + *   @return: new id of block on disk
  12.371 + */
  12.372 +u64 allocblock_hint(void *block, u64 hint) {
  12.373 +    int map = (int)hint;
  12.374 +    
  12.375 +    int rep0 = bsclusters[map].servers[0];
  12.376 +    int rep1 = bsclusters[map].servers[1];
  12.377 +    int rep2 = bsclusters[map].servers[2];
  12.378 +
  12.379 +    u64 id0, id1, id2;
  12.380 +
  12.381 +    id0 = allocblock_hint_indiv(rep0, block, 0);
  12.382 +    if (id0 == 0)
  12.383 +        return 0;
  12.384 +    id1 = allocblock_hint_indiv(rep1, block, 0);
  12.385 +    if (id1 == 0)
  12.386 +        return 0;
  12.387 +    id2 = allocblock_hint_indiv(rep2, block, 0);
  12.388 +    if (id2 == 0)
  12.389 +        return 0;
  12.390 +
  12.391 +#ifdef BSDEBUG
  12.392 +    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  12.393 +            BSID(map, id0, id1, id2),
  12.394 +            (unsigned int)((unsigned char *)block)[0],
  12.395 +            (unsigned int)((unsigned char *)block)[1],
  12.396 +            (unsigned int)((unsigned char *)block)[2],
  12.397 +            (unsigned int)((unsigned char *)block)[3],
  12.398 +            (unsigned int)((unsigned char *)block)[4],
  12.399 +            (unsigned int)((unsigned char *)block)[5],
  12.400 +            (unsigned int)((unsigned char *)block)[6],
  12.401 +            (unsigned int)((unsigned char *)block)[7]);
  12.402 +#endif
  12.403 +
  12.404 +    return BSID(map, id0, id1, id2);
  12.405 +}
  12.406 +
  12.407 +#else /* /BLOCKSTORE_REMOTE */
  12.408 +
  12.409  static int block_fp = -1;
  12.410   
  12.411  /**
  12.412 @@ -94,6 +496,18 @@ u64 allocblock(void *block) {
  12.413      return lb;
  12.414  }
  12.415  
  12.416 +/**
  12.417 + * allocblock_hint: write a new block to disk
  12.418 + *   @block: pointer to block
  12.419 + *   @hint: allocation hint
  12.420 + *
  12.421 + *   @return: new id of block on disk
  12.422 + */
  12.423 +u64 allocblock_hint(void *block, u64 hint) {
  12.424 +    return allocblock(block);
  12.425 +}
  12.426 +
  12.427 +#endif /* BLOCKSTORE_REMOTE */
  12.428  
  12.429  /**
  12.430   * newblock: get a new in-memory block set to zeros
  12.431 @@ -124,12 +538,92 @@ void freeblock(void *block) {
  12.432  
  12.433  int __init_blockstore(void)
  12.434  {
  12.435 +#ifdef BLOCKSTORE_REMOTE
  12.436 +    struct hostent *addr;
  12.437 +    int i;
  12.438 +
  12.439 +    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
  12.440 +    bsservers[1].hostname = "tetris.cl.cam.ac.uk";
  12.441 +    bsservers[2].hostname = "donkeykong.cl.cam.ac.uk";
  12.442 +    bsservers[3].hostname = "gunfighter.cl.cam.ac.uk";
  12.443 +    bsservers[4].hostname = "galaxian.cl.cam.ac.uk";
  12.444 +    bsservers[5].hostname = "firetrack.cl.cam.ac.uk";
  12.445 +    bsservers[6].hostname = "funfair.cl.cam.ac.uk";
  12.446 +    bsservers[7].hostname = "felix.cl.cam.ac.uk";
  12.447 +    bsservers[8].hostname = NULL;
  12.448 +    bsservers[9].hostname = NULL;
  12.449 +    bsservers[10].hostname = NULL;
  12.450 +    bsservers[11].hostname = NULL;
  12.451 +    bsservers[12].hostname = NULL;
  12.452 +    bsservers[13].hostname = NULL;
  12.453 +    bsservers[14].hostname = NULL;
  12.454 +    bsservers[15].hostname = NULL;
  12.455 +
  12.456 +    for (i = 0; i < MAX_SERVERS; i++) {
  12.457 +        if (!bsservers[i].hostname)
  12.458 +            continue;
  12.459 +        addr = gethostbyname(bsservers[i].hostname);
  12.460 +        if (!addr) {
  12.461 +            perror("bad hostname");
  12.462 +            return -1;
  12.463 +        }
  12.464 +        bsservers[i].sin.sin_family = addr->h_addrtype;
  12.465 +        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
  12.466 +        bsservers[i].sin.sin_addr.s_addr = 
  12.467 +            ((struct in_addr *)(addr->h_addr))->s_addr;
  12.468 +    }
  12.469 +
  12.470 +    /* Cluster map
  12.471 +     */
  12.472 +    bsclusters[0].servers[0] = 0;
  12.473 +    bsclusters[0].servers[1] = 1;
  12.474 +    bsclusters[0].servers[2] = 2;
  12.475 +    bsclusters[1].servers[0] = 1;
  12.476 +    bsclusters[1].servers[1] = 2;
  12.477 +    bsclusters[1].servers[2] = 3;
  12.478 +    bsclusters[2].servers[0] = 2;
  12.479 +    bsclusters[2].servers[1] = 3;
  12.480 +    bsclusters[2].servers[2] = 4;
  12.481 +    bsclusters[3].servers[0] = 3;
  12.482 +    bsclusters[3].servers[1] = 4;
  12.483 +    bsclusters[3].servers[2] = 5;
  12.484 +    bsclusters[4].servers[0] = 4;
  12.485 +    bsclusters[4].servers[1] = 5;
  12.486 +    bsclusters[4].servers[2] = 6;
  12.487 +    bsclusters[5].servers[0] = 5;
  12.488 +    bsclusters[5].servers[1] = 6;
  12.489 +    bsclusters[5].servers[2] = 7;
  12.490 +    bsclusters[6].servers[0] = 6;
  12.491 +    bsclusters[6].servers[1] = 7;
  12.492 +    bsclusters[6].servers[2] = 0;
  12.493 +    bsclusters[7].servers[0] = 7;
  12.494 +    bsclusters[7].servers[1] = 0;
  12.495 +    bsclusters[7].servers[2] = 1;
  12.496 +
  12.497 +    /* Local socket set up
  12.498 +     */
  12.499 +    bssock = socket(AF_INET, SOCK_DGRAM, 0);
  12.500 +    if (bssock < 0) {
  12.501 +        perror("Bad socket");
  12.502 +        return -1;
  12.503 +    }
  12.504 +    memset(&sin_local, 0, sizeof(sin_local));
  12.505 +    sin_local.sin_family = AF_INET;
  12.506 +    sin_local.sin_port = htons(BLOCKSTORED_PORT);
  12.507 +    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
  12.508 +    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
  12.509 +        perror("bind");
  12.510 +        close(bssock);
  12.511 +        return -1;
  12.512 +    }
  12.513 +
  12.514 +#else /* /BLOCKSTORE_REMOTE */
  12.515      block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  12.516  
  12.517      if (block_fp < 0) {
  12.518          perror("open");
  12.519          return -1;
  12.520      }
  12.521 -    
   12.522 +#endif /* BLOCKSTORE_REMOTE */
  12.523      return 0;
  12.524  }
    13.1 --- a/tools/blktap/blockstore.h	Wed Mar 16 05:46:44 2005 +0000
    13.2 +++ b/tools/blktap/blockstore.h	Tue Mar 15 23:44:44 2005 +0000
    13.3 @@ -9,6 +9,7 @@
    13.4  #ifndef __BLOCKSTORE_H__
    13.5  #define __BLOCKSTORE_H__
    13.6  
    13.7 +#include <netinet/in.h>
    13.8  #include <xc.h>
    13.9  
   13.10  #define BLOCK_SIZE  4096
   13.11 @@ -24,8 +25,83 @@
   13.12  extern void *newblock();
   13.13  extern void *readblock(u64 id);
   13.14  extern u64 allocblock(void *block);
   13.15 +extern u64 allocblock_hint(void *block, u64 hint);
   13.16  extern int writeblock(u64 id, void *block);
   13.17  extern void freeblock(void *block);
   13.18  extern int __init_blockstore(void);
   13.19  
   13.20 +#define ALLOCFAIL (((u64)(-1)))
   13.21 +
   13.22 +/* Distribution
   13.23 + */
   13.24 +#define BLOCKSTORED_PORT 9346
   13.25 +
   13.26 +struct bshdr_t_struct {
   13.27 +    u32            operation;
   13.28 +    u32            flags;
   13.29 +    u64            id;
   13.30 +} __attribute__ ((packed));
   13.31 +typedef struct bshdr_t_struct bshdr_t;
   13.32 +
   13.33 +struct bsmsg_t_struct {
   13.34 +    bshdr_t        hdr;
   13.35 +    unsigned char  block[BLOCK_SIZE];
   13.36 +} __attribute__ ((packed));
   13.37 +
   13.38 +typedef struct bsmsg_t_struct bsmsg_t;
   13.39 +
   13.40 +#define MSGBUFSIZE_OP    sizeof(u32)
   13.41 +#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
   13.42 +#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64))
   13.43 +#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
   13.44 +
   13.45 +#define BSOP_READBLOCK  0x01
   13.46 +#define BSOP_WRITEBLOCK 0x02
   13.47 +#define BSOP_ALLOCBLOCK 0x03
   13.48 +
   13.49 +#define BSOP_FLAG_ERROR 0x01
   13.50 +
   13.51 +#define BS_ALLOC_SKIP 10
   13.52 +#define BS_ALLOC_HACK
   13.53 +
   13.54 +/* Remote hosts and cluster map - XXX need to generalise
   13.55 + */
   13.56 +
   13.57 +/*
   13.58 +
   13.59 +  Interim ID format is
   13.60 +
   13.61 +  63 60 59                40 39                20 19                 0
   13.62 +  +----+--------------------+--------------------+--------------------+
   13.63 +  |map | replica 2          | replica 1          | replica 0          |
   13.64 +  +----+--------------------+--------------------+--------------------+
   13.65 +
   13.66 +  The map is an index into a table detailing which machines form the
   13.67 +  cluster.
   13.68 +
   13.69 + */
   13.70 +
   13.71 +#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
   13.72 +#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
   13.73 +#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
   13.74 +#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
   13.75 +
   13.76 +#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
   13.77 +                                         (((u64)(_rep2))<<40) | \
   13.78 +                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
   13.79 +
   13.80 +typedef struct bsserver_t_struct {
   13.81 +    char              *hostname;
   13.82 +    struct sockaddr_in sin;
   13.83 +} bsserver_t;
   13.84 +
   13.85 +#define MAX_SERVERS 16
   13.86 +
   13.87 +#define CLUSTER_MAX_REPLICAS 3
   13.88 +typedef struct bscluster_t_struct {
   13.89 +    int servers[CLUSTER_MAX_REPLICAS];
   13.90 +} bscluster_t;
   13.91 +
   13.92 +#define MAX_CLUSTERS 16
   13.93 +
   13.94  #endif /* __BLOCKSTORE_H__ */
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/tools/blktap/blockstored.c	Tue Mar 15 23:44:44 2005 +0000
    14.3 @@ -0,0 +1,276 @@
    14.4 +/**************************************************************************
    14.5 + * 
    14.6 + * blockstored.c
    14.7 + *
    14.8 + * Block store daemon.
    14.9 + *
   14.10 + */
   14.11 +
   14.12 +#include <fcntl.h>
   14.13 +#include <unistd.h>
   14.14 +#include <stdio.h>
   14.15 +#include <stdlib.h>
   14.16 +#include <string.h>
   14.17 +#include <sys/types.h>
   14.18 +#include <sys/stat.h>
   14.19 +#include <sys/socket.h>
   14.20 +#include <sys/ioctl.h>
   14.21 +#include <netinet/in.h>
   14.22 +#include <errno.h>
   14.23 +#include "blockstore.h"
   14.24 +
   14.25 +//#define BSDEBUG
   14.26 +
   14.27 +int readblock_into(u64 id, void *block);
   14.28 +
   14.29 +int open_socket(u16 port) {
   14.30 +    
   14.31 +    struct sockaddr_in sn;
   14.32 +    int sock;
   14.33 +
   14.34 +    sock = socket(AF_INET, SOCK_DGRAM, 0);
   14.35 +    if (sock < 0) {
   14.36 +        perror("Bad socket");
   14.37 +        return -1;
   14.38 +    }
   14.39 +    memset(&sn, 0, sizeof(sn));
   14.40 +    sn.sin_family = AF_INET;
   14.41 +    sn.sin_port = htons(port);
   14.42 +    sn.sin_addr.s_addr = htonl(INADDR_ANY);
   14.43 +    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
   14.44 +        perror("bind");
   14.45 +        close(sock);
   14.46 +        return -1;
   14.47 +    }
   14.48 +
   14.49 +    return sock;
   14.50 +}
   14.51 +
   14.52 +static int block_fp = -1;
   14.53 +static int bssock = -1;
   14.54 +
   14.55 +int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
   14.56 +
   14.57 +    int rc;
   14.58 +    
   14.59 +#ifdef BSDEBUG
   14.60 +    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
   14.61 +            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
   14.62 +#endif
   14.63 +    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
   14.64 +    if (rc < 0) {
   14.65 +        perror("send_reply");
   14.66 +        return 1;
   14.67 +    }
   14.68 +
   14.69 +
   14.70 +    return 0;
   14.71 +}
   14.72 +
   14.73 +static bsmsg_t msgbuf;
   14.74 +
   14.75 +void service_loop(void) {
   14.76 +
   14.77 +    for (;;) {
   14.78 +        int rc, len;
   14.79 +        struct sockaddr_in from;
   14.80 +        size_t slen = sizeof(from);
   14.81 +        u64 bid;
   14.82 +
   14.83 +        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
   14.84 +                       (struct sockaddr *)&from, &slen);
   14.85 +
   14.86 +        if (len < 0) {
   14.87 +            perror("recvfrom");
   14.88 +            continue;
   14.89 +        }
   14.90 +
   14.91 +        if (len < MSGBUFSIZE_OP) {
   14.92 +            fprintf(stderr, "Short packet.\n");
   14.93 +            continue;
   14.94 +        }
   14.95 +
   14.96 +#ifdef BSDEBUG
   14.97 +        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
   14.98 +                len, msgbuf.hdr.operation, msgbuf.hdr.id);
   14.99 +#endif
  14.100 +
  14.101 +        switch (msgbuf.hdr.operation) {
  14.102 +        case BSOP_READBLOCK:
  14.103 +            if (len < MSGBUFSIZE_ID) {
  14.104 +                fprintf(stderr, "Short packet (readblock %u).\n", len);
  14.105 +                continue;
  14.106 +            }
  14.107 +            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
  14.108 +            if (rc < 0) {
  14.109 +                fprintf(stderr, "readblock error\n");
  14.110 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  14.111 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  14.112 +                continue;
  14.113 +            }
  14.114 +            msgbuf.hdr.flags = 0;
  14.115 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
  14.116 +            break;
  14.117 +        case BSOP_WRITEBLOCK:
  14.118 +            if (len < MSGBUFSIZE_BLOCK) {
  14.119 +                fprintf(stderr, "Short packet (writeblock %u).\n", len);
  14.120 +                continue;
  14.121 +            }
  14.122 +            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
  14.123 +            if (rc < 0) {
  14.124 +                fprintf(stderr, "writeblock error\n");
  14.125 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  14.126 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  14.127 +                continue;
  14.128 +            }
  14.129 +            msgbuf.hdr.flags = 0;
  14.130 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  14.131 +            break;
  14.132 +        case BSOP_ALLOCBLOCK:
  14.133 +            if (len < MSGBUFSIZE_BLOCK) {
  14.134 +                fprintf(stderr, "Short packet (allocblock %u).\n", len);
  14.135 +                continue;
  14.136 +            }
  14.137 +            bid = allocblock(msgbuf.block);
  14.138 +            if (bid == ALLOCFAIL) {
  14.139 +                fprintf(stderr, "allocblock error\n");
  14.140 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  14.141 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  14.142 +                continue;
  14.143 +            }
  14.144 +            msgbuf.hdr.id = bid;
  14.145 +            msgbuf.hdr.flags = 0;
  14.146 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  14.147 +            break;
  14.148 +        }
  14.149 +
  14.150 +    }
  14.151 +}
  14.152 + 
  14.153 +/**
  14.154 + * readblock: read a block from disk
  14.155 + *   @id: block id to read
  14.156 + *   @block: pointer to buffer to receive block
  14.157 + *
  14.158 + *   @return: 0 if OK, other on error
  14.159 + */
  14.160 +
  14.161 +int readblock_into(u64 id, void *block) {
  14.162 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  14.163 +        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
  14.164 +        perror("readblock lseek");
  14.165 +        return -1;
  14.166 +    }
  14.167 +    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  14.168 +        perror("readblock read");
  14.169 +        return -1;
  14.170 +    }
  14.171 +    return 0;
  14.172 +}
  14.173 +
  14.174 +/**
  14.175 + * writeblock: write an existing block to disk
  14.176 + *   @id: block id
  14.177 + *   @block: pointer to block
  14.178 + *
  14.179 + *   @return: zero on success, -1 on failure
  14.180 + */
  14.181 +int writeblock(u64 id, void *block) {
  14.182 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  14.183 +        perror("writeblock lseek");
  14.184 +        return -1;
  14.185 +    }
  14.186 +    if (write(block_fp, block, BLOCK_SIZE) < 0) {
  14.187 +        perror("writeblock write");
  14.188 +        return -1;
  14.189 +    }
  14.190 +    return 0;
  14.191 +}
  14.192 +
  14.193 +/**
  14.194 + * allocblock: write a new block to disk
  14.195 + *   @block: pointer to block
  14.196 + *
  14.197 + *   @return: new id of block on disk
  14.198 + */
  14.199 +static u64 lastblock = 0;
  14.200 +
  14.201 +u64 allocblock(void *block) {
  14.202 +    u64 lb;
  14.203 +    off64_t pos;
  14.204 +
  14.205 +    retry:
  14.206 +    pos = lseek64(block_fp, 0, SEEK_END);
  14.207 +    if (pos == (off64_t)-1) {
  14.208 +        perror("allocblock lseek");
  14.209 +        return ALLOCFAIL;
  14.210 +    }
  14.211 +    if (pos % BLOCK_SIZE != 0) {
  14.212 +        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
  14.213 +        return ALLOCFAIL;
  14.214 +    }
  14.215 +    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  14.216 +        perror("allocblock write");
  14.217 +        return ALLOCFAIL;
  14.218 +    }
  14.219 +    lb = pos / BLOCK_SIZE + 1;
  14.220 +
  14.221 +#ifdef BS_ALLOC_HACK
  14.222 +    if (lb < BS_ALLOC_SKIP)
  14.223 +        goto retry;
  14.224 +#endif
  14.225 +    
  14.226 +    if (lb <= lastblock)
   14.227 +        printf("[*** %Ld already allocated! ***]\n", lb);
  14.228 +    
  14.229 +    lastblock = lb;
  14.230 +    return lb;
  14.231 +}
  14.232 +
  14.233 +/**
  14.234 + * newblock: get a new in-memory block set to zeros
  14.235 + *
  14.236 + *   @return: pointer to new block, NULL on error
  14.237 + */
  14.238 +void *newblock() {
  14.239 +    void *block = malloc(BLOCK_SIZE);
  14.240 +    if (block == NULL) {
  14.241 +        perror("newblock");
  14.242 +        return NULL;
  14.243 +    }
  14.244 +    memset(block, 0, BLOCK_SIZE);
  14.245 +    return block;
  14.246 +}
  14.247 +
  14.248 +
  14.249 +/**
  14.250 + * freeblock: unallocate an in-memory block
  14.251 + *   @id: block id (zero if this is only in-memory)
  14.252 + *   @block: block to be freed
  14.253 + */
  14.254 +void freeblock(void *block) {
  14.255 +    if (block != NULL)
  14.256 +        free(block);
  14.257 +}
  14.258 +
  14.259 +
  14.260 +int main(int argc, char **argv)
  14.261 +{
  14.262 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  14.263 +
  14.264 +    if (block_fp < 0) {
  14.265 +        perror("open");
  14.266 +        return -1;
  14.267 +    }
  14.268 +
  14.269 +    bssock = open_socket(BLOCKSTORED_PORT);
  14.270 +    if (bssock < 0) {
  14.271 +        return -1;
  14.272 +    }
  14.273 +
  14.274 +    service_loop();
  14.275 +    
  14.276 +    close(bssock);
  14.277 +
  14.278 +    return 0;
  14.279 +}
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/tools/blktap/bstest.c	Tue Mar 15 23:44:44 2005 +0000
    15.3 @@ -0,0 +1,191 @@
    15.4 +/**************************************************************************
    15.5 + * 
    15.6 + * bstest.c
    15.7 + *
    15.8 + * Block store daemon test program.
    15.9 + *
   15.10 + * usage: bstest <host>|X {r|w|a} ID 
   15.11 + *
   15.12 + */
   15.13 +
   15.14 +#include <fcntl.h>
   15.15 +#include <unistd.h>
   15.16 +#include <stdio.h>
   15.17 +#include <stdlib.h>
   15.18 +#include <string.h>
   15.19 +#include <sys/types.h>
   15.20 +#include <sys/stat.h>
   15.21 +#include <sys/socket.h>
   15.22 +#include <sys/ioctl.h>
   15.23 +#include <netinet/in.h>
   15.24 +#include <netdb.h>
   15.25 +#include <errno.h>
   15.26 +#include "blockstore.h"
   15.27 +
   15.28 +int direct(char *host, u32 op, u64 id, int len) {
   15.29 +    struct sockaddr_in sn, peer;
   15.30 +    int sock;
   15.31 +    bsmsg_t msgbuf;
   15.32 +    int rc, slen;
   15.33 +    struct hostent *addr;
   15.34 +
   15.35 +    addr = gethostbyname(host);
   15.36 +    if (!addr) {
   15.37 +        perror("bad hostname");
   15.38 +        exit(1);
   15.39 +    }
   15.40 +    peer.sin_family = addr->h_addrtype;
   15.41 +    peer.sin_port = htons(BLOCKSTORED_PORT);
   15.42 +    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
   15.43 +    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
   15.44 +            (unsigned int)(unsigned char)addr->h_addr[0],
   15.45 +            (unsigned int)(unsigned char)addr->h_addr[1],
   15.46 +            (unsigned int)(unsigned char)addr->h_addr[2],
   15.47 +            (unsigned int)(unsigned char)addr->h_addr[3]);
   15.48 +
   15.49 +    sock = socket(AF_INET, SOCK_DGRAM, 0);
   15.50 +    if (sock < 0) {
   15.51 +        perror("Bad socket");
   15.52 +        exit(1);
   15.53 +    }
   15.54 +    memset(&sn, 0, sizeof(sn));
   15.55 +    sn.sin_family = AF_INET;
   15.56 +    sn.sin_port = htons(BLOCKSTORED_PORT);
   15.57 +    sn.sin_addr.s_addr = htonl(INADDR_ANY);
   15.58 +    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
   15.59 +        perror("bind");
   15.60 +        close(sock);
   15.61 +        exit(1);
   15.62 +    }
   15.63 +
   15.64 +    memset((void *)&msgbuf, 0, sizeof(msgbuf));
   15.65 +    msgbuf.operation = op;
   15.66 +    msgbuf.id = id;
   15.67 +
   15.68 +    rc = sendto(sock, (void *)&msgbuf, len, 0,
   15.69 +                (struct sockaddr *)&peer, sizeof(peer));
   15.70 +    if (rc < 0) {
   15.71 +        perror("sendto");
   15.72 +        exit(1);
   15.73 +    }
   15.74 +
   15.75 +    slen = sizeof(peer);
   15.76 +    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
   15.77 +                   (struct sockaddr *)&peer, &slen);
   15.78 +    if (len < 0) {
   15.79 +        perror("recvfrom");
   15.80 +        exit(1);
   15.81 +    }
   15.82 +
   15.83 +    printf("Reply %u bytes:\n", len);
   15.84 +    if (len >= MSGBUFSIZE_OP)
   15.85 +        printf("  operation: %u\n", msgbuf.operation);
   15.86 +    if (len >= MSGBUFSIZE_FLAGS)
   15.87 +        printf("  flags: 0x%x\n", msgbuf.flags);
   15.88 +    if (len >= MSGBUFSIZE_ID)
   15.89 +        printf("  id: %llu\n", msgbuf.id);
   15.90 +    if (len >= (MSGBUFSIZE_ID + 4))
   15.91 +        printf("  data: %02x %02x %02x %02x...\n",
   15.92 +               (unsigned int)msgbuf.block[0],
   15.93 +               (unsigned int)msgbuf.block[1],
   15.94 +               (unsigned int)msgbuf.block[2],
   15.95 +               (unsigned int)msgbuf.block[3]);
   15.96 +    
   15.97 +    if (sock > 0)
   15.98 +        close(sock);
   15.99 +   
  15.100 +    return 0;
  15.101 +}
  15.102 +
  15.103 +int main (int argc, char **argv) {
  15.104 +
  15.105 +    u32 op = 0;
  15.106 +    u64 id = 0;
  15.107 +    int len = 0, rc;
  15.108 +    void *block;
  15.109 +
  15.110 +    if (argc < 3) {
  15.111 +        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
  15.112 +        return 1;
  15.113 +    }
  15.114 +
  15.115 +    switch (argv[2][0]) {
  15.116 +    case 'r':
  15.117 +    case 'R':
  15.118 +        op = BSOP_READBLOCK;
  15.119 +        len = MSGBUFSIZE_ID;
  15.120 +        break;
  15.121 +    case 'w':
  15.122 +    case 'W':
  15.123 +        op = BSOP_WRITEBLOCK;
  15.124 +        len = MSGBUFSIZE_BLOCK;
  15.125 +        break;
  15.126 +    case 'a':
  15.127 +    case 'A':
  15.128 +        op = BSOP_ALLOCBLOCK;
  15.129 +        len = MSGBUFSIZE_BLOCK;
  15.130 +        break;
  15.131 +    default:
  15.132 +        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
  15.133 +        return 1;
  15.134 +    }
  15.135 +
  15.136 +    if (argc >= 4)
  15.137 +        id = atoll(argv[3]);
  15.138 +
  15.139 +    if (strcmp(argv[1], "X") == 0) {
  15.140 +        rc = __init_blockstore();
  15.141 +        if (rc < 0) {
  15.142 +            fprintf(stderr, "blockstore init failed.\n");
  15.143 +            return 1;
  15.144 +        }
  15.145 +        switch(op) {
  15.146 +        case BSOP_READBLOCK:
  15.147 +            block = readblock(id);
  15.148 +            if (block) {
  15.149 +                printf("data: %02x %02x %02x %02x...\n",
  15.150 +                       (unsigned int)((unsigned char*)block)[0],
  15.151 +                       (unsigned int)((unsigned char*)block)[1],
  15.152 +                       (unsigned int)((unsigned char*)block)[2],
  15.153 +                       (unsigned int)((unsigned char*)block)[3]);
  15.154 +            }
  15.155 +            break;
  15.156 +        case BSOP_WRITEBLOCK:
  15.157 +            block = malloc(BLOCK_SIZE);
  15.158 +            if (!block) {
  15.159 +                perror("bstest malloc");
  15.160 +                return 1;
  15.161 +            }
  15.162 +            memset(block, 0, BLOCK_SIZE);
  15.163 +            rc = writeblock(id, block);
  15.164 +            if (rc != 0) {
  15.165 +                printf("error\n");
  15.166 +            }
  15.167 +            else {
  15.168 +                printf("OK\n");
  15.169 +            }
  15.170 +            break;
  15.171 +        case BSOP_ALLOCBLOCK:
  15.172 +            block = malloc(BLOCK_SIZE);
  15.173 +            if (!block) {
  15.174 +                perror("bstest malloc");
  15.175 +                return 1;
  15.176 +            }
  15.177 +            memset(block, 0, BLOCK_SIZE);
  15.178 +            id = allocblock_hint(block, id);
  15.179 +            if (id == 0) {
  15.180 +                printf("error\n");
  15.181 +            }
  15.182 +            else {
  15.183 +                printf("ID: %llu\n", id);
  15.184 +            }
  15.185 +            break;
  15.186 +        }
  15.187 +    }
  15.188 +    else {
  15.189 +        direct(argv[1], op, id, len);
  15.190 +    }
  15.191 +
  15.192 +
  15.193 +    return 0;
  15.194 +}
    16.1 --- a/tools/misc/xend	Wed Mar 16 05:46:44 2005 +0000
    16.2 +++ b/tools/misc/xend	Tue Mar 15 23:44:44 2005 +0000
    16.3 @@ -101,11 +101,16 @@ def check_user():
    16.4  def xcs_running():
    16.5      """ See if the control switch is running.
    16.6      """	
    16.7 +    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    16.8      try:
    16.9 -	xcs_pidfile = open(XCS_PIDFILE)
   16.10 -    except IOError:
   16.11 -	return(0)
   16.12 -    xcs_pidfile.close()
   16.13 +        s.connect( (XCS_PATH) )
   16.14 +        s.close()
   16.15 +    except:
   16.16 +        try:
   16.17 +            os.remove(XCS_PIDFILE)
   16.18 +        except:
   16.19 +            pass
   16.20 +	return 0
   16.21      return 1
   16.22      
   16.23  def start_xcs():
   16.24 @@ -113,6 +118,8 @@ def start_xcs():
   16.25          if os.fork():
   16.26              time.sleep(0.1) # let xcs start
   16.27          else:
   16.28 +            if not os.path.isdir(os.path.dirname(XCS_PATH)):
   16.29 +                os.makedirs(os.path.dirname(XCS_PATH))
   16.30              try:
   16.31                  os.execvp(XCS_EXEC, XCS_ARGS)
   16.32              except:
   16.33 @@ -129,12 +136,11 @@ def start_xcs():
   16.34  def stop_xcs():
   16.35      try:
   16.36  	xcs_pidfile = open(XCS_PIDFILE)
   16.37 -    except IOError:
   16.38 -	return
   16.39 -    xcs_pid = int(xcs_pidfile.read().strip())
   16.40 -    os.kill(xcs_pid, signal.SIGTERM)
   16.41 -    xcs_pidfile.close()
   16.42 -    
   16.43 +        xcs_pid = int(xcs_pidfile.read().strip())
   16.44 +        os.kill(xcs_pid, signal.SIGTERM)
   16.45 +        xcs_pidfile.close()
   16.46 +    except:
   16.47 +	return    
   16.48              
   16.49  def main():
   16.50      try:
    17.1 --- a/tools/tests/test_x86_emulator.c	Wed Mar 16 05:46:44 2005 +0000
    17.2 +++ b/tools/tests/test_x86_emulator.c	Tue Mar 15 23:44:44 2005 +0000
    17.3 @@ -26,7 +26,7 @@ static int read_any(
    17.4      case 4: *val = *(u32 *)addr; break;
    17.5      case 8: *val = *(unsigned long *)addr; break;
    17.6      }
    17.7 -    return 0;
    17.8 +    return X86EMUL_CONTINUE;
    17.9  }
   17.10  
   17.11  static int write_any(
   17.12 @@ -41,17 +41,15 @@ static int write_any(
   17.13      case 4: *(u32 *)addr = (u32)val; break;
   17.14      case 8: *(unsigned long *)addr = val; break;
   17.15      }
   17.16 -    return 0;
   17.17 +    return X86EMUL_CONTINUE;
   17.18  }
   17.19  
   17.20  static int cmpxchg_any(
   17.21      unsigned long addr,
   17.22      unsigned long old,
   17.23      unsigned long new,
   17.24 -    unsigned long *seen,
   17.25      unsigned int bytes)
   17.26  {
   17.27 -    *seen = old;
   17.28      switch ( bytes )
   17.29      {
   17.30      case 1: *(u8 *)addr = (u8)new; break;
   17.31 @@ -59,7 +57,7 @@ static int cmpxchg_any(
   17.32      case 4: *(u32 *)addr = (u32)new; break;
   17.33      case 8: *(unsigned long *)addr = new; break;
   17.34      }
   17.35 -    return 0;
   17.36 +    return X86EMUL_CONTINUE;
   17.37  }
   17.38  
   17.39  static struct x86_mem_emulator emulops = {
    18.1 --- a/xen/arch/x86/mm.c	Wed Mar 16 05:46:44 2005 +0000
    18.2 +++ b/xen/arch/x86/mm.c	Tue Mar 15 23:44:44 2005 +0000
    18.3 @@ -101,6 +101,7 @@
    18.4  #include <asm/uaccess.h>
    18.5  #include <asm/domain_page.h>
    18.6  #include <asm/ldt.h>
    18.7 +#include <asm/x86_emulate.h>
    18.8  
    18.9  #ifdef VERBOSE
   18.10  #define MEM_LOG(_f, _a...)                           \
   18.11 @@ -265,8 +266,7 @@ int map_ldt_shadow_page(unsigned int off
   18.12  #define TOGGLE_MODE() ((void)0)
   18.13  #endif
   18.14  
   18.15 -    if ( unlikely(in_irq()) )
   18.16 -        BUG();
   18.17 +    BUG_ON(unlikely(in_irq()));
   18.18  
   18.19      TOGGLE_MODE();
   18.20      __get_user(l1e, (unsigned long *)
   18.21 @@ -1939,12 +1939,13 @@ void update_shadow_va_mapping(unsigned l
   18.22          &shadow_linear_pg_table[l1_linear_offset(va)])))) )
   18.23      {
   18.24          /*
   18.25 -         * Since L2's are guranteed RW, failure indicates either that the
   18.26 +         * Since L2's are guaranteed RW, failure indicates either that the
   18.27           * page was not shadowed, or that the L2 entry has not yet been
   18.28           * updated to reflect the shadow.
   18.29           */
   18.30 -        if ( shadow_mode_external(current->domain) )
   18.31 -            BUG(); // can't use linear_l2_table with external tables.
   18.32 +
   18.33 +        /* Can't use linear_l2_table with external tables. */
   18.34 +        BUG_ON(shadow_mode_external(current->domain));
   18.35  
   18.36          l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
   18.37          unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
   18.38 @@ -2294,9 +2295,7 @@ void ptwr_flush(const int which)
   18.39      int            i, cpu = smp_processor_id();
   18.40      struct exec_domain *ed = current;
   18.41      struct domain *d = ed->domain;
   18.42 -#ifdef PERF_COUNTERS
   18.43      unsigned int   modified = 0;
   18.44 -#endif
   18.45  
   18.46      l1va = ptwr_info[cpu].ptinfo[which].l1va;
   18.47      ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
   18.48 @@ -2344,11 +2343,7 @@ void ptwr_flush(const int which)
   18.49  
   18.50      /* Ensure that there are no stale writable mappings in any TLB. */
   18.51      /* NB. INVLPG is a serialising instruction: flushes pending updates. */
   18.52 -#if 1
   18.53      __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
   18.54 -#else
   18.55 -    flush_tlb_all();
   18.56 -#endif
   18.57      PTWR_PRINTK("[%c] disconnected_l1va at %p now %p\n",
   18.58                  PTWR_PRINT_WHICH, ptep, pte);
   18.59  
   18.60 @@ -2365,10 +2360,8 @@ void ptwr_flush(const int which)
   18.61          if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
   18.62              continue;
   18.63  
   18.64 -#ifdef PERF_COUNTERS
   18.65          /* Update number of entries modified. */
   18.66          modified++;
   18.67 -#endif
   18.68  
   18.69          /*
   18.70           * Fast path for PTEs that have merely been write-protected
   18.71 @@ -2411,6 +2404,8 @@ void ptwr_flush(const int which)
   18.72      unmap_domain_mem(pl1e);
   18.73  
   18.74      perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
   18.75 +    ptwr_info[cpu].ptinfo[which].prev_exec_domain = ed;
   18.76 +    ptwr_info[cpu].ptinfo[which].prev_nr_updates  = modified;
   18.77  
   18.78      /*
   18.79       * STEP 3. Reattach the L1 p.t. page into the current address space.
   18.80 @@ -2435,6 +2430,133 @@ void ptwr_flush(const int which)
   18.81      }
   18.82  }
   18.83  
   18.84 +static int ptwr_emulated_update(
   18.85 +    unsigned long addr,
   18.86 +    unsigned long old,
   18.87 +    unsigned long val,
   18.88 +    unsigned int bytes,
   18.89 +    unsigned int do_cmpxchg)
   18.90 +{
   18.91 +    unsigned long sstat, pte, pfn;
   18.92 +    struct pfn_info *page;
   18.93 +    l1_pgentry_t ol1e, nl1e, *pl1e, *sl1e;
   18.94 +    struct domain *d = current->domain;
   18.95 +
   18.96 +    /* Aligned access only, thank you. */
   18.97 +    if ( (addr & (bytes-1)) != 0 )
   18.98 +    {
   18.99 +        MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %p)\n",
  18.100 +                bytes, addr);
  18.101 +        return X86EMUL_UNHANDLEABLE;
  18.102 +    }
  18.103 +
  18.104 +    /* Turn a sub-word access into a full-word access. */
  18.105 +    if ( (addr & ((BITS_PER_LONG/8)-1)) != 0 )
  18.106 +    {
  18.107 +        int           rc;
  18.108 +        unsigned long full;
  18.109 +        unsigned int  mask = addr & ((BITS_PER_LONG/8)-1);
  18.110 +        /* Align address; read full word. */
  18.111 +        addr &= ~((BITS_PER_LONG/8)-1);
  18.112 +        if ( (rc = x86_emulate_read_std(addr, &full, BITS_PER_LONG/8)) )
  18.113 +            return rc;
  18.114 +        /* Mask out bits provided by caller. */
  18.115 +        full &= ~((1UL << (bytes*8)) - 1UL) << (mask*8);
  18.116 +        /* Shift the caller value and OR in the missing bits. */
  18.117 +        val  &= (1UL << (bytes*8)) - 1UL;
  18.118 +        val <<= mask*8;
  18.119 +        val  |= full;
  18.120 +    }
  18.121 +
  18.122 +    /* Read the PTE that maps the page being updated. */
  18.123 +    if ( __get_user(pte, (unsigned long *)
  18.124 +                    &linear_pg_table[l1_linear_offset(addr)]) )
  18.125 +    {
  18.126 +        MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table\n");
  18.127 +        return X86EMUL_UNHANDLEABLE;
  18.128 +    }
  18.129 +
  18.130 +    pfn  = pte >> PAGE_SHIFT;
  18.131 +    page = &frame_table[pfn];
  18.132 +
  18.133 +    /* We are looking only for read-only mappings of p.t. pages. */
  18.134 +    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
  18.135 +         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
  18.136 +    {
  18.137 +        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%p, %x)\n",
  18.138 +                pte, page->u.inuse.type_info);
  18.139 +        return X86EMUL_UNHANDLEABLE;
  18.140 +    }
  18.141 +
  18.142 +    /* Check the new PTE. */
  18.143 +    nl1e = mk_l1_pgentry(val);
  18.144 +    if ( unlikely(!get_page_from_l1e(nl1e, d)) )
  18.145 +        return X86EMUL_UNHANDLEABLE;
  18.146 +
  18.147 +    /* Checked successfully: do the update (write or cmpxchg). */
  18.148 +    pl1e = map_domain_mem(page_to_phys(page) + (addr & ~PAGE_MASK));
  18.149 +    if ( do_cmpxchg )
  18.150 +    {
  18.151 +        ol1e = mk_l1_pgentry(old);
  18.152 +        if ( cmpxchg((unsigned long *)pl1e, old, val) != old )
  18.153 +        {
  18.154 +            unmap_domain_mem(pl1e);
  18.155 +            return X86EMUL_CMPXCHG_FAILED;
  18.156 +        }
  18.157 +    }
  18.158 +    else
  18.159 +    {
  18.160 +        ol1e  = *pl1e;
  18.161 +        *pl1e = nl1e;
  18.162 +    }
  18.163 +    unmap_domain_mem(pl1e);
  18.164 +
  18.165 +    /* Propagate update to shadow cache. */
  18.166 +    if ( unlikely(shadow_mode_enabled(d)) )
  18.167 +    {
  18.168 +        sstat = get_shadow_status(d, page_to_pfn(page));
  18.169 +        if ( sstat & PSH_shadowed )
  18.170 +        {
  18.171 +            sl1e = map_domain_mem(
  18.172 +                ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
  18.173 +            l1pte_propagate_from_guest(
  18.174 +                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(*sl1e));
  18.175 +            unmap_domain_mem(sl1e);
  18.176 +        }
  18.177 +    }
  18.178 +
  18.179 +    /* Finally, drop the old PTE. */
  18.180 +    if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
  18.181 +        put_page_from_l1e(ol1e, d);
  18.182 +
  18.183 +    return X86EMUL_CONTINUE;
  18.184 +}
  18.185 +
  18.186 +static int ptwr_emulated_write(
  18.187 +    unsigned long addr,
  18.188 +    unsigned long val,
  18.189 +    unsigned int bytes)
  18.190 +{
  18.191 +    return ptwr_emulated_update(addr, 0, val, bytes, 0);
  18.192 +}
  18.193 +
  18.194 +static int ptwr_emulated_cmpxchg(
  18.195 +    unsigned long addr,
  18.196 +    unsigned long old,
  18.197 +    unsigned long new,
  18.198 +    unsigned int bytes)
  18.199 +{
  18.200 +    return ptwr_emulated_update(addr, old, new, bytes, 1);
  18.201 +}
  18.202 +
/*
 * Memory-access callbacks handed to x86_emulate_memop() when a pagetable
 * write must be emulated. Reads go straight through via
 * x86_emulate_read_std; writes and cmpxchg are validated and applied by
 * the ptwr_emulated_* handlers.
 */
static struct x86_mem_emulator ptwr_mem_emulator = {
    .read_std         = x86_emulate_read_std,
    .write_std        = x86_emulate_write_std,
    .read_emulated    = x86_emulate_read_std,
    .write_emulated   = ptwr_emulated_write,
    .cmpxchg_emulated = ptwr_emulated_cmpxchg
};
  18.210 +
  18.211  /* Write page fault handler: check if guest is trying to modify a PTE. */
  18.212  int ptwr_do_page_fault(unsigned long addr)
  18.213  {
  18.214 @@ -2448,13 +2570,13 @@ int ptwr_do_page_fault(unsigned long add
  18.215      return 0; /* Writable pagetables need fixing for x86_64. */
  18.216  #endif
  18.217  
  18.218 +    /* Can't use linear_l2_table with external tables. */
  18.219 +    BUG_ON(shadow_mode_external(current->domain));
  18.220 +
  18.221      /*
  18.222       * Attempt to read the PTE that maps the VA being accessed. By checking for
  18.223       * PDE validity in the L2 we avoid many expensive fixups in __get_user().
  18.224       */
  18.225 -    if ( shadow_mode_external(current->domain) )
  18.226 -        BUG(); // can't use linear_l2_table with external tables.
  18.227 -
  18.228      if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
  18.229             _PAGE_PRESENT) ||
  18.230           __get_user(pte, (unsigned long *)
  18.231 @@ -2472,47 +2594,35 @@ int ptwr_do_page_fault(unsigned long add
  18.232      {
  18.233          return 0;
  18.234      }
  18.235 -    
  18.236 +
  18.237      /* Get the L2 index at which this L1 p.t. is always mapped. */
  18.238      l2_idx = page->u.inuse.type_info & PGT_va_mask;
  18.239      if ( unlikely(l2_idx >= PGT_va_unknown) )
  18.240 -    {
  18.241 -        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
  18.242 -    }
  18.243 +        goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
  18.244      l2_idx >>= PGT_va_shift;
  18.245  
  18.246 -    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
  18.247 -    {
  18.248 -        MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
  18.249 -        domain_crash();
  18.250 -    }
  18.251 +    if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) )
  18.252 +        goto emulate; /* Urk! Pagetable maps itself! */
  18.253  
  18.254      /*
  18.255       * Is the L1 p.t. mapped into the current address space? If so we call it
  18.256       * an ACTIVE p.t., otherwise it is INACTIVE.
  18.257       */
  18.258 -    if ( shadow_mode_external(current->domain) )
  18.259 -        BUG(); // can't use linear_l2_table with external tables.
  18.260 -
  18.261      pl2e = &linear_l2_table[l2_idx];
  18.262      l2e  = l2_pgentry_val(*pl2e);
  18.263      which = PTWR_PT_INACTIVE;
  18.264      if ( (l2e >> PAGE_SHIFT) == pfn )
  18.265      {
  18.266 -        /* Check the PRESENT bit to set ACTIVE. */
  18.267 -        if ( likely(l2e & _PAGE_PRESENT) )
  18.268 +        /*
  18.269 +         * Check the PRESENT bit to set ACTIVE mode.
  18.270 +         * If the PRESENT bit is clear, we may be conflicting with the current 
  18.271 +         * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
  18.272 +         * The ptwr_flush call below will restore the PRESENT bit.
  18.273 +         */
  18.274 +        if ( likely(l2e & _PAGE_PRESENT) ||
  18.275 +             (ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
  18.276 +              (l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx)) )
  18.277              which = PTWR_PT_ACTIVE;
  18.278 -        else {
  18.279 -            /*
  18.280 -             * If the PRESENT bit is clear, we may be conflicting with
  18.281 -             * the current ACTIVE p.t. (it may be the same p.t. mapped
  18.282 -             * at another virt addr).
  18.283 -             * The ptwr_flush call below will restore the PRESENT bit.
  18.284 -             */
  18.285 -            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
  18.286 -                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
  18.287 -                which = PTWR_PT_ACTIVE;
  18.288 -        }
  18.289      }
  18.290      
  18.291      PTWR_PRINTK("[%c] page_fault on l1 pt at va %p, pt for %08x, "
  18.292 @@ -2526,6 +2636,18 @@ int ptwr_do_page_fault(unsigned long add
  18.293      if ( ptwr_info[cpu].ptinfo[which].l1va )
  18.294          ptwr_flush(which);
  18.295  
  18.296 +    /*
  18.297 +     * If last batch made no updates then we are probably stuck. Emulate this 
  18.298 +     * update to ensure we make progress.
  18.299 +     */
  18.300 +    if ( (ptwr_info[cpu].ptinfo[which].prev_exec_domain == current) &&
  18.301 +         (ptwr_info[cpu].ptinfo[which].prev_nr_updates  == 0) )
  18.302 +    {
  18.303 +        /* Force non-emul next time, or we can get stuck emulating forever. */
  18.304 +        ptwr_info[cpu].ptinfo[which].prev_exec_domain = NULL;
  18.305 +        goto emulate;
  18.306 +    }
  18.307 +
  18.308      ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
  18.309      ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
  18.310      
  18.311 @@ -2534,11 +2656,7 @@ int ptwr_do_page_fault(unsigned long add
  18.312           likely(!shadow_mode_enabled(current->domain)) )
  18.313      {
  18.314          *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
  18.315 -#if 1
  18.316          flush_tlb(); /* XXX Multi-CPU guests? */
  18.317 -#else
  18.318 -        flush_tlb_all();
  18.319 -#endif
  18.320      }
  18.321      
  18.322      /* Temporarily map the L1 page, and make a copy of it. */
  18.323 @@ -2563,6 +2681,13 @@ int ptwr_do_page_fault(unsigned long add
  18.324      }
  18.325      
  18.326      return EXCRET_fault_fixed;
  18.327 +
  18.328 + emulate:
  18.329 +    if ( x86_emulate_memop(get_execution_context(), addr,
  18.330 +                           &ptwr_mem_emulator, BITS_PER_LONG/8) )
  18.331 +        return 0;
  18.332 +    perfc_incrc(ptwr_emulations);
  18.333 +    return EXCRET_fault_fixed;
  18.334  }
  18.335  
  18.336  static __init int ptwr_init(void)
  18.337 @@ -2762,8 +2887,7 @@ void audit_domain(struct domain *d)
  18.338          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  18.339          page = &frame_table[pfn];
  18.340  
  18.341 -        if ( page_get_owner(page) != d )
  18.342 -            BUG();
  18.343 +        BUG_ON(page_get_owner(page) != d);
  18.344  
  18.345          if ( (page->u.inuse.type_info & PGT_count_mask) >
  18.346               (page->count_info & PGC_count_mask) )
  18.347 @@ -2809,8 +2933,7 @@ void audit_domain(struct domain *d)
  18.348          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  18.349          page = &frame_table[pfn];
  18.350  
  18.351 -        if ( page_get_owner(page) != d )
  18.352 -            BUG();
  18.353 +        BUG_ON(page_get_owner(page) != d);
  18.354  
  18.355          switch ( page->u.inuse.type_info & PGT_type_mask )
  18.356          {
  18.357 @@ -3060,7 +3183,10 @@ void audit_domain(struct domain *d)
  18.358              d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
  18.359  
  18.360      spin_unlock(&d->page_alloc_lock);
  18.361 -    printk("Audit %d: Done. ref=%d xenpages=%d pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, atomic_read(&d->refcnt), d->xenheap_pages, d->tot_pages, l1, l2, ctot, ttot );
  18.362 +    printk("Audit %d: Done. ref=%d xenpages=%d pages=%d l1=%d"
  18.363 +           " l2=%d ctot=%d ttot=%d\n", 
  18.364 +           d->id, atomic_read(&d->refcnt), d->xenheap_pages, d->tot_pages,
  18.365 +           l1, l2, ctot, ttot );
  18.366  
  18.367      if ( d != current->domain )
  18.368          domain_unpause(d);
    19.1 --- a/xen/arch/x86/x86_32/mm.c	Wed Mar 16 05:46:44 2005 +0000
    19.2 +++ b/xen/arch/x86/x86_32/mm.c	Tue Mar 15 23:44:44 2005 +0000
    19.3 @@ -274,10 +274,25 @@ int check_descriptor(struct desc_struct 
    19.4      if ( (b & _SEGMENT_G) )
    19.5          limit <<= 12;
    19.6  
    19.7 -    switch ( b & (_SEGMENT_CODE | _SEGMENT_EC) )
    19.8 +    if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC )
    19.9      {
   19.10 -    case 0: /* Data segment, grows-up */
   19.11          /*
   19.12 +         * DATA, GROWS-DOWN.
   19.13 +         * Grows-down limit check. 
   19.14 +         * NB. limit == 0xFFFFF provides no access      (if G=1).
   19.15 +         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
   19.16 +         */
   19.17 +        if ( (base + limit) > base )
   19.18 +        {
   19.19 +            limit = -(base & PAGE_MASK);
   19.20 +            goto truncate;
   19.21 +        }
   19.22 +    }
   19.23 +    else
   19.24 +    {
   19.25 +        /*
   19.26 +         * DATA, GROWS-UP. 
   19.27 +         * CODE (CONFORMING AND NON-CONFORMING).
   19.28           * Grows-up limit check.
   19.29           * NB. limit == 0xFFFFF provides 4GB access (if G=1).
   19.30           *     limit == 0x00000 provides 4kB access (if G=1).
   19.31 @@ -293,23 +308,6 @@ int check_descriptor(struct desc_struct 
   19.32              d->a &= ~0x0ffff; d->a |= limit & 0x0ffff;
   19.33              d->b &= ~0xf0000; d->b |= limit & 0xf0000;
   19.34          }
   19.35 -        goto good;
   19.36 -    case _SEGMENT_EC: /* Data segment, grows-down */
   19.37 -        /*
   19.38 -         * Grows-down limit check. 
   19.39 -         * NB. limit == 0xFFFFF provides no access      (if G=1).
   19.40 -         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
   19.41 -         */
   19.42 -        if ( (base + limit) > base )
   19.43 -        {
   19.44 -            limit = -(base & PAGE_MASK);
   19.45 -            goto truncate;
   19.46 -        }
   19.47 -        goto good;
   19.48 -    case _SEGMENT_CODE: /* Code segment, non-conforming */
   19.49 -        goto good;
   19.50 -    case _SEGMENT_CODE|_SEGMENT_EC: /* Code segment, conforming */
   19.51 -        goto bad;
   19.52      }
   19.53  
   19.54   good:
    20.1 --- a/xen/arch/x86/x86_64/mm.c	Wed Mar 16 05:46:44 2005 +0000
    20.2 +++ b/xen/arch/x86/x86_64/mm.c	Tue Mar 15 23:44:44 2005 +0000
    20.3 @@ -287,14 +287,9 @@ int check_descriptor(struct desc_struct 
    20.4      if ( (b & _SEGMENT_DPL) != 3 )
    20.5          goto bad;
    20.6  
    20.7 -    /* Most code and data segments are okay. No base/limit checking. */
    20.8 +    /* All code and data segments are okay. No base/limit checking. */
    20.9      if ( (b & _SEGMENT_S) )
   20.10 -    {
   20.11 -        /* Disallow conforming code segments. I'm not sure they're safe. */
   20.12 -        if ( (b & (_SEGMENT_CODE|_SEGMENT_EC)) == (_SEGMENT_CODE|_SEGMENT_EC) )
   20.13 -            goto bad;
   20.14          goto good;
   20.15 -    }
   20.16  
   20.17      /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
   20.18      if ( (b & _SEGMENT_TYPE) == 0x000 )
    21.1 --- a/xen/arch/x86/x86_emulate.c	Wed Mar 16 05:46:44 2005 +0000
    21.2 +++ b/xen/arch/x86/x86_emulate.c	Tue Mar 15 23:44:44 2005 +0000
    21.3 @@ -363,7 +363,7 @@ do{ __asm__ __volatile__ (              
    21.4  /* Fetch next part of the instruction being emulated. */
    21.5  #define insn_fetch(_type, _size, _eip) \
    21.6  ({ unsigned long _x; \
    21.7 -   if ( ops->read_std((unsigned long)(_eip), &_x, (_size)) ) \
    21.8 +   if ( (rc = ops->read_std((unsigned long)(_eip), &_x, (_size))) != 0 ) \
    21.9         goto done; \
   21.10     (_eip) += (_size); \
   21.11     (_type)_x; \
   21.12 @@ -422,6 +422,7 @@ x86_emulate_memop(
   21.13      u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
   21.14      unsigned int op_bytes = (mode == 8) ? 4 : mode, ad_bytes = mode;
   21.15      unsigned int lock_prefix = 0, rep_prefix = 0, i;
   21.16 +    int rc = 0;
   21.17      struct operand src, dst;
   21.18  
   21.19      /* Shadow copy of register state. Committed on successful emulation. */
   21.20 @@ -556,7 +557,8 @@ x86_emulate_memop(
   21.21          dst.ptr   = (unsigned long *)cr2;
   21.22          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
   21.23          if ( !(d & Mov) && /* optimisation - avoid slow emulated read */
   21.24 -             ops->read_emulated((unsigned long)dst.ptr, &dst.val, dst.bytes) )
   21.25 +             ((rc = ops->read_emulated((unsigned long)dst.ptr,
   21.26 +                                       &dst.val, dst.bytes)) != 0) )
   21.27               goto done;
   21.28          break;
   21.29      }
   21.30 @@ -590,7 +592,8 @@ x86_emulate_memop(
   21.31          src.type  = OP_MEM;
   21.32          src.ptr   = (unsigned long *)cr2;
   21.33          src.bytes = (d & ByteOp) ? 1 : op_bytes;
   21.34 -        if ( ops->read_emulated((unsigned long)src.ptr, &src.val, src.bytes) )
   21.35 +        if ( (rc = ops->read_emulated((unsigned long)src.ptr, 
   21.36 +                                      &src.val, src.bytes)) != 0 )
   21.37              goto done;
   21.38          src.orig_val = src.val;
   21.39          break;
   21.40 @@ -664,6 +667,7 @@ x86_emulate_memop(
   21.41          src.val ^= dst.val;
   21.42          dst.val ^= src.val;
   21.43          src.val ^= dst.val;
   21.44 +        lock_prefix = 1;
   21.45          break;
   21.46      case 0xa0 ... 0xa1: /* mov */
   21.47          dst.ptr = (unsigned long *)&_regs.eax;
   21.48 @@ -682,7 +686,7 @@ x86_emulate_memop(
   21.49          /* 64-bit mode: POP defaults to 64-bit operands. */
   21.50          if ( (mode == 8) && (dst.bytes == 4) )
   21.51              dst.bytes = 8;
   21.52 -        if ( ops->read_std(_regs.esp, &dst.val, dst.bytes) )
   21.53 +        if ( (rc = ops->read_std(_regs.esp, &dst.val, dst.bytes)) != 0 )
   21.54              goto done;
   21.55          _regs.esp += dst.bytes;
   21.56          break;
   21.57 @@ -759,11 +763,12 @@ x86_emulate_memop(
   21.58              if ( (mode == 8) && (dst.bytes == 4) )
   21.59              {
   21.60                  dst.bytes = 8;
   21.61 -                if ( ops->read_std((unsigned long)dst.ptr, &dst.val, 8) )
   21.62 +                if ( (rc = ops->read_std((unsigned long)dst.ptr,
   21.63 +                                         &dst.val, 8)) != 0 )
   21.64                      goto done;
   21.65              }
   21.66              _regs.esp -= dst.bytes;
   21.67 -            if ( ops->write_std(_regs.esp, dst.val, dst.bytes) )
   21.68 +            if ( (rc = ops->write_std(_regs.esp, dst.val, dst.bytes)) != 0 )
   21.69                  goto done;
   21.70              dst.val = dst.orig_val; /* skanky: disable writeback */
   21.71              break;
   21.72 @@ -790,22 +795,13 @@ x86_emulate_memop(
   21.73              break;
   21.74          case OP_MEM:
   21.75              if ( lock_prefix )
   21.76 -            {
   21.77 -                unsigned long seen;
   21.78 -                if ( ops->cmpxchg_emulated((unsigned long)dst.ptr,
   21.79 -                                           dst.orig_val, dst.val,
   21.80 -                                           &seen, dst.bytes) )
   21.81 -                    goto done;
   21.82 -                if ( seen != dst.orig_val )
   21.83 -                    goto done; /* Try again... */
   21.84 -            }
   21.85 +                rc = ops->cmpxchg_emulated(
   21.86 +                    (unsigned long)dst.ptr, dst.orig_val, dst.val, dst.bytes);
   21.87              else
   21.88 -            {
   21.89 -                if ( ops->write_emulated((unsigned long)dst.ptr,
   21.90 -                                         dst.val, dst.bytes) )
   21.91 -                    goto done;
   21.92 -            }
   21.93 -            break;
   21.94 +                rc = ops->write_emulated(
   21.95 +                    (unsigned long)dst.ptr, dst.val, dst.bytes);
   21.96 +            if ( rc != 0 )
   21.97 +                goto done;
   21.98          default:
   21.99              break;
  21.100          }
  21.101 @@ -815,7 +811,7 @@ x86_emulate_memop(
  21.102      *regs = _regs;
  21.103  
  21.104   done:
  21.105 -    return 0;
  21.106 +    return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
  21.107  
  21.108   special_insn:
  21.109      if ( twobyte )
  21.110 @@ -839,15 +835,15 @@ x86_emulate_memop(
  21.111          {
  21.112              /* Write fault: destination is special memory. */
  21.113              dst.ptr = (unsigned long *)cr2;
  21.114 -            if ( ops->read_std(_regs.esi - _regs.edi + cr2, 
  21.115 -                               &dst.val, dst.bytes) )
  21.116 +            if ( (rc = ops->read_std(_regs.esi - _regs.edi + cr2, 
  21.117 +                                     &dst.val, dst.bytes)) != 0 )
  21.118                  goto done;
  21.119          }
  21.120          else
  21.121          {
  21.122              /* Read fault: source is special memory. */
  21.123              dst.ptr = (unsigned long *)(_regs.edi - _regs.esi + cr2);
  21.124 -            if ( ops->read_emulated(cr2, &dst.val, dst.bytes) )
  21.125 +            if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 )
  21.126                  goto done;
  21.127          }
  21.128          _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes;
  21.129 @@ -867,7 +863,7 @@ x86_emulate_memop(
  21.130          dst.type  = OP_REG;
  21.131          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
  21.132          dst.ptr   = (unsigned long *)&_regs.eax;
  21.133 -        if ( ops->read_emulated(cr2, &dst.val, dst.bytes) )
  21.134 +        if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 )
  21.135              goto done;
  21.136          _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes;
  21.137          break;
  21.138 @@ -971,3 +967,39 @@ x86_emulate_memop(
  21.139      DPRINTF("Cannot emulate %02x\n", b);
  21.140      return -1;
  21.141  }
  21.142 +
  21.143 +#ifndef __TEST_HARNESS__
  21.144 +
  21.145 +#include <asm/mm.h>
  21.146 +#include <asm/uaccess.h>
  21.147 +
  21.148 +int
  21.149 +x86_emulate_read_std(
  21.150 +    unsigned long addr,
  21.151 +    unsigned long *val,
  21.152 +    unsigned int bytes)
  21.153 +{
  21.154 +    *val = 0;
  21.155 +    if ( copy_from_user((void *)val, (void *)addr, bytes) )
  21.156 +    {
  21.157 +        propagate_page_fault(addr, 4); /* user mode, read fault */
  21.158 +        return X86EMUL_PROPAGATE_FAULT;
  21.159 +    }
  21.160 +    return X86EMUL_CONTINUE;
  21.161 +}
  21.162 +
  21.163 +int
  21.164 +x86_emulate_write_std(
  21.165 +    unsigned long addr,
  21.166 +    unsigned long val,
  21.167 +    unsigned int bytes)
  21.168 +{
  21.169 +    if ( copy_to_user((void *)addr, (void *)&val, bytes) )
  21.170 +    {
  21.171 +        propagate_page_fault(addr, 6); /* user mode, write fault */
  21.172 +        return X86EMUL_PROPAGATE_FAULT;
  21.173 +    }
  21.174 +    return X86EMUL_CONTINUE;
  21.175 +}
  21.176 +
  21.177 +#endif
    22.1 --- a/xen/include/asm-x86/mm.h	Wed Mar 16 05:46:44 2005 +0000
    22.2 +++ b/xen/include/asm-x86/mm.h	Tue Mar 15 23:44:44 2005 +0000
    22.3 @@ -289,6 +289,9 @@ typedef struct {
    22.4      l1_pgentry_t *pl1e;
    22.5      /* Index in L2 page table where this L1 p.t. is always hooked. */
    22.6      unsigned int l2_idx; /* NB. Only used for PTWR_PT_ACTIVE. */
    22.7 +    /* Info about last ptwr update batch. */
    22.8 +    struct exec_domain *prev_exec_domain; /* domain making the update */
    22.9 +    unsigned int        prev_nr_updates;  /* size of update batch */
   22.10  } ptwr_ptinfo_t;
   22.11  
   22.12  typedef struct {
    23.1 --- a/xen/include/asm-x86/x86_emulate.h	Wed Mar 16 05:46:44 2005 +0000
    23.2 +++ b/xen/include/asm-x86/x86_emulate.h	Tue Mar 15 23:44:44 2005 +0000
    23.3 @@ -32,9 +32,17 @@
    23.4   *  2. If the access fails (cannot emulate, or a standard access faults) then
    23.5   *     it is up to the memop to propagate the fault to the guest VM via
    23.6   *     some out-of-band mechanism, unknown to the emulator. The memop signals
    23.7 - *     failure by returning a non-zero value to the emulator, which will then
    23.8 - *     immediately bail.
    23.9 + *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
   23.10 + *     then immediately bail.
   23.11   */
   23.12 +/* Access completed successfully: continue emulation as normal. */
   23.13 +#define X86EMUL_CONTINUE        0
   23.14 +/* Access is unhandleable: bail from emulation and return error to caller. */
   23.15 +#define X86EMUL_UNHANDLEABLE    1
   23.16 +/* Terminate emulation but return success to the caller. */
   23.17 +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
   23.18 +#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
   23.19 +#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
   23.20  struct x86_mem_emulator
   23.21  {
   23.22      /*
   23.23 @@ -89,17 +97,26 @@ struct x86_mem_emulator
   23.24       *  @addr:  [IN ] Linear address to access.
   23.25       *  @old:   [IN ] Value expected to be current at @addr.
   23.26       *  @new:   [IN ] Value to write to @addr.
   23.27 -     *  @seen:  [OUT] Value actually seen at @addr, zero-extended to 'u_long'.
   23.28       *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
   23.29       */
   23.30      int (*cmpxchg_emulated)(
   23.31          unsigned long addr,
   23.32 -        unsigned long old, 
   23.33 +        unsigned long old,
   23.34          unsigned long new,
   23.35 -        unsigned long *seen,
   23.36          unsigned int bytes);
   23.37  };
   23.38  
   23.39 +/* Standard reader/writer functions that callers may wish to use. */
   23.40 +extern int
   23.41 +x86_emulate_read_std(
   23.42 +    unsigned long addr,
   23.43 +    unsigned long *val,
   23.44 +    unsigned int bytes);
   23.45 +extern int
   23.46 +x86_emulate_write_std(
   23.47 +    unsigned long addr,
   23.48 +    unsigned long val,
   23.49 +    unsigned int bytes);
   23.50  
   23.51  struct xen_regs;
   23.52  
    24.1 --- a/xen/include/xen/irq_cpustat.h	Wed Mar 16 05:46:44 2005 +0000
    24.2 +++ b/xen/include/xen/irq_cpustat.h	Tue Mar 15 23:44:44 2005 +0000
    24.3 @@ -20,11 +20,7 @@
    24.4  
    24.5  extern irq_cpustat_t irq_stat[];
    24.6  
    24.7 -#ifdef CONFIG_SMP
    24.8  #define __IRQ_STAT(cpu, member)	(irq_stat[cpu].member)
    24.9 -#else
   24.10 -#define __IRQ_STAT(cpu, member)	((void)(cpu), irq_stat[0].member)
   24.11 -#endif	
   24.12  
   24.13    /* arch independent irq_stat fields */
   24.14  #define softirq_pending(cpu)	__IRQ_STAT((cpu), __softirq_pending)
    25.1 --- a/xen/include/xen/perfc_defn.h	Wed Mar 16 05:46:44 2005 +0000
    25.2 +++ b/xen/include/xen/perfc_defn.h	Tue Mar 15 23:44:44 2005 +0000
    25.3 @@ -20,6 +20,7 @@ PERFCOUNTER_CPU( calls_to_update_va, "ca
    25.4  PERFCOUNTER_CPU( page_faults, "page faults" )
    25.5  PERFCOUNTER_CPU( copy_user_faults, "copy_user faults" )
    25.6  PERFCOUNTER_CPU( map_domain_mem_count, "map_domain_mem count" )
    25.7 +PERFCOUNTER_CPU( ptwr_emulations, "writable pt emulations" )
    25.8  
    25.9  PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" )
   25.10  PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" )
    26.1 --- a/xen/include/xen/sched.h	Wed Mar 16 05:46:44 2005 +0000
    26.2 +++ b/xen/include/xen/sched.h	Tue Mar 15 23:44:44 2005 +0000
    26.3 @@ -22,6 +22,7 @@
    26.4  #include <xen/grant_table.h>
    26.5  #include <asm/hardirq.h>
    26.6  #include <asm/domain.h>
    26.7 +#include <asm/bitops.h>
    26.8  
    26.9  extern unsigned long volatile jiffies;
   26.10  extern rwlock_t domlist_lock;
    27.1 --- a/xen/include/xen/softirq.h	Wed Mar 16 05:46:44 2005 +0000
    27.2 +++ b/xen/include/xen/softirq.h	Tue Mar 15 23:44:44 2005 +0000
    27.3 @@ -14,6 +14,7 @@
    27.4  #include <xen/config.h>
    27.5  #include <xen/lib.h>
    27.6  #include <xen/smp.h>
    27.7 +#include <xen/irq_cpustat.h>
    27.8  #include <asm/bitops.h>
    27.9  #include <asm/hardirq.h>
   27.10