ia64/xen-unstable
changeset 1015:dce3446ac01e
bitkeeper revision 1.656 (3fe4de1f1IOfUVzwLIqE8EHIf7xJoA)
Merge nidd.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into nidd.cl.cam.ac.uk:/auto/anfs/scratch/labyrinth/iap10/xeno-clone/xeno.bk
line diff
--- a/.rootkeys Sat Dec 20 23:39:49 2003 +0000
+++ b/.rootkeys Sat Dec 20 23:41:19 2003 +0000
@@ -79,10 +79,8 @@ 3fbd0a42l40lM0IICw2jXbQBVZSdZg tools/xc/
 3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py
 3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py
 3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING
-3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES
 3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile
 3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk
-3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO
 3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile
 3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk
 3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c
@@ -92,6 +90,7 @@ 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/
 3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c
 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S
 3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c
+3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c
 3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c
 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c
 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c
2.1 --- a/tools/xc/lib/xc_linux_build.c Sat Dec 20 23:39:49 2003 +0000 2.2 +++ b/tools/xc/lib/xc_linux_build.c Sat Dec 20 23:41:19 2003 +0000 2.3 @@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle, 2.4 const char *cmdline, 2.5 unsigned long shared_info_frame) 2.6 { 2.7 - l1_pgentry_t *vl1tab = NULL, *vl1e = NULL; 2.8 - l2_pgentry_t *vl2tab = NULL, *vl2e = NULL; 2.9 + l1_pgentry_t *vl1tab; 2.10 + l2_pgentry_t *vl2tab; 2.11 unsigned long *page_array = NULL; 2.12 mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL; 2.13 int alloc_index, num_pt_pages; 2.14 - unsigned long l2tab; 2.15 + unsigned long l2tab, l2e, l1e=0; 2.16 unsigned long l1tab = 0; 2.17 unsigned long num_pgt_updates = 0; 2.18 unsigned long count, pt_start, i, j; 2.19 @@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle, 2.20 if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL ) 2.21 goto error_out; 2.22 memset(vl2tab, 0, PAGE_SIZE); 2.23 - vl2e = vl2tab + l2_table_offset(virt_load_addr); 2.24 + unmap_pfn(pm_handle, vl2tab); 2.25 + l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t)); 2.26 for ( count = 0; count < tot_pages; count++ ) 2.27 { 2.28 - if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) 2.29 + if ( (l1e & (PAGE_SIZE-1)) == 0 ) 2.30 { 2.31 l1tab = page_array[alloc_index] << PAGE_SHIFT; 2.32 if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL ) 2.33 goto error_out; 2.34 memset(vl1tab, 0, PAGE_SIZE); 2.35 + unmap_pfn(pm_handle, vl1tab); 2.36 alloc_index--; 2.37 2.38 - vl1e = vl1tab + l1_table_offset(virt_load_addr + 2.39 - (count << PAGE_SHIFT)); 2.40 + l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))* 2.41 + sizeof(l1_pgentry_t)); 2.42 2.43 /* make apropriate entry in the page directory */ 2.44 - pgt_updates->ptr = (unsigned long)vl2e; 2.45 + pgt_updates->ptr = l2e; 2.46 pgt_updates->val = l1tab | L2_PROT; 2.47 pgt_updates++; 2.48 num_pgt_updates++; 2.49 - vl2e++; 2.50 + l2e += sizeof(l2_pgentry_t); 2.51 } 2.52 2.53 if ( count < pt_start ) 2.54 { 2.55 - pgt_updates->ptr = (unsigned long)vl1e; 2.56 + pgt_updates->ptr = l1e; 2.57 pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT; 2.58 pgt_updates++; 2.59 num_pgt_updates++; 2.60 - vl1e++; 2.61 + l1e += sizeof(l1_pgentry_t); 2.62 } 2.63 else 2.64 { 2.65 - pgt_updates->ptr = (unsigned long)vl1e; 2.66 + pgt_updates->ptr = l1e; 2.67 pgt_updates->val = 2.68 ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW; 2.69 pgt_updates++; 2.70 num_pgt_updates++; 2.71 - vl1e++; 2.72 + l1e += sizeof(l1_pgentry_t); 2.73 } 2.74 2.75 pgt_updates->ptr =
--- a/tools/xc/lib/xc_linux_restore.c Sat Dec 20 23:39:49 2003 +0000
+++ b/tools/xc/lib/xc_linux_restore.c Sat Dec 20 23:41:19 2003 +0000
@@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle,
                 page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
             }
             if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
-                                (unsigned long)&ppage[j], page[j]) )
+                                (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)),
+                                page[j]) )
                 goto out;
         }
         break;
@@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle,
                 page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
             }
             if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
-                                (unsigned long)&ppage[j], page[j]) )
+                                (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)),
+                                page[j]) )
                 goto out;
         }
         break;
@@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle,
             memcpy(ppage, page, PAGE_SIZE);
             break;
         }
-        /* NB. Must flush before unmapping page, as pass VAs to Xen. */
-        if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) )
-            goto out;
         unmap_pfn(pm_handle, ppage);
 
         if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
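The hunks above (and the matching changes to tools/xc/lib/xc_linux_build.c earlier in this changeset) stop passing the mapped virtual address &ppage[j] as the target of an MMU update; the request now carries the machine address of the page-table entry, computed from the frame number and the entry index. A minimal sketch of composing such a request, assuming only the ptr/val pair visible in these diffs (mmu_update_t is the type used in the build code); the helper name and the 4-byte i386 PTE size are illustrative:

#define PAGE_SHIFT 12                   /* page size used throughout the diffs */

typedef struct {                        /* layout assumed from the ptr/val usage above */
    unsigned long ptr;                  /* machine address of the PTE to update */
    unsigned long val;                  /* new contents of that PTE */
} mmu_update_t;

/* Hypothetical helper: request that entry 'idx' of the L1 table held in
 * machine frame 'l1_mfn' be set to map frame 'target_mfn' with 'prot'. */
static mmu_update_t make_l1_update(unsigned long l1_mfn, unsigned long idx,
                                   unsigned long target_mfn, unsigned long prot)
{
    mmu_update_t u;
    u.ptr = (l1_mfn << PAGE_SHIFT) + idx * 4;   /* 4-byte PTEs on i386 */
    u.val = (target_mfn << PAGE_SHIFT) | prot;
    return u;
}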
--- a/tools/xc/lib/xc_linux_save.c Sat Dec 20 23:39:49 2003 +0000
+++ b/tools/xc/lib/xc_linux_save.c Sat Dec 20 23:41:19 2003 +0000
@@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_ha
 {
     dom0_op_t op;
     op.cmd = DOM0_GETPAGEFRAMEINFO;
-    op.u.getpageframeinfo.pfn = mfn;
-    if ( (do_dom0_op(xc_handle, &op) < 0) ||
-         (op.u.getpageframeinfo.domain != dom) )
-        return 0;
-    return 1;
+    op.u.getpageframeinfo.pfn = mfn;
+    op.u.getpageframeinfo.domain = dom;
+    return (do_dom0_op(xc_handle, &op) >= 0);
 }
 
 #define GETPFN_ERR (~0U)
-static unsigned int get_pfn_type(int xc_handle, unsigned long mfn)
+static unsigned int get_pfn_type(int xc_handle,
+                                 unsigned long mfn,
+                                 unsigned int dom)
 {
     dom0_op_t op;
     op.cmd = DOM0_GETPAGEFRAMEINFO;
-    op.u.getpageframeinfo.pfn = mfn;
+    op.u.getpageframeinfo.pfn = mfn;
+    op.u.getpageframeinfo.domain = dom;
     if ( do_dom0_op(xc_handle, &op) < 0 )
     {
         PERROR("Unexpected failure when getting page frame info!");
@@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle,
         mfn_to_pfn_table[mfn] = i;
 
         /* Query page type by MFN, but store it by PFN. */
-        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR )
+        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) ==
+             GETPFN_ERR )
             goto out;
     }
 
--- a/xen/GUEST_CHANGES Sat Dec 20 23:39:49 2003 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-
-The interface between Xen and overlying guest OSes has changed in the
-following ways since version 1.0:
-
-Modified hypercall 'pt_update'
-------------------------------
-Page-table updates passed to the 'pt_update' hypercall must now
-specify a virtual address that maps the PTE to be modified. Previously
-a physical address was used, requiring Xen to temporarily map the PTE
-into its own private region so that it could be read and written.
-This affects only commands of type PGREQ_NORMAL_UPDATE and
-PGREQ_UNCHECKED_UPDATE.
-
-New hypercall 'update_va_mapping'
----------------------------------
-A new high-speed page-table update method has been introduced, which
-may be of particular benefit when fixing up application page faults.
-Invoked as 'update_va_mapping(page_number, new_pte_value, flags)':
- <page_number>: The virtual page number in the current address space
-   whose PTE is to be modified.
- <new_pte_value>: The new value to write into the PTE.
- <flags>: An ORed combination of
-   UVMF_INVLPG: Flush stale TLB entry of the updated page mapping
-   UVMF_FLUSH_TLB: Flush all TLB entries
-You can see this new call in use in Xenolinux (common/memory.c).
-
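The file removed above documented the guest-visible interface; the 'update_va_mapping' hypercall it describes is not itself removed by this changeset. A guest-side sketch of the call as the deleted text specifies it; the wrapper name and the numeric flag values are assumptions, only the argument order and the UVMF_* names come from the text (the real caller lives in Xenolinux, common/memory.c):

#define PAGE_SHIFT     12
#define _PAGE_RW       0x002            /* assumed x86 'writable' PTE bit */
#define UVMF_INVLPG    1                /* names from the text; values assumed */
#define UVMF_FLUSH_TLB 2

/* Assumed guest-side hypercall wrapper for
 * update_va_mapping(page_number, new_pte_value, flags). */
extern int HYPERVISOR_update_va_mapping(unsigned long page_nr,
                                        unsigned long new_pte,
                                        unsigned long flags);

/* Fix up a write fault on 'va' by making its PTE writable and flushing
 * just the one stale TLB entry, as the deleted notes describe. */
static int fixup_write_fault(unsigned long va, unsigned long old_pte)
{
    return HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT,
                                        old_pte | _PAGE_RW,
                                        UVMF_INVLPG);
}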
--- a/xen/Makefile Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/Makefile Sat Dec 20 23:41:19 2003 +0000
@@ -2,7 +2,7 @@
 # This is the correct place to edit the build version.
 # All other places this is stored (eg. compile.h) should be autogenerated.
 export XEN_VERSION = 1
-export XEN_SUBVERSION = 2
+export XEN_SUBVERSION = 3
 export XEN_EXTRAVERSION = "-rc"
 
 export BASEDIR := $(shell pwd)
--- a/xen/TODO Sat Dec 20 23:39:49 2003 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-
-This is stuff we probably want to implement in the near future.
-
- -- Keir (16/3/03)
-
-
-1. DOMAIN-0 MANAGEMENT DAEMON
------------------------------
-A better control daemon is required for domain 0, which keeps proper
-track of machine resources and can make sensible policy choices. This
-may require support in Xen; for example, notifications (eg. DOMn is
-killed), and requests (eg. can DOMn allocate x frames of memory?).
-
-2. ASSIGNING DOMAINS TO PROCESSORS
-----------------------------------
-More intelligent assignment of domains to processors. In
-particular, we don't play well with hyperthreading: we will assign
-domains to virtual processors on the same package, rather then
-spreading them across processor packages.
-
-What we need to do is port code from Linux which stores information on
-relationships between processors in the system (eg. which ones are
-siblings in the same package). We then use this to balance domains
-across packages, and across virtual processors within a package.
-
-3. SANE NETWORK ROUTING
------------------------
-The current virtual firewall/router is completely broken. Needs a new
-design and implementation!
-
-
-
-Graveyard
-*********
-
-The hypervisor page cache
--------------------------
-This will allow guest OSes to make use of spare pages in the system, but
-allow them to be immediately used for any new domains or memory requests.
-The idea is that, when a page is laundered and falls off Linux's clean_LRU
-list, rather than freeing it it becomes a candidate for passing down into
-the hypervisor. In return, xeno-linux may ask for one of its previously-
-cached pages back:
- (page, new_id) = cache_query(page, old_id);
-If the requested page couldn't be kept, a blank page is returned.
-When would Linux make the query? Whenever it wants a page back without
-the delay or going to disc. Also, whenever a page would otherwise be
-flushed to disc.
-
-To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL);
- [NULL means "give me a blank page"].
-To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id)
- [we may request that x_page just be discarded, and therefore not impinge
-  on this domain's cache quota].
--- a/xen/arch/i386/apic.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/apic.c Sat Dec 20 23:41:19 2003 +0000
@@ -47,7 +47,7 @@
 #include <asm/hardirq.h>
 #include <asm/apic.h>
 #include <xeno/mm.h>
-
+#include <asm/io_apic.h>
 #include <asm/timex.h>
 #include <xeno/ac_timer.h>
 #include <xeno/perfc.h>
--- a/xen/arch/i386/entry.S Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/entry.S Sat Dec 20 23:41:19 2003 +0000
@@ -82,7 +82,6 @@
 #include <xeno/config.h>
 #include <xeno/errno.h>
 #include <hypervisor-ifs/hypervisor-if.h>
-#include <asm/smp.h>
 
 EBX = 0x00
 ECX = 0x04
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/i386/flushtlb.c Sat Dec 20 23:41:19 2003 +0000
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * flushtlb.c
+ *
+ * TLB flushes are timestamped using a global virtual 'clock' which ticks
+ * on any TLB flush on any processor.
+ *
+ * Copyright (c) 2003, K A Fraser
+ */
+
+#include <xeno/config.h>
+#include <xeno/sched.h>
+#include <asm/flushtlb.h>
+
+unsigned long tlbflush_mask;
+unsigned long tlbflush_clock;
+unsigned long tlbflush_time[NR_CPUS];
+
+static inline void tlb_clocktick(unsigned int cpu)
+{
+    unsigned long x, nx, y, ny;
+
+    clear_bit(cpu, &tlbflush_mask);
+
+    /* Tick the clock. 'y' contains the current time after the tick. */
+    ny = tlbflush_clock;
+    do {
+#ifdef CONFIG_SMP
+        if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
+        {
+            new_tlbflush_clock_period();
+            y = tlbflush_clock;
+            break;
+        }
+#else
+        y = ny+1;
+#endif
+    }
+    while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) );
+
+    /* Update cpu's timestamp to current time, unless someone else beats us. */
+    nx = tlbflush_time[cpu];
+    do {
+        if ( unlikely((x = nx) >= y) )
+            break;
+    }
+    while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) );
+}
+
+void write_cr3_counted(unsigned long pa)
+{
+    __asm__ __volatile__ (
+        "movl %0, %%cr3"
+        : : "r" (pa) : "memory" );
+    tlb_clocktick(smp_processor_id());
+}
+
+void flush_tlb_counted(void)
+{
+    __asm__ __volatile__ (
+        "movl %%cr3, %%eax; movl %%eax, %%cr3"
+        : : : "memory", "eax" );
+    tlb_clocktick(smp_processor_id());
+}
+
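The clock above is consumed when pages change hands: free_domain_page() (in the xen/common/domain.c hunks later in this changeset) stamps a freed page with the current tlbflush_clock and the freeing CPU's mask, and alloc_domain_page() compares that stamp against tlbflush_time[] via NEED_FLUSH() before handing the frame to a new owner. NEED_FLUSH() itself lives in asm/flushtlb.h, which is not part of this diff; the sketch below is an assumed reading of its semantics, namely that a CPU may still cache a stale mapping only if it has not flushed since the page was stamped.

/* Assumed semantics of NEED_FLUSH(): 'cpu_stamp' is tlbflush_time[cpu]
 * (the clock value recorded at that CPU's last flush) and 'page_stamp'
 * is page->tlbflush_timestamp (the clock value when the page was freed).
 * If the CPU has not flushed since the page was freed, its TLB may still
 * hold a mapping of the old contents, so a flush is required. */
static inline int need_flush_sketch(unsigned long cpu_stamp,
                                    unsigned long page_stamp)
{
    return cpu_stamp <= page_stamp;
}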
--- a/xen/arch/i386/io_apic.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/io_apic.c Sat Dec 20 23:41:19 2003 +0000
@@ -28,6 +28,8 @@
 #include <xeno/config.h>
 #include <asm/mc146818rtc.h>
 #include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/smpboot.h>
13.1 --- a/xen/arch/i386/ioremap.c Sat Dec 20 23:39:49 2003 +0000 13.2 +++ b/xen/arch/i386/ioremap.c Sat Dec 20 23:41:19 2003 +0000 13.3 @@ -15,92 +15,50 @@ 13.4 #include <asm/pgalloc.h> 13.5 #include <asm/page.h> 13.6 13.7 -static unsigned long remap_base = 0; 13.8 +static unsigned long remap_base = IOREMAP_VIRT_START; 13.9 13.10 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) 13.11 13.12 -static void new_l2e(l2_pgentry_t *pl2e) 13.13 -{ 13.14 - l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL); 13.15 - if ( !pl1e ) BUG(); 13.16 - clear_page(pl1e); 13.17 - *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR); 13.18 -} 13.19 - 13.20 - 13.21 -void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) 13.22 +void * __ioremap(unsigned long phys_addr, 13.23 + unsigned long size, 13.24 + unsigned long flags) 13.25 { 13.26 unsigned long vaddr; 13.27 unsigned long offset, cur=0, last_addr; 13.28 l2_pgentry_t *pl2e; 13.29 l1_pgentry_t *pl1e; 13.30 13.31 - /* First time through, start allocating from far end of virtual memory. */ 13.32 - if ( !remap_base ) remap_base = IOREMAP_VIRT_START; 13.33 - 13.34 /* Don't allow wraparound or zero size */ 13.35 last_addr = phys_addr + size - 1; 13.36 - if (!size || last_addr < phys_addr) 13.37 + if ( (size == 0) || (last_addr < phys_addr) ) 13.38 return NULL; 13.39 13.40 - /* 13.41 - * Don't remap the low PCI/ISA area, it's always mapped.. 13.42 - */ 13.43 - if (phys_addr >= 0xA0000 && last_addr < 0x100000) 13.44 + /* Don't remap the low PCI/ISA area: it's always mapped. */ 13.45 + if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) ) 13.46 return phys_to_virt(phys_addr); 13.47 13.48 - if(remap_base + size > IOREMAP_VIRT_END-1) { 13.49 - printk("ioremap: going past end of reserved space!\n"); 13.50 - return NULL; 13.51 + if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) ) 13.52 + { 13.53 + printk("ioremap: going past end of reserved space!\n"); 13.54 + return NULL; 13.55 } 13.56 -#if 0 13.57 - /* 13.58 - * Don't allow anybody to remap normal RAM that we're using.. 13.59 - */ 13.60 - if (phys_addr < virt_to_phys(high_memory)) { 13.61 - char *t_addr, *t_end; 13.62 - struct pfn_info *page; 13.63 13.64 - t_addr = __va(phys_addr); 13.65 - t_end = t_addr + (size - 1); 13.66 - 13.67 - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) 13.68 - if(!PageReserved(page)) 13.69 - return NULL; 13.70 - } 13.71 -#endif 13.72 - 13.73 - /* 13.74 - * Mappings have to be page-aligned 13.75 - */ 13.76 + /* Mappings have to be page-aligned. */ 13.77 offset = phys_addr & ~PAGE_MASK; 13.78 phys_addr &= PAGE_MASK; 13.79 size = PAGE_ALIGN(last_addr) - phys_addr; 13.80 13.81 - /* 13.82 - * Ok, go for it.. 13.83 - */ 13.84 + /* Ok, go for it. 
*/ 13.85 vaddr = remap_base; 13.86 remap_base += size; 13.87 pl2e = &idle_pg_table[l2_table_offset(vaddr)]; 13.88 - if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e); 13.89 pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr); 13.90 - for ( ; ; ) 13.91 - { 13.92 - if ( !l1_pgentry_empty(*pl1e) ) BUG(); 13.93 + do { 13.94 *pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags); 13.95 - cur += PAGE_SIZE; 13.96 - if ( cur == size ) break; 13.97 - if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) ) 13.98 - { 13.99 - if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e); 13.100 - pl1e = l2_pgentry_to_l1(*pl2e++); 13.101 - } 13.102 } 13.103 + while ( (cur += PAGE_SIZE) != size ); 13.104 13.105 - flush_tlb_all(); 13.106 - 13.107 - return (void *) (offset + (char *)vaddr); 13.108 + return (void *)(offset + (char *)vaddr); 13.109 } 13.110 13.111 void iounmap(void *addr)
--- a/xen/arch/i386/irq.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/irq.c Sat Dec 20 23:41:19 2003 +0000
@@ -24,7 +24,8 @@
 #include <xeno/interrupt.h>
 #include <xeno/irq.h>
 #include <xeno/slab.h>
-
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
 #include <asm/msr.h>
 #include <asm/hardirq.h>
 #include <asm/ptrace.h>
15.1 --- a/xen/arch/i386/mm.c Sat Dec 20 23:39:49 2003 +0000 15.2 +++ b/xen/arch/i386/mm.c Sat Dec 20 23:41:19 2003 +0000 15.3 @@ -27,8 +27,8 @@ 15.4 #include <asm/fixmap.h> 15.5 #include <asm/domain_page.h> 15.6 15.7 -static inline void set_pte_phys (unsigned long vaddr, 15.8 - l1_pgentry_t entry) 15.9 +static inline void set_pte_phys(unsigned long vaddr, 15.10 + l1_pgentry_t entry) 15.11 { 15.12 l2_pgentry_t *l2ent; 15.13 l1_pgentry_t *l1ent; 15.14 @@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigne 15.15 __flush_tlb_one(vaddr); 15.16 } 15.17 15.18 -void __set_fixmap (enum fixed_addresses idx, 15.19 - l1_pgentry_t entry) 15.20 + 15.21 +void __set_fixmap(enum fixed_addresses idx, 15.22 + l1_pgentry_t entry) 15.23 { 15.24 unsigned long address = __fix_to_virt(idx); 15.25 15.26 - if (idx >= __end_of_fixed_addresses) { 15.27 + if ( likely(idx < __end_of_fixed_addresses) ) 15.28 + set_pte_phys(address, entry); 15.29 + else 15.30 printk("Invalid __set_fixmap\n"); 15.31 - return; 15.32 - } 15.33 - set_pte_phys(address, entry); 15.34 } 15.35 15.36 -static void __init fixrange_init (unsigned long start, 15.37 - unsigned long end, l2_pgentry_t *pg_base) 15.38 + 15.39 +static void __init fixrange_init(unsigned long start, 15.40 + unsigned long end, 15.41 + l2_pgentry_t *pg_base) 15.42 { 15.43 l2_pgentry_t *l2e; 15.44 int i; 15.45 @@ -66,7 +68,8 @@ static void __init fixrange_init (unsign 15.46 15.47 for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ ) 15.48 { 15.49 - if ( !l2_pgentry_empty(*l2e) ) continue; 15.50 + if ( !l2_pgentry_empty(*l2e) ) 15.51 + continue; 15.52 page = (unsigned long)get_free_page(GFP_KERNEL); 15.53 clear_page(page); 15.54 *l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR); 15.55 @@ -79,11 +82,6 @@ void __init paging_init(void) 15.56 unsigned long addr; 15.57 void *ioremap_pt; 15.58 15.59 - /* XXX initialised in boot.S */ 15.60 - /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/ 15.61 - /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/ 15.62 - /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/ 15.63 - 15.64 /* 15.65 * Fixed mappings, only the page table structure has to be 15.66 * created - mappings will be set by set_fixmap(): 15.67 @@ -115,12 +113,12 @@ void __init paging_init(void) 15.68 15.69 } 15.70 15.71 -void __init zap_low_mappings (void) 15.72 +void __init zap_low_mappings(void) 15.73 { 15.74 int i; 15.75 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 15.76 idle_pg_table[i] = mk_l2_pgentry(0); 15.77 - flush_tlb_all(); 15.78 + flush_tlb_all_pge(); 15.79 } 15.80 15.81 15.82 @@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p, 15.83 unsigned int entries) 15.84 { 15.85 /* NB. There are 512 8-byte entries per GDT page. */ 15.86 - unsigned int i, j, nr_pages = (entries + 511) / 512; 15.87 - unsigned long pfn, *gdt_page; 15.88 - long ret = -EINVAL; 15.89 - struct pfn_info *page; 15.90 + int i, nr_pages = (entries + 511) / 512; 15.91 + unsigned long pfn; 15.92 struct desc_struct *vgdt; 15.93 15.94 - spin_lock(&p->page_lock); 15.95 - 15.96 /* Check the new GDT. */ 15.97 for ( i = 0; i < nr_pages; i++ ) 15.98 { 15.99 - if ( frames[i] >= max_page ) 15.100 - goto out; 15.101 - 15.102 - page = frame_table + frames[i]; 15.103 - if ( (page->flags & PG_domain_mask) != p->domain ) 15.104 - goto out; 15.105 - 15.106 - if ( (page->flags & PG_type_mask) != PGT_gdt_page ) 15.107 - { 15.108 - if ( page_type_count(page) != 0 ) 15.109 - goto out; 15.110 - 15.111 - /* Check all potential GDT entries in the page. 
*/ 15.112 - gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT); 15.113 - for ( j = 0; j < 512; j++ ) 15.114 - if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) ) 15.115 - goto out; 15.116 - unmap_domain_mem(gdt_page); 15.117 - } 15.118 - } 15.119 - 15.120 - /* Tear down the old GDT. */ 15.121 - for ( i = 0; i < 16; i++ ) 15.122 - { 15.123 - pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]); 15.124 - p->mm.perdomain_pt[i] = mk_l1_pgentry(0); 15.125 - if ( pfn == 0 ) continue; 15.126 - page = frame_table + pfn; 15.127 - ASSERT((page->flags & PG_type_mask) == PGT_gdt_page); 15.128 - ASSERT((page->flags & PG_domain_mask) == p->domain); 15.129 - ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0)); 15.130 - put_page_type(page); 15.131 - put_page_tot(page); 15.132 - } 15.133 - 15.134 - /* Install the new GDT. */ 15.135 - for ( i = 0; i < nr_pages; i++ ) 15.136 - { 15.137 - p->mm.perdomain_pt[i] = 15.138 - mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 15.139 - 15.140 - page = frame_table + frames[i]; 15.141 - page->flags &= ~(PG_type_mask | PG_need_flush); 15.142 - page->flags |= PGT_gdt_page; 15.143 - get_page_type(page); 15.144 - get_page_tot(page); 15.145 + if ( unlikely(frames[i] >= max_page) || 15.146 + unlikely(!get_page_and_type(&frame_table[frames[i]], 15.147 + p, PGT_gdt_page)) ) 15.148 + goto fail; 15.149 } 15.150 15.151 /* Copy reserved GDT entries to the new GDT. */ 15.152 - vgdt = map_domain_mem(frames[i] << PAGE_SHIFT); 15.153 + vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); 15.154 memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 15.155 gdt_table + FIRST_RESERVED_GDT_ENTRY, 15.156 NR_RESERVED_GDT_ENTRIES*8); 15.157 unmap_domain_mem(vgdt); 15.158 15.159 + /* Tear down the old GDT. */ 15.160 + for ( i = 0; i < 16; i++ ) 15.161 + { 15.162 + if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 ) 15.163 + put_page_and_type(&frame_table[pfn]); 15.164 + p->mm.perdomain_pt[i] = mk_l1_pgentry(0); 15.165 + } 15.166 + 15.167 + /* Install the new GDT. */ 15.168 + for ( i = 0; i < nr_pages; i++ ) 15.169 + p->mm.perdomain_pt[i] = 15.170 + mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); 15.171 + 15.172 SET_GDT_ADDRESS(p, GDT_VIRT_START); 15.173 SET_GDT_ENTRIES(p, (entries*8)-1); 15.174 15.175 - ret = 0; /* success */ 15.176 + return 0; 15.177 15.178 - out: 15.179 - spin_unlock(&p->page_lock); 15.180 - return ret; 15.181 + fail: 15.182 + while ( i-- > 0 ) 15.183 + put_page_and_type(&frame_table[frames[i]]); 15.184 + return -EINVAL; 15.185 } 15.186 15.187 15.188 long do_set_gdt(unsigned long *frame_list, unsigned int entries) 15.189 { 15.190 - unsigned int nr_pages = (entries + 511) / 512; 15.191 + int nr_pages = (entries + 511) / 512; 15.192 unsigned long frames[16]; 15.193 long ret; 15.194 15.195 @@ -321,14 +287,12 @@ long do_update_descriptor( 15.196 if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) ) 15.197 return -EINVAL; 15.198 15.199 - spin_lock(¤t->page_lock); 15.200 - 15.201 - page = frame_table + pfn; 15.202 - if ( (page->flags & PG_domain_mask) != current->domain ) 15.203 + page = &frame_table[pfn]; 15.204 + if ( unlikely(!get_page(page, current)) ) 15.205 goto out; 15.206 15.207 /* Check if the given frame is in use in an unsafe context. */ 15.208 - switch ( (page->flags & PG_type_mask) ) 15.209 + switch ( page->type_and_flags & PGT_type_mask ) 15.210 { 15.211 case PGT_gdt_page: 15.212 /* Disallow updates of Xen-reserved descriptors in the current GDT. 
*/ 15.213 @@ -336,12 +300,17 @@ long do_update_descriptor( 15.214 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && 15.215 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) 15.216 goto out; 15.217 + if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) 15.218 + goto out; 15.219 + break; 15.220 case PGT_ldt_page: 15.221 - case PGT_writeable_page: 15.222 + if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) 15.223 + goto out; 15.224 break; 15.225 default: 15.226 - if ( page_type_count(page) != 0 ) 15.227 + if ( unlikely(!get_page_type(page, PGT_writeable_page)) ) 15.228 goto out; 15.229 + break; 15.230 } 15.231 15.232 /* All is good so make the update. */ 15.233 @@ -350,9 +319,11 @@ long do_update_descriptor( 15.234 gdt_pent[1] = word2; 15.235 unmap_domain_mem(gdt_pent); 15.236 15.237 + put_page_type(page); 15.238 + 15.239 ret = 0; /* success */ 15.240 15.241 out: 15.242 - spin_unlock(¤t->page_lock); 15.243 + put_page(page); 15.244 return ret; 15.245 }
--- a/xen/arch/i386/pci-irq.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/pci-irq.c Sat Dec 20 23:41:19 2003 +0000
@@ -6,16 +6,15 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
-/*#include <linux/kernel.h>*/
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/sched.h>
-
 #include <asm/io.h>
 #include <asm/smp.h>
+#include <asm/mpspec.h>
 #include <asm/io_apic.h>
 
 #include "pci-i386.h"
--- a/xen/arch/i386/process.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/process.c Sat Dec 20 23:41:19 2003 +0000
@@ -27,6 +27,7 @@
 #include <asm/processor.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/mpspec.h>
 
 #include <xeno/irq.h>
 #include <xeno/event.h>
@@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_
     tss->ss1 = next->ss1;
 
     /* Switch page tables. */
-    __write_cr3_counted(pagetable_val(next_p->mm.pagetable));
+    write_cr3_counted(pagetable_val(next_p->mm.pagetable));
 
     set_current(next_p);
 
18.1 --- a/xen/arch/i386/smp.c Sat Dec 20 23:39:49 2003 +0000 18.2 +++ b/xen/arch/i386/smp.c Sat Dec 20 23:41:19 2003 +0000 18.3 @@ -16,6 +16,7 @@ 18.4 #include <asm/mc146818rtc.h> 18.5 #include <asm/pgalloc.h> 18.6 #include <asm/smpboot.h> 18.7 +#include <asm/hardirq.h> 18.8 18.9 #ifdef CONFIG_SMP 18.10 18.11 @@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_L 18.12 asmlinkage void smp_invalidate_interrupt(void) 18.13 { 18.14 ack_APIC_irq(); 18.15 - if (test_and_clear_bit(smp_processor_id(), &flush_cpumask)) 18.16 - local_flush_tlb(); 18.17 + clear_bit(smp_processor_id(), &flush_cpumask); 18.18 + local_flush_tlb(); 18.19 } 18.20 18.21 -void flush_tlb_others(unsigned long cpumask) 18.22 +void flush_tlb_mask(unsigned long mask) 18.23 { 18.24 - spin_lock(&tlbstate_lock); 18.25 - atomic_set_mask(cpumask, &flush_cpumask); 18.26 - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); 18.27 - while (flush_cpumask) continue; 18.28 + if ( unlikely(in_irq()) ) 18.29 + BUG(); 18.30 + 18.31 + if ( mask & (1 << smp_processor_id()) ) 18.32 + { 18.33 + local_flush_tlb(); 18.34 + mask &= ~(1 << smp_processor_id()); 18.35 + } 18.36 + 18.37 + if ( mask != 0 ) 18.38 + { 18.39 + spin_lock(&tlbstate_lock); 18.40 + flush_cpumask = mask; 18.41 + send_IPI_mask(mask, INVALIDATE_TLB_VECTOR); 18.42 + while ( flush_cpumask != 0 ) 18.43 + { 18.44 + rep_nop(); 18.45 + barrier(); 18.46 + } 18.47 + spin_unlock(&tlbstate_lock); 18.48 + } 18.49 +} 18.50 + 18.51 +void new_tlbflush_clock_period(void) 18.52 +{ 18.53 + if ( unlikely(!spin_trylock(&tlbstate_lock)) ) 18.54 + return; 18.55 + 18.56 + if ( unlikely((flush_cpumask = tlbflush_mask) != 0) ) 18.57 + { 18.58 + send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR); 18.59 + while ( flush_cpumask != 0 ) 18.60 + { 18.61 + rep_nop(); 18.62 + barrier(); 18.63 + } 18.64 + } 18.65 + 18.66 + /* No need for cmpxchg updates here: we are protected by tlbstate lock. */ 18.67 + tlbflush_mask = (1 << smp_num_cpus) - 1; 18.68 + wmb(); /* Reset the mask before allowing the clock to continue ticking. */ 18.69 + tlbflush_clock++; 18.70 + 18.71 spin_unlock(&tlbstate_lock); 18.72 } 18.73 - 18.74 -static inline void do_flush_tlb_all_local(void) 18.75 + 18.76 +static void flush_tlb_all_pge_ipi(void* info) 18.77 { 18.78 - __flush_tlb_all(); 18.79 + __flush_tlb_pge(); 18.80 } 18.81 18.82 -static void flush_tlb_all_ipi(void* info) 18.83 +void flush_tlb_all_pge(void) 18.84 { 18.85 - do_flush_tlb_all_local(); 18.86 -} 18.87 - 18.88 -void flush_tlb_all(void) 18.89 -{ 18.90 - smp_call_function (flush_tlb_all_ipi,0,1,1); 18.91 - 18.92 - do_flush_tlb_all_local(); 18.93 + smp_call_function (flush_tlb_all_pge_ipi,0,1,1); 18.94 + __flush_tlb_pge(); 18.95 } 18.96 18.97 void smp_send_event_check_mask(unsigned long cpu_mask)
--- a/xen/arch/i386/smpboot.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/smpboot.c Sat Dec 20 23:41:19 2003 +0000
@@ -44,6 +44,8 @@
 #include <xeno/smp.h>
 #include <asm/msr.h>
 #include <asm/system.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
 #include <xeno/sched.h>
 #include <xeno/delay.h>
 #include <xeno/lib.h>
--- a/xen/arch/i386/traps.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/arch/i386/traps.c Sat Dec 20 23:41:19 2003 +0000
@@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, c
 
     if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
     {
+        DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup);
         regs->eip = fixup;
         regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
         return;
@@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_
 
     if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
     {
+        DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
         regs->eip = fixup;
         regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
         return;
@@ -411,6 +413,7 @@ asmlinkage void do_general_protection(st
 
     if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
     {
+        DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup);
         regs->eip = fixup;
         regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
         return;
21.1 --- a/xen/common/dom0_ops.c Sat Dec 20 23:39:49 2003 +0000 21.2 +++ b/xen/common/dom0_ops.c Sat Dec 20 23:41:19 2003 +0000 21.3 @@ -38,31 +38,6 @@ static unsigned int get_domnr(void) 21.4 return 0; 21.5 } 21.6 21.7 -static void build_page_list(struct task_struct *p) 21.8 -{ 21.9 - unsigned long *list; 21.10 - unsigned long curr; 21.11 - struct list_head *list_ent; 21.12 - 21.13 - curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table; 21.14 - list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT); 21.15 - 21.16 - list_for_each(list_ent, &p->pg_head) 21.17 - { 21.18 - *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table; 21.19 - 21.20 - if( ((unsigned long)list & ~PAGE_MASK) == 0 ) 21.21 - { 21.22 - struct list_head *ent = frame_table[curr].list.next; 21.23 - curr = list_entry(ent, struct pfn_info, list) - frame_table; 21.24 - unmap_domain_mem(list-1); 21.25 - list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT); 21.26 - } 21.27 - } 21.28 - 21.29 - unmap_domain_mem(list); 21.30 -} 21.31 - 21.32 static int msr_cpu_mask; 21.33 static unsigned long msr_addr; 21.34 static unsigned long msr_lo; 21.35 @@ -163,8 +138,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op) 21.36 goto exit_create; 21.37 } 21.38 21.39 - build_page_list(p); 21.40 - 21.41 ret = p->domain; 21.42 21.43 op.u.createdomain.domain = ret; 21.44 @@ -246,7 +219,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) 21.45 case DOM0_GETMEMLIST: 21.46 { 21.47 int i; 21.48 - struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain); 21.49 + struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain); 21.50 unsigned long max_pfns = op.u.getmemlist.max_pfns; 21.51 unsigned long pfn; 21.52 unsigned long *buffer = op.u.getmemlist.buffer; 21.53 @@ -255,28 +228,27 @@ long do_dom0_op(dom0_op_t *u_dom0_op) 21.54 ret = -EINVAL; 21.55 if ( p != NULL ) 21.56 { 21.57 - list_ent = p->pg_head.next; 21.58 - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; 21.59 - 21.60 - for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ ) 21.61 + ret = 0; 21.62 + 21.63 + spin_lock(&p->page_list_lock); 21.64 + list_ent = p->page_list.next; 21.65 + for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ ) 21.66 { 21.67 + pfn = list_entry(list_ent, struct pfn_info, list) - 21.68 + frame_table; 21.69 if ( put_user(pfn, buffer) ) 21.70 { 21.71 ret = -EFAULT; 21.72 - goto out_getmemlist; 21.73 + break; 21.74 } 21.75 buffer++; 21.76 list_ent = frame_table[pfn].list.next; 21.77 - pfn = list_entry(list_ent, struct pfn_info, list) - 21.78 - frame_table; 21.79 } 21.80 + spin_unlock(&p->page_list_lock); 21.81 21.82 op.u.getmemlist.num_pfns = i; 21.83 copy_to_user(u_dom0_op, &op, sizeof(op)); 21.84 - 21.85 - ret = 0; 21.86 - 21.87 - out_getmemlist: 21.88 + 21.89 put_task_struct(p); 21.90 } 21.91 } 21.92 @@ -369,21 +341,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op) 21.93 { 21.94 struct pfn_info *page; 21.95 unsigned long pfn = op.u.getpageframeinfo.pfn; 21.96 - 21.97 - if ( pfn >= max_page ) 21.98 - { 21.99 - ret = -EINVAL; 21.100 - } 21.101 - else 21.102 + unsigned int dom = op.u.getpageframeinfo.domain; 21.103 + struct task_struct *p; 21.104 + 21.105 + ret = -EINVAL; 21.106 + 21.107 + if ( unlikely(pfn >= max_page) || 21.108 + unlikely((p = find_domain_by_id(dom)) == NULL) ) 21.109 + break; 21.110 + 21.111 + page = &frame_table[pfn]; 21.112 + 21.113 + if ( likely(get_page(page, p)) ) 21.114 { 21.115 - page = frame_table + pfn; 21.116 - 21.117 - op.u.getpageframeinfo.domain = page->flags & PG_domain_mask; 
21.118 - op.u.getpageframeinfo.type = NONE; 21.119 + op.u.getpageframeinfo.type = NONE; 21.120 21.121 - if ( page_type_count(page) != 0 ) 21.122 + if ( (page->type_and_flags & PGT_count_mask) != 0 ) 21.123 { 21.124 - switch ( page->flags & PG_type_mask ) 21.125 + switch ( page->type_and_flags & PGT_type_mask ) 21.126 { 21.127 case PGT_l1_page_table: 21.128 op.u.getpageframeinfo.type = L1TAB; 21.129 @@ -393,9 +368,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op) 21.130 break; 21.131 } 21.132 } 21.133 + 21.134 + put_page(page); 21.135 + } 21.136 21.137 - copy_to_user(u_dom0_op, &op, sizeof(op)); 21.138 - } 21.139 + put_task_struct(p); 21.140 + 21.141 + copy_to_user(u_dom0_op, &op, sizeof(op)); 21.142 } 21.143 break; 21.144
22.1 --- a/xen/common/dom_mem_ops.c Sat Dec 20 23:39:49 2003 +0000 22.2 +++ b/xen/common/dom_mem_ops.c Sat Dec 20 23:41:19 2003 +0000 22.3 @@ -16,58 +16,26 @@ 22.4 #include <xeno/event.h> 22.5 #include <asm/domain_page.h> 22.6 22.7 -#if 0 22.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) 22.9 -#else 22.10 -#define DPRINTK(_f, _a...) ((void)0) 22.11 -#endif 22.12 - 22.13 static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op) 22.14 { 22.15 - struct list_head *temp; 22.16 - struct pfn_info *pf; /* pfn_info of current page */ 22.17 + struct pfn_info *page; 22.18 unsigned long mpfn; /* machine frame number of current page */ 22.19 void *va; /* Xen-usable mapping of current page */ 22.20 unsigned long i; 22.21 - unsigned long flags; 22.22 22.23 - /* 22.24 - * POLICY DECISION: Each domain has a page limit. 22.25 - * NB. The first part of test is because op.size could be so big that 22.26 - * tot_pages + op.size overflows a u_long. 22.27 - */ 22.28 - if( (op.size > p->max_pages) || 22.29 - ((p->tot_pages + op.size) > p->max_pages) ) 22.30 - return -ENOMEM; 22.31 - 22.32 - spin_lock_irqsave(&free_list_lock, flags); 22.33 - 22.34 - if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >> 22.35 - (PAGE_SHIFT-10))) ) 22.36 - { 22.37 - spin_unlock_irqrestore(&free_list_lock, flags); 22.38 - return -ENOMEM; 22.39 - } 22.40 - 22.41 - spin_lock(&p->page_lock); 22.42 - 22.43 - temp = free_list.next; 22.44 for ( i = 0; i < op.size; i++ ) 22.45 { 22.46 - /* Get a free page and add it to the domain's page list. */ 22.47 - pf = list_entry(temp, struct pfn_info, list); 22.48 - pf->flags |= p->domain; 22.49 - set_page_type_count(pf, 0); 22.50 - set_page_tot_count(pf, 0); 22.51 - temp = temp->next; 22.52 - list_del(&pf->list); 22.53 - list_add_tail(&pf->list, &p->pg_head); 22.54 - free_pfns--; 22.55 + /* Leave some slack pages; e.g., for the network. */ 22.56 + if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 22.57 + (PAGE_SHIFT-10))) ) 22.58 + break; 22.59 22.60 - p->tot_pages++; 22.61 - 22.62 + /* NB. 'alloc_domain_page' does limit checking on pages per domain. */ 22.63 + if ( unlikely((page = alloc_domain_page(p)) == NULL) ) 22.64 + break; 22.65 + 22.66 /* Inform the domain of the new page's machine address. 
*/ 22.67 - mpfn = (unsigned long)(pf - frame_table); 22.68 + mpfn = (unsigned long)(page - frame_table); 22.69 copy_to_user(op.pages, &mpfn, sizeof(mpfn)); 22.70 op.pages++; 22.71 22.72 @@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_st 22.73 unmap_domain_mem(va); 22.74 } 22.75 22.76 - spin_unlock(&p->page_lock); 22.77 - spin_unlock_irqrestore(&free_list_lock, flags); 22.78 - 22.79 - return op.size; 22.80 + return i; 22.81 } 22.82 22.83 static long free_dom_mem(struct task_struct *p, reservation_decrease_t op) 22.84 { 22.85 - struct list_head *temp; 22.86 - struct pfn_info *pf; /* pfn_info of current page */ 22.87 + struct pfn_info *page; 22.88 unsigned long mpfn; /* machine frame number of current page */ 22.89 unsigned long i; 22.90 - unsigned long flags; 22.91 long rc = 0; 22.92 int need_flush = 0; 22.93 22.94 - spin_lock_irqsave(&free_list_lock, flags); 22.95 - spin_lock(&p->page_lock); 22.96 - 22.97 - temp = free_list.next; 22.98 for ( i = 0; i < op.size; i++ ) 22.99 { 22.100 copy_from_user(&mpfn, op.pages, sizeof(mpfn)); 22.101 @@ -109,37 +68,28 @@ static long free_dom_mem(struct task_str 22.102 goto out; 22.103 } 22.104 22.105 - pf = &frame_table[mpfn]; 22.106 - if ( (page_type_count(pf) != 0) || 22.107 - (page_tot_count(pf) != 0) || 22.108 - ((pf->flags & PG_domain_mask) != p->domain) ) 22.109 + page = &frame_table[mpfn]; 22.110 + if ( unlikely(!get_page(page, p)) ) 22.111 { 22.112 - DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n", 22.113 - p->domain, page_type_count(pf), 22.114 - page_tot_count(pf), pf->flags); 22.115 + DPRINTK("Bad page free for domain %d\n", p->domain); 22.116 rc = -EINVAL; 22.117 goto out; 22.118 } 22.119 22.120 - need_flush |= pf->flags & PG_need_flush; 22.121 - 22.122 - pf->flags = 0; 22.123 + if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) ) 22.124 + put_page_and_type(page); 22.125 22.126 - list_del(&pf->list); 22.127 - list_add(&pf->list, &free_list); 22.128 - free_pfns++; 22.129 + if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) ) 22.130 + put_page(page); 22.131 22.132 - p->tot_pages--; 22.133 + put_page(page); 22.134 } 22.135 22.136 out: 22.137 - spin_unlock(&p->page_lock); 22.138 - spin_unlock_irqrestore(&free_list_lock, flags); 22.139 - 22.140 if ( need_flush ) 22.141 { 22.142 __flush_tlb(); 22.143 - perfc_incrc(need_flush_tlb_flush); 22.144 + perfc_incr(need_flush_tlb_flush); 22.145 } 22.146 22.147 return rc ? rc : op.size;
23.1 --- a/xen/common/domain.c Sat Dec 20 23:39:49 2003 +0000 23.2 +++ b/xen/common/domain.c Sat Dec 20 23:41:19 2003 +0000 23.3 @@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsi 23.4 sprintf(p->name, "Domain-%d", dom_id); 23.5 23.6 spin_lock_init(&p->blk_ring_lock); 23.7 - spin_lock_init(&p->page_lock); 23.8 spin_lock_init(&p->event_channel_lock); 23.9 23.10 p->shared_info = (void *)get_free_page(GFP_KERNEL); 23.11 memset(p->shared_info, 0, PAGE_SIZE); 23.12 - SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id); 23.13 + SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p); 23.14 23.15 p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL); 23.16 memset(p->mm.perdomain_pt, 0, PAGE_SIZE); 23.17 @@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsi 23.18 23.19 sched_add_domain(p); 23.20 23.21 - INIT_LIST_HEAD(&p->pg_head); 23.22 + spin_lock_init(&p->page_list_lock); 23.23 + INIT_LIST_HEAD(&p->page_list); 23.24 p->max_pages = p->tot_pages = 0; 23.25 + 23.26 write_lock_irqsave(&tasklist_lock, flags); 23.27 SET_LINKS(p); 23.28 p->next_hash = task_hash[TASK_HASH(dom_id)]; 23.29 @@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom) 23.30 return 0; 23.31 } 23.32 23.33 -unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) 23.34 +struct pfn_info *alloc_domain_page(struct task_struct *p) 23.35 { 23.36 - struct list_head *temp; 23.37 - struct pfn_info *pf; 23.38 - unsigned int alloc_pfns; 23.39 - unsigned int req_pages; 23.40 - unsigned long flags; 23.41 - 23.42 - /* how many pages do we need to alloc? */ 23.43 - req_pages = kbytes >> (PAGE_SHIFT - 10); 23.44 + struct pfn_info *page = NULL; 23.45 + unsigned long flags, mask, pfn_stamp, cpu_stamp; 23.46 + int i; 23.47 23.48 spin_lock_irqsave(&free_list_lock, flags); 23.49 - 23.50 - /* is there enough mem to serve the request? */ 23.51 - if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) > 23.52 - free_pfns ) 23.53 + if ( likely(!list_empty(&free_list)) ) 23.54 + { 23.55 + page = list_entry(free_list.next, struct pfn_info, list); 23.56 + list_del(&page->list); 23.57 + free_pfns--; 23.58 + } 23.59 + spin_unlock_irqrestore(&free_list_lock, flags); 23.60 + 23.61 + if ( unlikely(page == NULL) ) 23.62 + return NULL; 23.63 + 23.64 + if ( unlikely((mask = page->u.cpu_mask) != 0) ) 23.65 { 23.66 - spin_unlock_irqrestore(&free_list_lock, flags); 23.67 - return -1; 23.68 + pfn_stamp = page->tlbflush_timestamp; 23.69 + for ( i = 0; mask != 0; i++ ) 23.70 + { 23.71 + if ( unlikely(mask & (1<<i)) ) 23.72 + { 23.73 + cpu_stamp = tlbflush_time[i]; 23.74 + if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) ) 23.75 + mask &= ~(1<<i); 23.76 + } 23.77 + } 23.78 + 23.79 + if ( unlikely(mask != 0) ) 23.80 + { 23.81 + if ( unlikely(in_irq()) ) 23.82 + { 23.83 + DPRINTK("Returning NULL from alloc_domain_page: in_irq\n"); 23.84 + goto free_and_exit; 23.85 + } 23.86 + perfc_incrc(need_flush_tlb_flush); 23.87 + flush_tlb_mask(mask); 23.88 + } 23.89 + } 23.90 + 23.91 + page->u.domain = p; 23.92 + page->type_and_flags = 0; 23.93 + if ( p != NULL ) 23.94 + { 23.95 + if ( unlikely(in_irq()) ) 23.96 + BUG(); 23.97 + wmb(); /* Domain pointer must be visible before updating refcnt. 
*/ 23.98 + spin_lock(&p->page_list_lock); 23.99 + if ( unlikely(p->tot_pages >= p->max_pages) ) 23.100 + { 23.101 + spin_unlock(&p->page_list_lock); 23.102 + goto free_and_exit; 23.103 + } 23.104 + list_add_tail(&page->list, &p->page_list); 23.105 + p->tot_pages++; 23.106 + page->count_and_flags = PGC_allocated | 1; 23.107 + spin_unlock(&p->page_list_lock); 23.108 } 23.109 23.110 - /* allocate pages and build a thread through frame_table */ 23.111 - temp = free_list.next; 23.112 - for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ ) 23.113 + return page; 23.114 + 23.115 + free_and_exit: 23.116 + spin_lock_irqsave(&free_list_lock, flags); 23.117 + list_add(&page->list, &free_list); 23.118 + free_pfns++; 23.119 + spin_unlock_irqrestore(&free_list_lock, flags); 23.120 + return NULL; 23.121 +} 23.122 + 23.123 +void free_domain_page(struct pfn_info *page) 23.124 +{ 23.125 + unsigned long flags; 23.126 + struct task_struct *p = page->u.domain; 23.127 + 23.128 + if ( unlikely(in_irq()) ) 23.129 + BUG(); 23.130 + 23.131 + if ( likely(!IS_XEN_HEAP_FRAME(page)) ) 23.132 + { 23.133 + /* 23.134 + * No race with setting of zombie bit. If it wasn't set before the 23.135 + * last reference was dropped, then it can't be set now. 23.136 + */ 23.137 + page->u.cpu_mask = 0; 23.138 + if ( !(page->count_and_flags & PGC_zombie) ) 23.139 + { 23.140 + page->tlbflush_timestamp = tlbflush_clock; 23.141 + page->u.cpu_mask = 1 << p->processor; 23.142 + 23.143 + spin_lock(&p->page_list_lock); 23.144 + list_del(&page->list); 23.145 + p->tot_pages--; 23.146 + spin_unlock(&p->page_list_lock); 23.147 + } 23.148 + 23.149 + page->count_and_flags = 0; 23.150 + 23.151 + spin_lock_irqsave(&free_list_lock, flags); 23.152 + list_add(&page->list, &free_list); 23.153 + free_pfns++; 23.154 + spin_unlock_irqrestore(&free_list_lock, flags); 23.155 + } 23.156 + else 23.157 { 23.158 - pf = list_entry(temp, struct pfn_info, list); 23.159 - pf->flags = p->domain; 23.160 - set_page_type_count(pf, 0); 23.161 - set_page_tot_count(pf, 0); 23.162 - temp = temp->next; 23.163 - list_del(&pf->list); 23.164 - list_add_tail(&pf->list, &p->pg_head); 23.165 - free_pfns--; 23.166 - ASSERT(free_pfns != 0); 23.167 + /* 23.168 + * No need for a TLB flush. Non-domain pages are always co-held by Xen, 23.169 + * and the Xen reference is not dropped until the domain is dead. 23.170 + * DOM0 may hold references, but it's trusted so no need to flush. 23.171 + */ 23.172 + page->u.cpu_mask = 0; 23.173 + page->count_and_flags = 0; 23.174 + free_page((unsigned long)page_to_virt(page)); 23.175 } 23.176 - 23.177 - spin_unlock_irqrestore(&free_list_lock, flags); 23.178 - 23.179 - p->tot_pages = req_pages; 23.180 +} 23.181 + 23.182 + 23.183 +void free_all_dom_mem(struct task_struct *p) 23.184 +{ 23.185 + struct list_head *ent, zombies; 23.186 + struct pfn_info *page; 23.187 + 23.188 + INIT_LIST_HEAD(&zombies); 23.189 + 23.190 + spin_lock(&p->page_list_lock); 23.191 + while ( (ent = p->page_list.next) != &p->page_list ) 23.192 + { 23.193 + page = list_entry(ent, struct pfn_info, list); 23.194 + 23.195 + if ( unlikely(!get_page(page, p)) ) 23.196 + { 23.197 + /* 23.198 + * Another CPU has dropped the last reference and is responsible 23.199 + * for removing the page from this list. Wait for them to do so. 
23.200 + */ 23.201 + spin_unlock(&p->page_list_lock); 23.202 + while ( p->page_list.next == ent ) 23.203 + barrier(); 23.204 + spin_lock(&p->page_list_lock); 23.205 + continue; 23.206 + } 23.207 + 23.208 + set_bit(_PGC_zombie, &page->count_and_flags); 23.209 + 23.210 + list_del(&page->list); 23.211 + p->tot_pages--; 23.212 + 23.213 + list_add(&page->list, &zombies); 23.214 + } 23.215 + spin_unlock(&p->page_list_lock); 23.216 + 23.217 + /* We do the potentially complex 'put' operations with no lock held. */ 23.218 + while ( (ent = zombies.next) != &zombies ) 23.219 + { 23.220 + page = list_entry(ent, struct pfn_info, list); 23.221 + 23.222 + list_del(&page->list); 23.223 + 23.224 + if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) ) 23.225 + put_page_and_type(page); 23.226 + 23.227 + if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) ) 23.228 + put_page(page); 23.229 + 23.230 + put_page(page); 23.231 + } 23.232 +} 23.233 + 23.234 + 23.235 +unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) 23.236 +{ 23.237 + unsigned int alloc_pfns, nr_pages; 23.238 + 23.239 + nr_pages = kbytes >> (PAGE_SHIFT - 10); 23.240 23.241 /* TEMPORARY: max_pages should be explicitly specified. */ 23.242 - p->max_pages = p->tot_pages; 23.243 + p->max_pages = nr_pages; 23.244 + 23.245 + for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ ) 23.246 + { 23.247 + if ( unlikely(alloc_domain_page(p) == NULL) || 23.248 + unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 23.249 + (PAGE_SHIFT-10))) ) 23.250 + { 23.251 + free_all_dom_mem(p); 23.252 + return -1; 23.253 + } 23.254 + } 23.255 + 23.256 + p->tot_pages = nr_pages; 23.257 23.258 return 0; 23.259 } 23.260 23.261 23.262 -void free_all_dom_mem(struct task_struct *p) 23.263 -{ 23.264 - struct list_head *ent; 23.265 - unsigned long flags; 23.266 - 23.267 - spin_lock_irqsave(&free_list_lock, flags); 23.268 - while ( (ent = p->pg_head.next) != &p->pg_head ) 23.269 - { 23.270 - struct pfn_info *pf = list_entry(ent, struct pfn_info, list); 23.271 - set_page_type_count(pf, 0); 23.272 - set_page_tot_count(pf, 0); 23.273 - pf->flags = 0; 23.274 - ASSERT(ent->next->prev == ent); 23.275 - ASSERT(ent->prev->next == ent); 23.276 - list_del(ent); 23.277 - list_add(ent, &free_list); 23.278 - free_pfns++; 23.279 - } 23.280 - spin_unlock_irqrestore(&free_list_lock, flags); 23.281 - 23.282 - p->tot_pages = 0; 23.283 -} 23.284 - 23.285 - 23.286 /* Release resources belonging to task @p. */ 23.287 void release_task(struct task_struct *p) 23.288 { 23.289 @@ -309,7 +436,6 @@ void release_task(struct task_struct *p) 23.290 destroy_event_channels(p); 23.291 free_page((unsigned long)p->mm.perdomain_pt); 23.292 UNSHARE_PFN(virt_to_page(p->shared_info)); 23.293 - free_page((unsigned long)p->shared_info); 23.294 free_all_dom_mem(p); 23.295 23.296 kmem_cache_free(task_struct_cachep, p); 23.297 @@ -360,11 +486,10 @@ int final_setup_guestos(struct task_stru 23.298 p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs; 23.299 p->failsafe_address = builddomain->ctxt.failsafe_callback_eip; 23.300 23.301 - /* NB. Page base must already be pinned! */ 23.302 phys_l2tab = builddomain->ctxt.pt_base; 23.303 p->mm.pagetable = mk_pagetable(phys_l2tab); 23.304 - get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]); 23.305 - get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]); 23.306 + get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, 23.307 + PGT_l2_page_table); 23.308 23.309 /* Set up the shared info structure. 
*/ 23.310 update_dom_time(p->shared_info); 23.311 @@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p, 23.312 return -ENOMEM; 23.313 } 23.314 23.315 - alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) - 23.316 + alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) - 23.317 frame_table; 23.318 alloc_address <<= PAGE_SHIFT; 23.319 alloc_index = p->tot_pages; 23.320 @@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p, 23.321 p->mm.pagetable = mk_pagetable(phys_l2tab); 23.322 23.323 l2tab += l2_table_offset(virt_load_address); 23.324 - cur_address = list_entry(p->pg_head.next, struct pfn_info, list) - 23.325 + cur_address = list_entry(p->page_list.next, struct pfn_info, list) - 23.326 frame_table; 23.327 cur_address <<= PAGE_SHIFT; 23.328 for ( count = 0; count < p->tot_pages; count++ ) 23.329 @@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p, 23.330 } 23.331 *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT); 23.332 23.333 - page = frame_table + (cur_address >> PAGE_SHIFT); 23.334 - page->flags = dom | PGT_writeable_page | PG_need_flush; 23.335 - set_page_type_count(page, 1); 23.336 - set_page_tot_count(page, 1); 23.337 + page = &frame_table[cur_address >> PAGE_SHIFT]; 23.338 + set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags); 23.339 + if ( !get_page_and_type(page, p, PGT_writeable_page) ) 23.340 + BUG(); 23.341 /* Set up the MPT entry. */ 23.342 machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count; 23.343 23.344 @@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p, 23.345 { 23.346 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); 23.347 page = frame_table + l1_pgentry_to_pagenr(*l1tab); 23.348 - page->flags = dom | PGT_l1_page_table; 23.349 - get_page_tot(page); 23.350 + page->type_and_flags &= ~PGT_type_mask; 23.351 + page->type_and_flags |= PGT_l1_page_table; 23.352 + get_page(page, p); /* an extra ref because of readable mapping */ 23.353 l1tab++; 23.354 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) ) 23.355 { 23.356 @@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p, 23.357 l2tab++; 23.358 } 23.359 } 23.360 - get_page_type(page); /* guest_pinned */ 23.361 - get_page_tot(page); /* guest_pinned */ 23.362 - page->flags = dom | PG_guest_pinned | PGT_l2_page_table; 23.363 + /* Rewrite last L1 page to be a L2 page. */ 23.364 + page->type_and_flags &= ~PGT_type_mask; 23.365 + page->type_and_flags |= PGT_l2_page_table; 23.366 + /* Get another ref to L2 page so that it can be pinned. */ 23.367 + if ( !get_page_and_type(page, p, PGT_l2_page_table) ) 23.368 + BUG(); 23.369 + set_bit(_PGC_guest_pinned, &page->count_and_flags); 23.370 unmap_domain_mem(l1start); 23.371 23.372 /* Set up shared info area. */ 23.373 @@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p, 23.374 23.375 /* Install the new page tables. */ 23.376 __cli(); 23.377 - __write_cr3_counted(pagetable_val(p->mm.pagetable)); 23.378 + write_cr3_counted(pagetable_val(p->mm.pagetable)); 23.379 23.380 /* Copy the guest OS image. */ 23.381 src = (char *)(phy_data_start + 12); 23.382 @@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p, 23.383 23.384 23.385 /* Reinstate the caller's page tables. */ 23.386 - __write_cr3_counted(pagetable_val(current->mm.pagetable)); 23.387 + write_cr3_counted(pagetable_val(current->mm.pagetable)); 23.388 __sti(); 23.389 23.390 p->flags |= PF_CONSTRUCTED;
--- a/xen/common/kernel.c Sat Dec 20 23:39:49 2003 +0000
+++ b/xen/common/kernel.c Sat Dec 20 23:41:19 2003 +0000
@@ -181,6 +181,13 @@ void cmain (unsigned long magic, multibo
         for ( ; ; ) ;
     }
 
+    /* The array of pfn_info structures must fit into the reserved area. */
+    if ( sizeof(struct pfn_info) > 24 )
+    {
+        printk("'struct pfn_info' too large to fit in Xen address space!\n");
+        for ( ; ; ) ;
+    }
+
     set_current(&idle0_task);
 
     max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10);
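The 24-byte ceiling on struct pfn_info bounds the frame table, which init_frametable() (xen/common/memory.c, below) sizes as nr_pages * sizeof(struct pfn_info) and places at FRAMETABLE_VIRT_START. As a rough worked example (the actual size of that reserved virtual window is not stated in this changeset, so the figures are illustrative only): 4GB of physical memory is 2^20 frames of 4kB, so 24-byte entries need 24MB of virtual address space, whereas a 32-byte pfn_info would need 32MB.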
25.1 --- a/xen/common/memory.c Sat Dec 20 23:39:49 2003 +0000 25.2 +++ b/xen/common/memory.c Sat Dec 20 23:41:19 2003 +0000 25.3 @@ -139,34 +139,28 @@ 25.4 #include <asm/uaccess.h> 25.5 #include <asm/domain_page.h> 25.6 25.7 -#if 0 25.8 -#define MEM_LOG(_f, _a...) 25.9 +#ifndef NDEBUG 25.10 +#define MEM_LOG(_f, _a...) \ 25.11 printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \ 25.12 current->domain, __LINE__, ## _a ) 25.13 #else 25.14 #define MEM_LOG(_f, _a...) ((void)0) 25.15 #endif 25.16 25.17 -/* Domain 0 is allowed to submit requests on behalf of others. */ 25.18 -#define DOMAIN_OKAY(_f) \ 25.19 - ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0)) 25.20 +static int alloc_l2_table(struct pfn_info *page); 25.21 +static int alloc_l1_table(struct pfn_info *page); 25.22 +static int get_page_from_pagenr(unsigned long page_nr); 25.23 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 25.24 + unsigned int type); 25.25 25.26 -/* 'get' checks parameter for validity before inc'ing refcnt. */ 25.27 -static int get_l2_table(unsigned long page_nr); 25.28 -static int get_l1_table(unsigned long page_nr); 25.29 -static int get_page(unsigned long page_nr, int writeable); 25.30 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type); 25.31 -/* 'put' does no checking because if refcnt not zero, entity must be valid. */ 25.32 -static void put_l2_table(unsigned long page_nr); 25.33 -static void put_l1_table(unsigned long page_nr); 25.34 -static void put_page(unsigned long page_nr, int writeable); 25.35 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type); 25.36 +static void free_l2_table(struct pfn_info *page); 25.37 +static void free_l1_table(struct pfn_info *page); 25.38 25.39 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t); 25.40 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); 25.41 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); 25.42 25.43 /* frame table size and its size in pages */ 25.44 -frame_table_t * frame_table; 25.45 +struct pfn_info *frame_table; 25.46 unsigned long frame_table_size; 25.47 unsigned long max_page; 25.48 25.49 @@ -176,8 +170,11 @@ unsigned int free_pfns; 25.50 25.51 /* Used to defer flushing of memory structures. */ 25.52 static struct { 25.53 - int flush_tlb; 25.54 - int refresh_ldt; 25.55 +#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ 25.56 +#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ 25.57 +#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0. 
*/ 25.58 + unsigned long flags; 25.59 + unsigned long cr0; 25.60 } deferred_op[NR_CPUS] __cacheline_aligned; 25.61 25.62 /* 25.63 @@ -196,7 +193,7 @@ void __init init_frametable(unsigned lon 25.64 max_page = nr_pages; 25.65 frame_table_size = nr_pages * sizeof(struct pfn_info); 25.66 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; 25.67 - frame_table = (frame_table_t *)FRAMETABLE_VIRT_START; 25.68 + frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; 25.69 memset(frame_table, 0, frame_table_size); 25.70 25.71 free_pfns = 0; 25.72 @@ -218,7 +215,7 @@ void __init init_frametable(unsigned lon 25.73 25.74 static void __invalidate_shadow_ldt(struct task_struct *p) 25.75 { 25.76 - int i, cpu = p->processor; 25.77 + int i; 25.78 unsigned long pfn; 25.79 struct pfn_info *page; 25.80 25.81 @@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(stru 25.82 if ( pfn == 0 ) continue; 25.83 p->mm.perdomain_pt[i] = mk_l1_pgentry(0); 25.84 page = frame_table + pfn; 25.85 - ASSERT((page->flags & PG_type_mask) == PGT_ldt_page); 25.86 - ASSERT((page->flags & PG_domain_mask) == p->domain); 25.87 - ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0)); 25.88 - put_page_type(page); 25.89 - put_page_tot(page); 25.90 + ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); 25.91 + ASSERT_PAGE_IS_DOMAIN(page, p); 25.92 + put_page_and_type(page); 25.93 } 25.94 25.95 /* Dispose of the (now possibly invalid) mappings from the TLB. */ 25.96 - deferred_op[cpu].flush_tlb = 1; 25.97 - deferred_op[cpu].refresh_ldt = 1; 25.98 + deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; 25.99 } 25.100 25.101 25.102 @@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt 25.103 } 25.104 25.105 25.106 +int alloc_segdesc_page(struct pfn_info *page) 25.107 +{ 25.108 + unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); 25.109 + int i; 25.110 + 25.111 + for ( i = 0; i < 512; i++ ) 25.112 + if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) ) 25.113 + goto fail; 25.114 + 25.115 + unmap_domain_mem(descs); 25.116 + return 1; 25.117 + 25.118 + fail: 25.119 + unmap_domain_mem(descs); 25.120 + return 0; 25.121 +} 25.122 + 25.123 + 25.124 /* Map shadow page at offset @off. Returns 0 on success. */ 25.125 int map_ldt_shadow_page(unsigned int off) 25.126 { 25.127 struct task_struct *p = current; 25.128 - unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT); 25.129 - unsigned long l1e, *ldt_page; 25.130 - struct pfn_info *page; 25.131 - int i, ret = -1; 25.132 + unsigned long l1e; 25.133 25.134 - /* We cannot take a page_lock in interrupt context. */ 25.135 - if ( in_interrupt() ) 25.136 + if ( unlikely(in_interrupt()) ) 25.137 BUG(); 25.138 25.139 - spin_lock(&p->page_lock); 25.140 - 25.141 - __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT))); 25.142 - if ( unlikely(!(l1e & _PAGE_PRESENT)) ) 25.143 - goto out; 25.144 - 25.145 - page = frame_table + (l1e >> PAGE_SHIFT); 25.146 - if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) ) 25.147 - { 25.148 - if ( unlikely(page_type_count(page) != 0) ) 25.149 - goto out; 25.150 + __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >> 25.151 + PAGE_SHIFT) + off]); 25.152 25.153 - /* Check all potential LDT entries in the page. 
*/ 25.154 - ldt_page = (unsigned long *)addr; 25.155 - for ( i = 0; i < 512; i++ ) 25.156 - if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) ) 25.157 - goto out; 25.158 - if ( unlikely(page->flags & PG_need_flush) ) 25.159 - { 25.160 - perfc_incrc(need_flush_tlb_flush); 25.161 - __write_cr3_counted(pagetable_val(p->mm.pagetable)); 25.162 - page->flags &= ~PG_need_flush; 25.163 - } 25.164 + if ( unlikely(!(l1e & _PAGE_PRESENT)) || 25.165 + unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 25.166 + p, PGT_ldt_page)) ) 25.167 + return 0; 25.168 25.169 - page->flags &= ~PG_type_mask; 25.170 - page->flags |= PGT_ldt_page; 25.171 - } 25.172 - 25.173 - /* Success! */ 25.174 - get_page_type(page); 25.175 - get_page_tot(page); 25.176 - p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW); 25.177 + p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); 25.178 p->mm.shadow_ldt_mapcnt++; 25.179 25.180 - ret = 0; 25.181 - 25.182 - out: 25.183 - spin_unlock(&p->page_lock); 25.184 - return ret; 25.185 + return 1; 25.186 } 25.187 25.188 25.189 -/* Return original refcnt, or -1 on error. */ 25.190 -static int inc_page_refcnt(unsigned long page_nr, unsigned int type) 25.191 +/* Domain 0 is allowed to build page tables on others' behalf. */ 25.192 +static inline int dom0_get_page(struct pfn_info *page) 25.193 { 25.194 - struct pfn_info *page; 25.195 - unsigned long flags; 25.196 + unsigned long x, nx, y = page->count_and_flags; 25.197 + 25.198 + do { 25.199 + x = y; 25.200 + nx = x + 1; 25.201 + if ( unlikely((x & PGC_count_mask) == 0) || 25.202 + unlikely((nx & PGC_count_mask) == 0) ) 25.203 + return 0; 25.204 + } 25.205 + while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); 25.206 + 25.207 + return 1; 25.208 +} 25.209 + 25.210 + 25.211 +static int get_page_from_pagenr(unsigned long page_nr) 25.212 +{ 25.213 + struct pfn_info *page = &frame_table[page_nr]; 25.214 25.215 if ( unlikely(page_nr >= max_page) ) 25.216 { 25.217 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); 25.218 - return -1; 25.219 + return 0; 25.220 } 25.221 - page = frame_table + page_nr; 25.222 - flags = page->flags; 25.223 - if ( unlikely(!DOMAIN_OKAY(flags)) ) 25.224 - { 25.225 - MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask); 25.226 - return -1; 25.227 - } 25.228 - if ( (flags & PG_type_mask) != type ) 25.229 + 25.230 + if ( unlikely(!get_page(page, current)) && 25.231 + ((current->domain != 0) || !dom0_get_page(page)) ) 25.232 { 25.233 - if ( page_type_count(page) != 0 ) 25.234 - { 25.235 - MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld", 25.236 - page_nr << PAGE_SHIFT, 25.237 - flags & PG_type_mask, type, page_type_count(page)); 25.238 - return -1; 25.239 - } 25.240 - 25.241 - if ( unlikely(flags & PG_need_flush) ) 25.242 - { 25.243 - deferred_op[smp_processor_id()].flush_tlb = 1; 25.244 - page->flags &= ~PG_need_flush; 25.245 - perfc_incrc(need_flush_tlb_flush); 25.246 - } 25.247 - 25.248 - page->flags &= ~PG_type_mask; 25.249 - page->flags |= type; 25.250 + MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr); 25.251 + return 0; 25.252 } 25.253 25.254 - get_page_tot(page); 25.255 - return get_page_type(page); 25.256 + return 1; 25.257 +} 25.258 + 25.259 + 25.260 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 25.261 + unsigned int type) 25.262 +{ 25.263 + struct pfn_info *page = &frame_table[page_nr]; 25.264 + 25.265 + if ( unlikely(!get_page_from_pagenr(page_nr)) ) 25.266 + return 0; 25.267 + 25.268 + if ( 
unlikely(!get_page_type(page, type)) ) 25.269 + { 25.270 + MEM_LOG("Bad page type for pfn %08lx (%08lx)", 25.271 + page_nr, page->type_and_flags); 25.272 + put_page(page); 25.273 + return 0; 25.274 + } 25.275 + 25.276 + return 1; 25.277 } 25.278 25.279 25.280 -/* Return new refcnt, or -1 on error. */ 25.281 -static int dec_page_refcnt(unsigned long page_nr, unsigned int type) 25.282 +/* 25.283 + * We allow an L2 table to map itself, to achieve a linear p.t. Note that this 25.284 + * does not raise any reference counts. 25.285 + */ 25.286 +static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn) 25.287 +{ 25.288 + if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) 25.289 + { 25.290 + MEM_LOG("Attempt to create linear p.t. with write perms"); 25.291 + return 0; 25.292 + } 25.293 + 25.294 + if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 25.295 + { 25.296 + MEM_LOG("L2 tables may not map _other_ L2 tables!\n"); 25.297 + return 0; 25.298 + } 25.299 + 25.300 + return 1; 25.301 +} 25.302 + 25.303 + 25.304 +static int get_page_from_l1e(l1_pgentry_t l1e) 25.305 +{ 25.306 + ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT); 25.307 + 25.308 + if ( unlikely((l1_pgentry_val(l1e) & 25.309 + (_PAGE_GLOBAL|_PAGE_PAT))) ) 25.310 + { 25.311 + MEM_LOG("Bad L1 page type settings %04lx", 25.312 + l1_pgentry_val(l1e) & 25.313 + (_PAGE_GLOBAL|_PAGE_PAT)); 25.314 + return 0; 25.315 + } 25.316 + 25.317 + if ( l1_pgentry_val(l1e) & _PAGE_RW ) 25.318 + { 25.319 + if ( unlikely(!get_page_and_type_from_pagenr( 25.320 + l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) ) 25.321 + return 0; 25.322 + set_bit(_PGC_tlb_flush_on_type_change, 25.323 + &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags); 25.324 + } 25.325 + else 25.326 + { 25.327 + if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) ) 25.328 + return 0; 25.329 + } 25.330 + 25.331 + return 1; 25.332 +} 25.333 + 25.334 + 25.335 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ 25.336 +static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 25.337 +{ 25.338 + ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT); 25.339 + 25.340 + if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 25.341 + { 25.342 + MEM_LOG("Bad L2 page type settings %04lx", 25.343 + l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); 25.344 + return 0; 25.345 + } 25.346 + 25.347 + if ( unlikely(!get_page_and_type_from_pagenr( 25.348 + l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) && 25.349 + unlikely(!check_linear_pagetable(l2e, pfn)) ) 25.350 + return 0; 25.351 + 25.352 + return 1; 25.353 +} 25.354 + 25.355 + 25.356 +static void put_page_from_l1e(l1_pgentry_t l1e) 25.357 { 25.358 struct pfn_info *page; 25.359 25.360 - if ( unlikely(page_nr >= max_page) ) 25.361 + ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT); 25.362 + 25.363 + page = &frame_table[l1_pgentry_to_pagenr(l1e)]; 25.364 + 25.365 + if ( l1_pgentry_val(l1e) & _PAGE_RW ) 25.366 { 25.367 - MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); 25.368 - return -1; 25.369 + put_page_and_type(page); 25.370 } 25.371 - page = frame_table + page_nr; 25.372 - if ( unlikely(!DOMAIN_OKAY(page->flags)) || 25.373 - unlikely(((page->flags & PG_type_mask) != type)) ) 25.374 + else 25.375 { 25.376 - MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)", 25.377 - page->flags & PG_domain_mask, page->flags & PG_type_mask, 25.378 - type); 25.379 - return -1; 25.380 + /* We expect this is rare so we blow the entire shadow LDT. 
*/ 25.381 + if ( unlikely(((page->type_and_flags & PGT_type_mask) == 25.382 + PGT_ldt_page)) && 25.383 + unlikely(((page->type_and_flags & PGT_count_mask) != 0)) ) 25.384 + invalidate_shadow_ldt(); 25.385 + put_page(page); 25.386 } 25.387 - ASSERT(page_type_count(page) != 0); 25.388 - put_page_tot(page); 25.389 - return put_page_type(page); 25.390 +} 25.391 + 25.392 + 25.393 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ 25.394 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 25.395 +{ 25.396 + ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT); 25.397 + 25.398 + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 25.399 + ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) 25.400 + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); 25.401 } 25.402 25.403 25.404 -/* We allow a L2 table to map itself, to achieve a linear pagetable. */ 25.405 -/* NB. There's no need for a put_twisted_l2_table() function!! */ 25.406 -static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e) 25.407 +static int alloc_l2_table(struct pfn_info *page) 25.408 { 25.409 - unsigned long l2v = l2_pgentry_val(l2e); 25.410 + unsigned long page_nr = page - frame_table; 25.411 + l2_pgentry_t *pl2e, l2e; 25.412 + int i; 25.413 + 25.414 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 25.415 + 25.416 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 25.417 + { 25.418 + l2e = pl2e[i]; 25.419 + 25.420 + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 25.421 + continue; 25.422 25.423 - /* Clearly the mapping must be read-only :-) */ 25.424 - if ( (l2v & _PAGE_RW) ) 25.425 + if ( unlikely(!get_page_from_l2e(l2e, page_nr)) ) 25.426 + goto fail; 25.427 + } 25.428 + 25.429 + /* Now we add our private high mappings. */ 25.430 + memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 25.431 + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 25.432 + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 25.433 + pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = 25.434 + mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 25.435 + pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = 25.436 + mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) | 25.437 + __PAGE_HYPERVISOR); 25.438 + 25.439 + unmap_domain_mem(pl2e); 25.440 + return 1; 25.441 + 25.442 + fail: 25.443 + while ( i-- > 0 ) 25.444 { 25.445 - MEM_LOG("Attempt to install twisted L2 entry with write permissions"); 25.446 - return -1; 25.447 + l2e = pl2e[i]; 25.448 + if ( l2_pgentry_val(l2e) & _PAGE_PRESENT ) 25.449 + put_page_from_l2e(l2e, page_nr); 25.450 } 25.451 25.452 - /* This is a sufficient final check. */ 25.453 - if ( (l2v >> PAGE_SHIFT) != entry_pfn ) 25.454 - { 25.455 - MEM_LOG("L2 tables may not map _other_ L2 tables!\n"); 25.456 - return -1; 25.457 - } 25.458 - 25.459 - /* We don't bump the reference counts. */ 25.460 + unmap_domain_mem(pl2e); 25.461 return 0; 25.462 } 25.463 25.464 25.465 -static int get_l2_table(unsigned long page_nr) 25.466 +static int alloc_l1_table(struct pfn_info *page) 25.467 { 25.468 - struct pfn_info *page; 25.469 - struct task_struct *p; 25.470 - l2_pgentry_t *p_l2_entry, l2_entry; 25.471 - int i, ret=0; 25.472 - 25.473 - ret = inc_page_refcnt(page_nr, PGT_l2_page_table); 25.474 - if ( likely(ret != 0) ) return (ret < 0) ? ret : 0; 25.475 - 25.476 - /* NEW level-2 page table! Deal with every PDE in the table. 
*/ 25.477 - p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT); 25.478 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 25.479 + unsigned long page_nr = page - frame_table; 25.480 + l1_pgentry_t *pl1e, l1e; 25.481 + int i; 25.482 + 25.483 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 25.484 + 25.485 + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 25.486 { 25.487 - l2_entry = *p_l2_entry++; 25.488 - if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue; 25.489 - if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) ) 25.490 - { 25.491 - MEM_LOG("Bad L2 page type settings %04lx", 25.492 - l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)); 25.493 - ret = -1; 25.494 + l1e = pl1e[i]; 25.495 + 25.496 + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 25.497 + continue; 25.498 + 25.499 + if ( unlikely(!get_page_from_l1e(l1e)) ) 25.500 goto fail; 25.501 - } 25.502 - /* Assume we're mapping an L1 table, falling back to twisted L2. */ 25.503 - ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry)); 25.504 - if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry); 25.505 - if ( unlikely(ret) ) goto fail; 25.506 - } 25.507 - 25.508 - /* Now we simply slap in our high mapping. */ 25.509 - memcpy(p_l2_entry, 25.510 - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 25.511 - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); 25.512 - p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - 25.513 - DOMAIN_ENTRIES_PER_L2_PAGETABLE] = 25.514 - mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); 25.515 - 25.516 - /* 25.517 - * The per-domain PGD is slightly tricky, as we may not be executing 25.518 - * in the context of the correct domain (DOM0 builds pt's for others). 25.519 - */ 25.520 - page = frame_table + page_nr; 25.521 - if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL ) 25.522 - { 25.523 - p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) - 25.524 - DOMAIN_ENTRIES_PER_L2_PAGETABLE] = 25.525 - mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR); 25.526 - put_task_struct(p); 25.527 } 25.528 25.529 - out: 25.530 - unmap_domain_mem(p_l2_entry); 25.531 - return ret; 25.532 + /* Make sure we unmap the right page! */ 25.533 + unmap_domain_mem(pl1e); 25.534 + return 1; 25.535 25.536 fail: 25.537 - p_l2_entry--; 25.538 while ( i-- > 0 ) 25.539 { 25.540 - l2_entry = *--p_l2_entry; 25.541 - if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) 25.542 - put_l1_table(l2_pgentry_to_pagenr(l2_entry)); 25.543 + l1e = pl1e[i]; 25.544 + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 25.545 + continue; 25.546 + put_page_from_l1e(l1e); 25.547 } 25.548 - if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 ) 25.549 - BUG(); 25.550 - goto out; 25.551 + 25.552 + unmap_domain_mem(pl1e); 25.553 + return 0; 25.554 } 25.555 25.556 25.557 -static int get_l1_table(unsigned long page_nr) 25.558 +static void free_l2_table(struct pfn_info *page) 25.559 { 25.560 - l1_pgentry_t *p_l1_entry, l1_entry; 25.561 - int i, ret; 25.562 + unsigned long page_nr = page - frame_table; 25.563 + l2_pgentry_t *pl2e, l2e; 25.564 + int i; 25.565 + 25.566 + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); 25.567 25.568 - /* Update ref count for page pointed at by PDE. */ 25.569 - ret = inc_page_refcnt(page_nr, PGT_l1_page_table); 25.570 - if ( likely(ret != 0) ) return (ret < 0) ? 
ret : 0; 25.571 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 25.572 + { 25.573 + l2e = pl2e[i]; 25.574 + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 25.575 + unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) ) 25.576 + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); 25.577 + } 25.578 25.579 - /* NEW level-1 page table! Deal with every PTE in the table. */ 25.580 - p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT); 25.581 + unmap_domain_mem(pl2e); 25.582 +} 25.583 + 25.584 + 25.585 +static void free_l1_table(struct pfn_info *page) 25.586 +{ 25.587 + unsigned long page_nr = page - frame_table; 25.588 + l1_pgentry_t *pl1e, l1e; 25.589 + int i; 25.590 + 25.591 + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); 25.592 + 25.593 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 25.594 { 25.595 - l1_entry = *p_l1_entry++; 25.596 - if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue; 25.597 - if ( unlikely((l1_pgentry_val(l1_entry) & 25.598 - (_PAGE_GLOBAL|_PAGE_PAT))) ) 25.599 - { 25.600 - MEM_LOG("Bad L1 page type settings %04lx", 25.601 - l1_pgentry_val(l1_entry) & 25.602 - (_PAGE_GLOBAL|_PAGE_PAT)); 25.603 - ret = -1; 25.604 - goto fail; 25.605 - } 25.606 - ret = get_page(l1_pgentry_to_pagenr(l1_entry), 25.607 - l1_pgentry_val(l1_entry) & _PAGE_RW); 25.608 - if ( unlikely(ret) ) goto fail; 25.609 + l1e = pl1e[i]; 25.610 + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 25.611 + continue; 25.612 + put_page_from_l1e(l1e); 25.613 } 25.614 25.615 - /* Make sure we unmap the right page! */ 25.616 - unmap_domain_mem(p_l1_entry-1); 25.617 - return ret; 25.618 + unmap_domain_mem(pl1e); 25.619 +} 25.620 + 25.621 25.622 - fail: 25.623 - p_l1_entry--; 25.624 - while ( i-- > 0 ) 25.625 - { 25.626 - l1_entry = *--p_l1_entry; 25.627 - if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 25.628 - put_page(l1_pgentry_to_pagenr(l1_entry), 25.629 - l1_pgentry_val(l1_entry) & _PAGE_RW); 25.630 - } 25.631 - if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 ) 25.632 - BUG(); 25.633 - unmap_domain_mem(p_l1_entry); 25.634 - return ret; 25.635 +static inline int update_l2e(l2_pgentry_t *pl2e, 25.636 + l2_pgentry_t ol2e, 25.637 + l2_pgentry_t nl2e) 25.638 +{ 25.639 + unsigned long o = cmpxchg((unsigned long *)pl2e, 25.640 + l2_pgentry_val(ol2e), 25.641 + l2_pgentry_val(nl2e)); 25.642 + if ( o != l2_pgentry_val(ol2e) ) 25.643 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 25.644 + l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); 25.645 + return (o == l2_pgentry_val(ol2e)); 25.646 } 25.647 25.648 25.649 -static int get_page(unsigned long page_nr, int writeable) 25.650 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ 25.651 +static int mod_l2_entry(l2_pgentry_t *pl2e, 25.652 + l2_pgentry_t nl2e, 25.653 + unsigned long pfn) 25.654 { 25.655 - struct pfn_info *page; 25.656 - unsigned long flags; 25.657 + l2_pgentry_t ol2e; 25.658 + unsigned long _ol2e; 25.659 25.660 - /* Update ref count for page pointed at by PTE. 
*/ 25.661 - if ( unlikely(page_nr >= max_page) ) 25.662 - { 25.663 - MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); 25.664 - return(-1); 25.665 - } 25.666 - page = frame_table + page_nr; 25.667 - flags = page->flags; 25.668 - if ( unlikely(!DOMAIN_OKAY(flags)) ) 25.669 + if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= 25.670 + DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 25.671 { 25.672 - MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask); 25.673 - return(-1); 25.674 - } 25.675 - 25.676 - if ( writeable ) 25.677 - { 25.678 - if ( (flags & PG_type_mask) != PGT_writeable_page ) 25.679 - { 25.680 - if ( page_type_count(page) != 0 ) 25.681 - { 25.682 - MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld", 25.683 - flags & PG_type_mask, PGT_writeable_page, 25.684 - page_type_count(page)); 25.685 - return(-1); 25.686 - } 25.687 - page->flags &= ~PG_type_mask; 25.688 - page->flags |= PGT_writeable_page; 25.689 - } 25.690 - page->flags |= PG_need_flush; 25.691 - get_page_type(page); 25.692 + MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e); 25.693 + return 0; 25.694 } 25.695 25.696 - get_page_tot(page); 25.697 - 25.698 - return(0); 25.699 -} 25.700 - 25.701 - 25.702 -static void put_l2_table(unsigned long page_nr) 25.703 -{ 25.704 - l2_pgentry_t *p_l2_entry, l2_entry; 25.705 - int i; 25.706 + if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) 25.707 + return 0; 25.708 + ol2e = mk_l2_pgentry(_ol2e); 25.709 25.710 - if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return; 25.711 + if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) 25.712 + { 25.713 + /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 25.714 + if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 ) 25.715 + { 25.716 + if ( unlikely(!get_page_from_l2e(nl2e, pfn)) ) 25.717 + return 0; 25.718 25.719 - /* We had last reference to level-2 page table. Free the PDEs. */ 25.720 - p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT); 25.721 - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) 25.722 - { 25.723 - l2_entry = *p_l2_entry++; 25.724 - if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) 25.725 - put_l1_table(l2_pgentry_to_pagenr(l2_entry)); 25.726 - } 25.727 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 25.728 + { 25.729 + put_page_from_l2e(nl2e, pfn); 25.730 + return 0; 25.731 + } 25.732 25.733 - unmap_domain_mem(p_l2_entry); 25.734 -} 25.735 - 25.736 - 25.737 -static void put_l1_table(unsigned long page_nr) 25.738 -{ 25.739 - l1_pgentry_t *p_l1_entry, l1_entry; 25.740 - int i; 25.741 - 25.742 - if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return; 25.743 + if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT ) 25.744 + put_page_from_l2e(ol2e, pfn); 25.745 + } 25.746 + else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 25.747 + { 25.748 + return 0; 25.749 + } 25.750 + } 25.751 + else 25.752 + { 25.753 + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) 25.754 + return 0; 25.755 25.756 - /* We had last reference to level-1 page table. Free the PTEs. */ 25.757 - p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT); 25.758 - for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) 25.759 - { 25.760 - l1_entry = *p_l1_entry++; 25.761 - if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 25.762 - put_page(l1_pgentry_to_pagenr(l1_entry), 25.763 - l1_pgentry_val(l1_entry) & _PAGE_RW); 25.764 + if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT ) 25.765 + put_page_from_l2e(ol2e, pfn); 25.766 } 25.767 - 25.768 - /* Make sure we unmap the right page! 
*/ 25.769 - unmap_domain_mem(p_l1_entry-1); 25.770 + 25.771 + return 1; 25.772 } 25.773 25.774 25.775 -static void put_page(unsigned long page_nr, int writeable) 25.776 +static inline int update_l1e(l1_pgentry_t *pl1e, 25.777 + l1_pgentry_t ol1e, 25.778 + l1_pgentry_t nl1e) 25.779 { 25.780 - struct pfn_info *page; 25.781 - ASSERT(page_nr < max_page); 25.782 - page = frame_table + page_nr; 25.783 - ASSERT(DOMAIN_OKAY(page->flags)); 25.784 - ASSERT((!writeable) || 25.785 - ((page_type_count(page) != 0) && 25.786 - ((page->flags & PG_type_mask) == PGT_writeable_page) && 25.787 - ((page->flags & PG_need_flush) == PG_need_flush))); 25.788 - if ( writeable ) 25.789 + unsigned long o = l1_pgentry_val(ol1e); 25.790 + unsigned long n = l1_pgentry_val(nl1e); 25.791 + 25.792 + while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) 25.793 { 25.794 - put_page_type(page); 25.795 + unsigned int cpu = smp_processor_id(); 25.796 + /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */ 25.797 + if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 ) 25.798 + { 25.799 + MEM_LOG("cmpxchg fault despite WP bit cleared\n"); 25.800 + return 0; 25.801 + } 25.802 + deferred_op[cpu].cr0 = read_cr0(); 25.803 + write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP); 25.804 + deferred_op[cpu].flags |= DOP_RESTORE_CR0; 25.805 } 25.806 - else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) && 25.807 - (page_type_count(page) != 0)) ) 25.808 - { 25.809 - /* We expect this is rare so we just blow the entire shadow LDT. */ 25.810 - invalidate_shadow_ldt(); 25.811 - } 25.812 - put_page_tot(page); 25.813 + 25.814 + if ( o != l1_pgentry_val(ol1e)) 25.815 + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", 25.816 + l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); 25.817 + 25.818 + /* The swap was successful if the old value we saw is equal to ol1e. */ 25.819 + return (o == l1_pgentry_val(ol1e)); 25.820 } 25.821 25.822 25.823 -static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry) 25.824 +/* Update the L1 entry at pl1e to new value nl1e. */ 25.825 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) 25.826 { 25.827 - l2_pgentry_t old_l2_entry = *p_l2_entry; 25.828 + l1_pgentry_t ol1e; 25.829 + unsigned long _ol1e; 25.830 25.831 - if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >= 25.832 - DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) 25.833 - { 25.834 - MEM_LOG("Illegal L2 update attempt in hypervisor area %p", 25.835 - p_l2_entry); 25.836 - goto fail; 25.837 - } 25.838 - 25.839 - if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) ) 25.840 + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) 25.841 { 25.842 - if ( unlikely((l2_pgentry_val(new_l2_entry) & 25.843 - (_PAGE_GLOBAL|_PAGE_PSE))) ) 25.844 - { 25.845 - MEM_LOG("Bad L2 entry val %04lx", 25.846 - l2_pgentry_val(new_l2_entry) & 25.847 - (_PAGE_GLOBAL|_PAGE_PSE)); 25.848 - goto fail; 25.849 - } 25.850 - /* Differ in mapping (bits 12-31) or presence (bit 0)? */ 25.851 - if ( ((l2_pgentry_val(old_l2_entry) ^ 25.852 - l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 ) 25.853 - { 25.854 - /* Assume we're mapping an L1 table, falling back to twisted L2. */ 25.855 - if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) ) 25.856 - { 25.857 - /* NB. No need to sanity-check the VA: done already. 
*/ 25.858 - unsigned long l1e = l1_pgentry_val( 25.859 - linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]); 25.860 - if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) ) 25.861 - goto fail; 25.862 - } 25.863 - 25.864 - if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) 25.865 - put_l1_table(l2_pgentry_to_pagenr(old_l2_entry)); 25.866 - } 25.867 - } 25.868 - else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) 25.869 - { 25.870 - put_l1_table(l2_pgentry_to_pagenr(old_l2_entry)); 25.871 + MEM_LOG("Bad get_user\n"); 25.872 + return 0; 25.873 } 25.874 25.875 - *p_l2_entry = new_l2_entry; 25.876 - return 0; 25.877 - 25.878 - fail: 25.879 - return -1; 25.880 -} 25.881 - 25.882 + ol1e = mk_l1_pgentry(_ol1e); 25.883 25.884 -static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry) 25.885 -{ 25.886 - l1_pgentry_t old_l1_entry = *p_l1_entry; 25.887 - 25.888 - if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) ) 25.889 + if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) 25.890 { 25.891 - if ( unlikely((l1_pgentry_val(new_l1_entry) & 25.892 - (_PAGE_GLOBAL|_PAGE_PAT))) ) 25.893 - { 25.894 - MEM_LOG("Bad L1 entry val %04lx", 25.895 - l1_pgentry_val(new_l1_entry) & 25.896 - (_PAGE_GLOBAL|_PAGE_PAT)); 25.897 - goto fail; 25.898 - } 25.899 /* 25.900 * Differ in mapping (bits 12-31), writeable (bit 1), or 25.901 * presence (bit 0)? 25.902 */ 25.903 - if ( ((l1_pgentry_val(old_l1_entry) ^ 25.904 - l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 ) 25.905 + if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 ) 25.906 { 25.907 - if ( get_page(l1_pgentry_to_pagenr(new_l1_entry), 25.908 - l1_pgentry_val(new_l1_entry) & _PAGE_RW) ) 25.909 - goto fail; 25.910 + if ( unlikely(!get_page_from_l1e(nl1e)) ) 25.911 + return 0; 25.912 + 25.913 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 25.914 + { 25.915 + put_page_from_l1e(nl1e); 25.916 + return 0; 25.917 + } 25.918 25.919 - if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) 25.920 - put_page(l1_pgentry_to_pagenr(old_l1_entry), 25.921 - l1_pgentry_val(old_l1_entry) & _PAGE_RW); 25.922 - } 25.923 + if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT ) 25.924 + put_page_from_l1e(ol1e); 25.925 + } 25.926 + else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 25.927 + { 25.928 + return 0; 25.929 + } 25.930 } 25.931 - else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) 25.932 + else 25.933 { 25.934 - put_page(l1_pgentry_to_pagenr(old_l1_entry), 25.935 - l1_pgentry_val(old_l1_entry) & _PAGE_RW); 25.936 + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) 25.937 + return 0; 25.938 + 25.939 + if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT ) 25.940 + put_page_from_l1e(ol1e); 25.941 } 25.942 25.943 - *p_l1_entry = new_l1_entry; 25.944 - return 0; 25.945 + return 1; 25.946 +} 25.947 + 25.948 + 25.949 +int alloc_page_type(struct pfn_info *page, unsigned int type) 25.950 +{ 25.951 + if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 25.952 + &page->count_and_flags)) ) 25.953 + { 25.954 + struct task_struct *p = page->u.domain; 25.955 + mb(); /* Check zombie status before using domain ptr. */ 25.956 + /* 25.957 + * NB. 'p' may no longer be valid by time we dereference it, so 25.958 + * p->processor might be garbage. We clamp it, just in case. 
25.959 + */ 25.960 + if ( !test_bit(_PGC_zombie, &page->count_and_flags) && 25.961 + unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)], 25.962 + page->tlbflush_timestamp)) ) 25.963 + { 25.964 + perfc_incr(need_flush_tlb_flush); 25.965 + flush_tlb_cpu(p->processor); 25.966 + } 25.967 + } 25.968 25.969 - fail: 25.970 - return -1; 25.971 + switch ( type ) 25.972 + { 25.973 + case PGT_l1_page_table: 25.974 + return alloc_l1_table(page); 25.975 + case PGT_l2_page_table: 25.976 + return alloc_l2_table(page); 25.977 + case PGT_gdt_page: 25.978 + case PGT_ldt_page: 25.979 + return alloc_segdesc_page(page); 25.980 + default: 25.981 + BUG(); 25.982 + } 25.983 + 25.984 + return 0; 25.985 +} 25.986 + 25.987 + 25.988 +void free_page_type(struct pfn_info *page, unsigned int type) 25.989 +{ 25.990 + switch ( type ) 25.991 + { 25.992 + case PGT_l1_page_table: 25.993 + return free_l1_table(page); 25.994 + case PGT_l2_page_table: 25.995 + return free_l2_table(page); 25.996 + default: 25.997 + BUG(); 25.998 + } 25.999 } 25.1000 25.1001 25.1002 static int do_extended_command(unsigned long ptr, unsigned long val) 25.1003 { 25.1004 - int err = 0, cpu = smp_processor_id(); 25.1005 + int okay = 1, cpu = smp_processor_id(); 25.1006 unsigned int cmd = val & MMUEXT_CMD_MASK; 25.1007 unsigned long pfn = ptr >> PAGE_SHIFT; 25.1008 - struct pfn_info *page = frame_table + pfn; 25.1009 + struct pfn_info *page = &frame_table[pfn]; 25.1010 25.1011 /* 'ptr' must be in range except where it isn't a machine address. */ 25.1012 if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) ) 25.1013 + { 25.1014 + MEM_LOG("Ptr out of range for extended MMU command"); 25.1015 return 1; 25.1016 + } 25.1017 25.1018 switch ( cmd ) 25.1019 { 25.1020 case MMUEXT_PIN_L1_TABLE: 25.1021 - if ( unlikely(page->flags & PG_guest_pinned) ) 25.1022 - { 25.1023 - MEM_LOG("Pfn %08lx already pinned", pfn); 25.1024 - err = 1; 25.1025 - break; 25.1026 - } 25.1027 - err = get_l1_table(pfn); 25.1028 - goto mark_as_pinned; 25.1029 - 25.1030 case MMUEXT_PIN_L2_TABLE: 25.1031 - if ( unlikely(page->flags & PG_guest_pinned) ) 25.1032 - { 25.1033 - MEM_LOG("Pfn %08lx already pinned", pfn); 25.1034 - err = 1; 25.1035 - break; 25.1036 - } 25.1037 - err = get_l2_table(pfn); 25.1038 - 25.1039 - mark_as_pinned: 25.1040 - if ( unlikely(err) ) 25.1041 + okay = get_page_and_type_from_pagenr(pfn, 25.1042 + (cmd == MMUEXT_PIN_L2_TABLE) ? 
25.1043 + PGT_l2_page_table : 25.1044 + PGT_l1_page_table); 25.1045 + if ( unlikely(!okay) ) 25.1046 { 25.1047 MEM_LOG("Error while pinning pfn %08lx", pfn); 25.1048 break; 25.1049 } 25.1050 - page->flags |= PG_guest_pinned; 25.1051 + 25.1052 + if ( unlikely(test_and_set_bit(_PGC_guest_pinned, 25.1053 + &page->count_and_flags)) ) 25.1054 + { 25.1055 + MEM_LOG("Pfn %08lx already pinned", pfn); 25.1056 + put_page_and_type(page); 25.1057 + okay = 0; 25.1058 + break; 25.1059 + } 25.1060 + 25.1061 break; 25.1062 25.1063 case MMUEXT_UNPIN_TABLE: 25.1064 - if ( unlikely(!DOMAIN_OKAY(page->flags)) ) 25.1065 + if ( unlikely(!(okay = get_page_from_pagenr(pfn))) ) 25.1066 { 25.1067 - err = 1; 25.1068 - MEM_LOG("Page %08lx bad domain (dom=%ld)", 25.1069 - ptr, page->flags & PG_domain_mask); 25.1070 + MEM_LOG("Page %08lx bad domain (dom=%p)", 25.1071 + ptr, page->u.domain); 25.1072 } 25.1073 - else if ( likely(page->flags & PG_guest_pinned) ) 25.1074 + else if ( likely(test_and_clear_bit(_PGC_guest_pinned, 25.1075 + &page->count_and_flags)) ) 25.1076 { 25.1077 - page->flags &= ~PG_guest_pinned; 25.1078 - ((page->flags & PG_type_mask) == PGT_l1_page_table) ? 25.1079 - put_l1_table(pfn) : put_l2_table(pfn); 25.1080 + put_page_and_type(page); 25.1081 } 25.1082 else 25.1083 { 25.1084 - err = 1; 25.1085 + okay = 0; 25.1086 MEM_LOG("Pfn %08lx not pinned", pfn); 25.1087 } 25.1088 break; 25.1089 25.1090 case MMUEXT_NEW_BASEPTR: 25.1091 - err = get_l2_table(pfn); 25.1092 - if ( !err ) 25.1093 + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table); 25.1094 + if ( likely(okay) ) 25.1095 { 25.1096 - put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT); 25.1097 + put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable) 25.1098 + >> PAGE_SHIFT]); 25.1099 current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); 25.1100 invalidate_shadow_ldt(); 25.1101 - deferred_op[cpu].flush_tlb = 1; 25.1102 + deferred_op[cpu].flags |= DOP_FLUSH_TLB; 25.1103 } 25.1104 else 25.1105 { 25.1106 - MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err); 25.1107 + MEM_LOG("Error while installing new baseptr %08lx", ptr); 25.1108 } 25.1109 break; 25.1110 25.1111 case MMUEXT_TLB_FLUSH: 25.1112 - deferred_op[cpu].flush_tlb = 1; 25.1113 + deferred_op[cpu].flags |= DOP_FLUSH_TLB; 25.1114 break; 25.1115 25.1116 case MMUEXT_INVLPG: 25.1117 @@ -815,7 +867,7 @@ static int do_extended_command(unsigned 25.1118 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || 25.1119 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) 25.1120 { 25.1121 - err = 1; 25.1122 + okay = 0; 25.1123 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); 25.1124 } 25.1125 else if ( (current->mm.ldt_ents != ents) || 25.1126 @@ -825,37 +877,39 @@ static int do_extended_command(unsigned 25.1127 current->mm.ldt_base = ptr; 25.1128 current->mm.ldt_ents = ents; 25.1129 load_LDT(current); 25.1130 - deferred_op[cpu].refresh_ldt = (ents != 0); 25.1131 + deferred_op[cpu].flags &= ~DOP_RELOAD_LDT; 25.1132 + if ( ents != 0 ) 25.1133 + deferred_op[cpu].flags |= DOP_RELOAD_LDT; 25.1134 } 25.1135 break; 25.1136 } 25.1137 25.1138 default: 25.1139 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); 25.1140 - err = 1; 25.1141 + okay = 0; 25.1142 break; 25.1143 } 25.1144 25.1145 - return err; 25.1146 + return okay; 25.1147 } 25.1148 25.1149 25.1150 int do_mmu_update(mmu_update_t *ureqs, int count) 25.1151 { 25.1152 mmu_update_t req; 25.1153 - unsigned long flags, pfn, l1e; 25.1154 + unsigned long va = 0, flags, pfn, prev_pfn = 0; 25.1155 struct 
pfn_info *page; 25.1156 - int rc = 0, err = 0, i, cpu = smp_processor_id(); 25.1157 + int rc = 0, okay = 1, i, cpu = smp_processor_id(); 25.1158 unsigned int cmd; 25.1159 - unsigned long cr0 = 0; 25.1160 25.1161 - perfc_incrc( calls_to_mmu_update ); 25.1162 - perfc_addc( num_page_updates, count ); 25.1163 + perfc_incrc(calls_to_mmu_update); 25.1164 + perfc_addc(num_page_updates, count); 25.1165 25.1166 for ( i = 0; i < count; i++ ) 25.1167 { 25.1168 if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) ) 25.1169 { 25.1170 + MEM_LOG("Bad copy_from_user"); 25.1171 rc = -EFAULT; 25.1172 break; 25.1173 } 25.1174 @@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1175 cmd = req.ptr & (sizeof(l1_pgentry_t)-1); 25.1176 pfn = req.ptr >> PAGE_SHIFT; 25.1177 25.1178 - err = 1; 25.1179 - 25.1180 - spin_lock(¤t->page_lock); 25.1181 + okay = 0; 25.1182 25.1183 - /* Get the page-frame number that a non-extended command references. */ 25.1184 - if ( (cmd == MMU_NORMAL_PT_UPDATE) || 25.1185 - (cmd == MMU_UNCHECKED_PT_UPDATE) ) 25.1186 - { 25.1187 - if ( cr0 == 0 ) 25.1188 - { 25.1189 - cr0 = read_cr0(); 25.1190 - write_cr0(cr0 & ~X86_CR0_WP); 25.1191 - } 25.1192 - /* Need to use 'get_user' since the VA's PGD may be absent. */ 25.1193 - __get_user(l1e, (unsigned long *)(linear_pg_table+pfn)); 25.1194 - /* Now check that the VA's PTE isn't absent. */ 25.1195 - if ( unlikely(!(l1e & _PAGE_PRESENT)) ) 25.1196 - { 25.1197 - MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e); 25.1198 - goto unlock; 25.1199 - } 25.1200 - /* Finally, get the underlying machine address. */ 25.1201 - pfn = l1e >> PAGE_SHIFT; 25.1202 - } 25.1203 - 25.1204 - /* Least significant bits of 'ptr' demux the operation type. */ 25.1205 switch ( cmd ) 25.1206 { 25.1207 /* 25.1208 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 
25.1209 */ 25.1210 case MMU_NORMAL_PT_UPDATE: 25.1211 - page = frame_table + pfn; 25.1212 - flags = page->flags; 25.1213 + page = &frame_table[pfn]; 25.1214 25.1215 - if ( likely(DOMAIN_OKAY(flags)) ) 25.1216 + if ( unlikely(!get_page(page, current)) && 25.1217 + ((current->domain != 0) || !dom0_get_page(page)) ) 25.1218 { 25.1219 - switch ( (flags & PG_type_mask) ) 25.1220 - { 25.1221 - case PGT_l1_page_table: 25.1222 - err = mod_l1_entry((l1_pgentry_t *)req.ptr, 25.1223 - mk_l1_pgentry(req.val)); 25.1224 - break; 25.1225 - case PGT_l2_page_table: 25.1226 - err = mod_l2_entry((l2_pgentry_t *)req.ptr, 25.1227 - mk_l2_pgentry(req.val)); 25.1228 - break; 25.1229 - default: 25.1230 - if ( page_type_count(page) == 0 ) 25.1231 - { 25.1232 - *(unsigned long *)req.ptr = req.val; 25.1233 - err = 0; 25.1234 - } 25.1235 - else 25.1236 - MEM_LOG("Update to bad page %08lx", req.ptr); 25.1237 - break; 25.1238 - } 25.1239 + MEM_LOG("Could not get page for normal update"); 25.1240 + break; 25.1241 + } 25.1242 + 25.1243 + if ( likely(prev_pfn == pfn) ) 25.1244 + { 25.1245 + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 25.1246 } 25.1247 else 25.1248 { 25.1249 - MEM_LOG("Bad domain normal update (dom %d, pfn %ld)", 25.1250 - current->domain, pfn); 25.1251 + if ( prev_pfn != 0 ) 25.1252 + unmap_domain_mem((void *)va); 25.1253 + va = (unsigned long)map_domain_mem(req.ptr); 25.1254 + prev_pfn = pfn; 25.1255 } 25.1256 + 25.1257 + switch ( (page->type_and_flags & PGT_type_mask) ) 25.1258 + { 25.1259 + case PGT_l1_page_table: 25.1260 + if ( likely(get_page_type(page, PGT_l1_page_table)) ) 25.1261 + { 25.1262 + okay = mod_l1_entry((l1_pgentry_t *)va, 25.1263 + mk_l1_pgentry(req.val)); 25.1264 + put_page_type(page); 25.1265 + } 25.1266 + break; 25.1267 + case PGT_l2_page_table: 25.1268 + if ( likely(get_page_type(page, PGT_l2_page_table)) ) 25.1269 + { 25.1270 + okay = mod_l2_entry((l2_pgentry_t *)va, 25.1271 + mk_l2_pgentry(req.val), 25.1272 + pfn); 25.1273 + put_page_type(page); 25.1274 + } 25.1275 + break; 25.1276 + default: 25.1277 + if ( likely(get_page_type(page, PGT_writeable_page)) ) 25.1278 + { 25.1279 + *(unsigned long *)va = req.val; 25.1280 + okay = 1; 25.1281 + put_page_type(page); 25.1282 + } 25.1283 + break; 25.1284 + } 25.1285 + 25.1286 + put_page(page); 25.1287 + 25.1288 break; 25.1289 25.1290 case MMU_UNCHECKED_PT_UPDATE: 25.1291 req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 25.1292 if ( likely(IS_PRIV(current)) ) 25.1293 { 25.1294 - *(unsigned long *)req.ptr = req.val; 25.1295 - err = 0; 25.1296 + if ( likely(prev_pfn == pfn) ) 25.1297 + { 25.1298 + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); 25.1299 + } 25.1300 + else 25.1301 + { 25.1302 + if ( prev_pfn != 0 ) 25.1303 + unmap_domain_mem((void *)va); 25.1304 + va = (unsigned long)map_domain_mem(req.ptr); 25.1305 + prev_pfn = pfn; 25.1306 + } 25.1307 + *(unsigned long *)va = req.val; 25.1308 + okay = 1; 25.1309 } 25.1310 else 25.1311 { 25.1312 @@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1313 break; 25.1314 25.1315 case MMU_MACHPHYS_UPDATE: 25.1316 - page = frame_table + pfn; 25.1317 + page = &frame_table[pfn]; 25.1318 if ( unlikely(pfn >= max_page) ) 25.1319 { 25.1320 MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page); 25.1321 } 25.1322 - else if ( likely(DOMAIN_OKAY(page->flags)) ) 25.1323 + else if ( likely(get_page(page, current)) || 25.1324 + ((current->domain == 0) && dom0_get_page(page)) ) 25.1325 { 25.1326 machine_to_phys_mapping[pfn] = req.val; 25.1327 - err = 0; 25.1328 + okay = 1; 25.1329 + 
put_page(page); 25.1330 } 25.1331 - else 25.1332 - { 25.1333 - MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)", 25.1334 - current->domain, pfn); 25.1335 - } 25.1336 break; 25.1337 25.1338 /* 25.1339 @@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1340 */ 25.1341 case MMU_EXTENDED_COMMAND: 25.1342 req.ptr &= ~(sizeof(l1_pgentry_t) - 1); 25.1343 - err = do_extended_command(req.ptr, req.val); 25.1344 + okay = do_extended_command(req.ptr, req.val); 25.1345 break; 25.1346 25.1347 default: 25.1348 @@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1349 break; 25.1350 } 25.1351 25.1352 - unlock: 25.1353 - spin_unlock(¤t->page_lock); 25.1354 - 25.1355 - if ( unlikely(err) ) 25.1356 + if ( unlikely(!okay) ) 25.1357 { 25.1358 rc = -EINVAL; 25.1359 break; 25.1360 @@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1361 ureqs++; 25.1362 } 25.1363 25.1364 - if ( deferred_op[cpu].flush_tlb ) 25.1365 - { 25.1366 - deferred_op[cpu].flush_tlb = 0; 25.1367 - __write_cr3_counted(pagetable_val(current->mm.pagetable)); 25.1368 - } 25.1369 + if ( prev_pfn != 0 ) 25.1370 + unmap_domain_mem((void *)va); 25.1371 + 25.1372 + flags = deferred_op[cpu].flags; 25.1373 + deferred_op[cpu].flags = 0; 25.1374 25.1375 - if ( deferred_op[cpu].refresh_ldt ) 25.1376 - { 25.1377 - deferred_op[cpu].refresh_ldt = 0; 25.1378 + if ( flags & DOP_FLUSH_TLB ) 25.1379 + write_cr3_counted(pagetable_val(current->mm.pagetable)); 25.1380 + 25.1381 + if ( flags & DOP_RELOAD_LDT ) 25.1382 (void)map_ldt_shadow_page(0); 25.1383 - } 25.1384 25.1385 - if ( cr0 != 0 ) 25.1386 - write_cr0(cr0); 25.1387 + if ( unlikely(flags & DOP_RESTORE_CR0) ) 25.1388 + write_cr0(deferred_op[cpu].cr0); 25.1389 25.1390 return rc; 25.1391 } 25.1392 @@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, i 25.1393 25.1394 int do_update_va_mapping(unsigned long page_nr, 25.1395 unsigned long val, 25.1396 - unsigned long flags) 25.1397 + unsigned long caller_flags) 25.1398 { 25.1399 - unsigned long _x, cr0 = 0; 25.1400 struct task_struct *p = current; 25.1401 - int err = -EINVAL; 25.1402 + int err = 0; 25.1403 + unsigned int cpu = p->processor; 25.1404 + unsigned long defer_flags; 25.1405 25.1406 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) 25.1407 - goto out; 25.1408 - 25.1409 - spin_lock(&p->page_lock); 25.1410 + return -EINVAL; 25.1411 25.1412 - /* Check that the VA's page-directory entry is present.. */ 25.1413 - if ( unlikely((err = __get_user(_x, (unsigned long *) 25.1414 - (&linear_pg_table[page_nr]))) != 0) ) 25.1415 - goto unlock_and_out; 25.1416 + if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 25.1417 + mk_l1_pgentry(val))) ) 25.1418 + err = -EINVAL; 25.1419 25.1420 - /* If the VA's page-directory entry is read-only, we frob the WP bit. 
*/ 25.1421 - if ( unlikely(__put_user(_x, (unsigned long *) 25.1422 - (&linear_pg_table[page_nr]))) ) 25.1423 - { 25.1424 - cr0 = read_cr0(); 25.1425 - write_cr0(cr0 & ~X86_CR0_WP); 25.1426 - } 25.1427 + defer_flags = deferred_op[cpu].flags; 25.1428 + deferred_op[cpu].flags = 0; 25.1429 25.1430 - if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr], 25.1431 - mk_l1_pgentry(val)) != 0) ) 25.1432 - { 25.1433 - err = -EINVAL; 25.1434 - goto check_cr0_unlock_and_out; 25.1435 - } 25.1436 - 25.1437 - if ( unlikely(flags & UVMF_INVLPG) ) 25.1438 + if ( unlikely(defer_flags & DOP_FLUSH_TLB) || 25.1439 + unlikely(caller_flags & UVMF_FLUSH_TLB) ) 25.1440 + write_cr3_counted(pagetable_val(p->mm.pagetable)); 25.1441 + else if ( unlikely(caller_flags & UVMF_INVLPG) ) 25.1442 __flush_tlb_one(page_nr << PAGE_SHIFT); 25.1443 25.1444 - if ( unlikely(flags & UVMF_FLUSH_TLB) ) 25.1445 - __write_cr3_counted(pagetable_val(p->mm.pagetable)); 25.1446 + if ( unlikely(defer_flags & DOP_RELOAD_LDT) ) 25.1447 + (void)map_ldt_shadow_page(0); 25.1448 25.1449 - check_cr0_unlock_and_out: 25.1450 - if ( unlikely(cr0 != 0) ) 25.1451 - write_cr0(cr0); 25.1452 - unlock_and_out: 25.1453 - spin_unlock(&p->page_lock); 25.1454 - out: 25.1455 + if ( unlikely(defer_flags & DOP_RESTORE_CR0) ) 25.1456 + write_cr0(deferred_op[cpu].cr0); 25.1457 + 25.1458 return err; 25.1459 }
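The rewritten memory.c pairs every present page-table entry with a reference taken by get_page_from_l1e()/get_page_from_l2e() and publishes updates with cmpxchg via update_l1e()/update_l2e(). The ordering is the key part: reference the new target first, compare-and-swap the slot, undo that reference if the swap loses, and only then drop the reference held by the entry that was displaced. The program below is a cut-down, single-level model of that discipline; the pte layout, the stand-in frame table and every 'model_' name are invented for illustration, and the real routines additionally validate flag bits, distinguish read-only from writable references, and defer TLB flushes and CR0.WP restoration through the per-CPU deferred_op flags.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long pte_t;
    #define PTE_PRESENT 0x1ul
    #define PTE_PFN(e)  ((e) >> 12)

    #define NFRAMES 16
    static _Atomic int refcnt[NFRAMES];          /* stand-in frame table */

    static bool model_get_ref(pte_t e)           /* like get_page_from_l1e() */
    {
        if ( PTE_PFN(e) >= NFRAMES )
            return false;                        /* frame out of range */
        atomic_fetch_add(&refcnt[PTE_PFN(e)], 1);
        return true;
    }

    static void model_put_ref(pte_t e)           /* like put_page_from_l1e() */
    {
        atomic_fetch_sub(&refcnt[PTE_PFN(e)], 1);
    }

    /* Reference the new target, publish with compare-and-swap, then drop
     * whichever reference has become surplus. */
    static bool model_mod_entry(_Atomic pte_t *slot, pte_t nl1e)
    {
        pte_t ol1e = atomic_load(slot);

        if ( nl1e & PTE_PRESENT )
        {
            if ( !model_get_ref(nl1e) )
                return false;
            if ( !atomic_compare_exchange_strong(slot, &ol1e, nl1e) )
            {
                model_put_ref(nl1e);             /* lost the race: undo */
                return false;
            }
        }
        else if ( !atomic_compare_exchange_strong(slot, &ol1e, nl1e) )
        {
            return false;
        }

        if ( ol1e & PTE_PRESENT )
            model_put_ref(ol1e);                 /* displaced mapping */
        return true;
    }

    int main(void)
    {
        _Atomic pte_t slot = 0;

        model_mod_entry(&slot, (3ul << 12) | PTE_PRESENT);
        model_mod_entry(&slot, (5ul << 12) | PTE_PRESENT);
        printf("frame 3 refs=%d, frame 5 refs=%d\n",   /* prints 0 and 1 */
               atomic_load(&refcnt[3]), atomic_load(&refcnt[5]));
        return 0;
    }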
26.1 --- a/xen/common/network.c Sat Dec 20 23:39:49 2003 +0000 26.2 +++ b/xen/common/network.c Sat Dec 20 23:41:19 2003 +0000 26.3 @@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain) 26.4 if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG(); 26.5 new_ring = (net_ring_t *)get_free_page(GFP_KERNEL); 26.6 clear_page(new_ring); 26.7 - SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain); 26.8 + SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p); 26.9 26.10 /* 26.11 * Fill in the new vif struct. Note that, while the vif's refcnt is
27.1 --- a/xen/common/page_alloc.c Sat Dec 20 23:39:49 2003 +0000 27.2 +++ b/xen/common/page_alloc.c Sat Dec 20 23:41:19 2003 +0000 27.3 @@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned 27.4 /* Release a PHYSICAL address range to the allocator. */ 27.5 void release_bytes_to_allocator(unsigned long min, unsigned long max) 27.6 { 27.7 - min = round_pgup (min) + PAGE_OFFSET; 27.8 - max = round_pgdown(max) + PAGE_OFFSET; 27.9 + min = round_pgup (min); 27.10 + max = round_pgdown(max); 27.11 27.12 while ( min < max ) 27.13 { 27.14 - __free_pages(min, 0); 27.15 + __free_pages(min+PAGE_OFFSET, 0); 27.16 min += PAGE_SIZE; 27.17 } 27.18 } 27.19 @@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask, 27.20 retry: 27.21 spin_lock_irqsave(&alloc_lock, flags); 27.22 27.23 - 27.24 /* Find smallest order which can satisfy the request. */ 27.25 for ( i = order; i < FREELIST_SIZE; i++ ) { 27.26 if ( !FREELIST_EMPTY(free_head[i]) )
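The page_alloc.c change keeps min and max as physical addresses for the rounding and the loop bound, adding PAGE_OFFSET only where each page is handed to __free_pages(). A small standalone sketch of the resulting arithmetic; the PAGE_OFFSET value, the rounding-macro definitions and the printf standing in for __free_pages() are assumptions for illustration:

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PAGE_SIZE    (1ul << PAGE_SHIFT)
    #define PAGE_MASK    (~(PAGE_SIZE - 1))
    #define PAGE_OFFSET  0xFC400000ul            /* hypothetical direct-map base */

    #define round_pgup(p)    (((p) + PAGE_SIZE - 1) & PAGE_MASK)
    #define round_pgdown(p)  ((p) & PAGE_MASK)

    static void release_bytes(unsigned long min, unsigned long max)
    {
        min = round_pgup(min);                   /* stays physical, page-aligned */
        max = round_pgdown(max);

        while ( min < max )
        {
            /* Convert to the direct-mapped virtual address only at the
             * point of use, as the updated loop above does. */
            printf("free vaddr %#lx (paddr %#lx)\n", min + PAGE_OFFSET, min);
            min += PAGE_SIZE;
        }
    }

    int main(void)
    {
        release_bytes(0x100123, 0x103000);       /* frees 0x101000 and 0x102000 */
        return 0;
    }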
28.1 --- a/xen/drivers/block/ll_rw_blk.c Sat Dec 20 23:39:49 2003 +0000 28.2 +++ b/xen/drivers/block/ll_rw_blk.c Sat Dec 20 23:41:19 2003 +0000 28.3 @@ -14,31 +14,15 @@ 28.4 #include <xeno/types.h> 28.5 #include <xeno/lib.h> 28.6 #include <xeno/sched.h> 28.7 -/*#include <xeno/kernel_stat.h>*/ 28.8 #include <xeno/errno.h> 28.9 -/*#include <xeno/locks.h>*/ 28.10 #include <xeno/mm.h> 28.11 -/*#include <xeno/swap.h>*/ 28.12 #include <xeno/init.h> 28.13 -/*#include <xeno/smp_lock.h>*/ 28.14 -/*#include <xeno/completion.h>*/ 28.15 - 28.16 #include <asm/system.h> 28.17 #include <asm/io.h> 28.18 #include <xeno/blk.h> 28.19 -/*#include <xeno/highmem.h>*/ 28.20 #include <xeno/slab.h> 28.21 #include <xeno/module.h> 28.22 28.23 -/* 28.24 - * KAF: We can turn off noise relating to barking guest-OS requests. 28.25 - */ 28.26 -#if 0 28.27 -#define DPRINTK(_f, _a...) printk(_f , ## _a) 28.28 -#else 28.29 -#define DPRINTK(_f, _a...) ((void)0) 28.30 -#endif 28.31 - 28.32 /* This will die as all synchronous stuff is coming to an end */ 28.33 #if 0 28.34 #define complete(_r) panic("completion.h stuff may be needed...") 28.35 @@ -47,8 +31,6 @@ 28.36 #define complete(_r) (*(int *)(_r) = 0) 28.37 #endif 28.38 28.39 - 28.40 - 28.41 /* 28.42 * MAC Floppy IWM hooks 28.43 */
29.1 --- a/xen/drivers/block/xen_block.c Sat Dec 20 23:39:49 2003 +0000 29.2 +++ b/xen/drivers/block/xen_block.c Sat Dec 20 23:41:19 2003 +0000 29.3 @@ -20,12 +20,6 @@ 29.4 #include <xeno/vbd.h> 29.5 #include <xeno/slab.h> 29.6 29.7 -#if 0 29.8 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) 29.9 -#else 29.10 -#define DPRINTK(_f, _a...) ((void)0) 29.11 -#endif 29.12 - 29.13 /* 29.14 * These are rather arbitrary. They are fairly large because adjacent 29.15 * requests pulled from a communication ring are quite likely to end 29.16 @@ -60,15 +54,11 @@ static atomic_t nr_pending; 29.17 29.18 static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned; 29.19 29.20 -static int __buffer_is_valid(struct task_struct *p, 29.21 - unsigned long buffer, 29.22 - unsigned short size, 29.23 - int writeable_buffer); 29.24 -static void __lock_buffer(unsigned long buffer, 29.25 - unsigned short size, 29.26 - int writeable_buffer); 29.27 -static void unlock_buffer(struct task_struct *p, 29.28 - unsigned long buffer, 29.29 +static int lock_buffer(struct task_struct *p, 29.30 + unsigned long buffer, 29.31 + unsigned short size, 29.32 + int writeable_buffer); 29.33 +static void unlock_buffer(unsigned long buffer, 29.34 unsigned short size, 29.35 int writeable_buffer); 29.36 29.37 @@ -185,8 +175,7 @@ static void end_block_io_op_softirq(stru 29.38 { 29.39 pending_req = bh->pending_req; 29.40 29.41 - unlock_buffer(pending_req->domain, 29.42 - virt_to_phys(bh->b_data), 29.43 + unlock_buffer(virt_to_phys(bh->b_data), 29.44 bh->b_size, 29.45 (pending_req->operation==READ)); 29.46 29.47 @@ -321,55 +310,10 @@ long do_block_io_op(block_io_op_t *u_blo 29.48 * DOWNWARD CALLS -- These interface with the block-device layer proper. 29.49 */ 29.50 29.51 -static int __buffer_is_valid(struct task_struct *p, 29.52 - unsigned long buffer, 29.53 - unsigned short size, 29.54 - int writeable_buffer) 29.55 -{ 29.56 - unsigned long pfn; 29.57 - struct pfn_info *page; 29.58 - int rc = 0; 29.59 - 29.60 - /* A request may span multiple page frames. Each must be checked. */ 29.61 - for ( pfn = buffer >> PAGE_SHIFT; 29.62 - pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); 29.63 - pfn++ ) 29.64 - { 29.65 - /* Each frame must be within bounds of machine memory. */ 29.66 - if ( pfn >= max_page ) 29.67 - { 29.68 - DPRINTK("pfn out of range: %08lx\n", pfn); 29.69 - goto out; 29.70 - } 29.71 - 29.72 - page = frame_table + pfn; 29.73 - 29.74 - /* Each frame must belong to the requesting domain. */ 29.75 - if ( (page->flags & PG_domain_mask) != p->domain ) 29.76 - { 29.77 - DPRINTK("bad domain: expected %d, got %ld\n", 29.78 - p->domain, page->flags & PG_domain_mask); 29.79 - goto out; 29.80 - } 29.81 - 29.82 - /* If reading into the frame, the frame must be writeable. 
*/ 29.83 - if ( writeable_buffer && 29.84 - ((page->flags & PG_type_mask) != PGT_writeable_page) && 29.85 - (page_type_count(page) != 0) ) 29.86 - { 29.87 - DPRINTK("non-writeable page passed for block read\n"); 29.88 - goto out; 29.89 - } 29.90 - } 29.91 - 29.92 - rc = 1; 29.93 - out: 29.94 - return rc; 29.95 -} 29.96 - 29.97 -static void __lock_buffer(unsigned long buffer, 29.98 - unsigned short size, 29.99 - int writeable_buffer) 29.100 +static int lock_buffer(struct task_struct *p, 29.101 + unsigned long buffer, 29.102 + unsigned short size, 29.103 + int writeable_buffer) 29.104 { 29.105 unsigned long pfn; 29.106 struct pfn_info *page; 29.107 @@ -378,40 +322,48 @@ static void __lock_buffer(unsigned long 29.108 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); 29.109 pfn++ ) 29.110 { 29.111 - page = frame_table + pfn; 29.112 - if ( writeable_buffer ) 29.113 + if ( unlikely(pfn >= max_page) ) 29.114 + goto fail; 29.115 + 29.116 + page = &frame_table[pfn]; 29.117 + 29.118 + if ( unlikely(!get_page(page, p)) ) 29.119 + goto fail; 29.120 + 29.121 + if ( writeable_buffer && 29.122 + unlikely(!get_page_type(page, PGT_writeable_page)) ) 29.123 { 29.124 - if ( page_type_count(page) == 0 ) 29.125 - { 29.126 - page->flags &= ~PG_type_mask; 29.127 - /* No need for PG_need_flush here. */ 29.128 - page->flags |= PGT_writeable_page; 29.129 - } 29.130 - get_page_type(page); 29.131 + put_page(page); 29.132 + goto fail; 29.133 } 29.134 - get_page_tot(page); 29.135 } 29.136 + 29.137 + return 1; 29.138 + 29.139 + fail: 29.140 + while ( pfn-- > (buffer >> PAGE_SHIFT) ) 29.141 + { 29.142 + if ( writeable_buffer ) 29.143 + put_page_type(&frame_table[pfn]); 29.144 + put_page(&frame_table[pfn]); 29.145 + } 29.146 + return 0; 29.147 } 29.148 29.149 -static void unlock_buffer(struct task_struct *p, 29.150 - unsigned long buffer, 29.151 +static void unlock_buffer(unsigned long buffer, 29.152 unsigned short size, 29.153 int writeable_buffer) 29.154 { 29.155 - unsigned long pfn; 29.156 - struct pfn_info *page; 29.157 + unsigned long pfn; 29.158 29.159 - spin_lock(&p->page_lock); 29.160 for ( pfn = buffer >> PAGE_SHIFT; 29.161 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); 29.162 pfn++ ) 29.163 { 29.164 - page = frame_table + pfn; 29.165 if ( writeable_buffer ) 29.166 - put_page_type(page); 29.167 - put_page_tot(page); 29.168 + put_page_type(&frame_table[pfn]); 29.169 + put_page(&frame_table[pfn]); 29.170 } 29.171 - spin_unlock(&p->page_lock); 29.172 } 29.173 29.174 static int do_block_io_op_domain(struct task_struct *p, int max_to_do) 29.175 @@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct 29.176 int new_segs, nr_psegs = 0; 29.177 phys_seg_t phys_seg[MAX_BLK_SEGS * 2]; 29.178 29.179 - spin_lock(&p->page_lock); 29.180 - 29.181 /* Check that number of segments is sane. 
*/ 29.182 if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) ) 29.183 { 29.184 @@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct 29.185 goto bad_descriptor; 29.186 } 29.187 29.188 - if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) ) 29.189 + if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) ) 29.190 { 29.191 DPRINTK("invalid buffer\n"); 29.192 goto bad_descriptor; 29.193 @@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct 29.194 req->sector_number + tot_sects, 29.195 req->sector_number + tot_sects + nr_sects, 29.196 req->device); 29.197 + unlock_buffer(buffer, nr_sects<<9, (operation==READ)); 29.198 goto bad_descriptor; 29.199 } 29.200 29.201 @@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct 29.202 if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG(); 29.203 } 29.204 29.205 - /* Lock pages associated with each buffer head. */ 29.206 - for ( i = 0; i < nr_psegs; i++ ) 29.207 - __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 29.208 - (operation==READ)); 29.209 - spin_unlock(&p->page_lock); 29.210 - 29.211 atomic_inc(&nr_pending); 29.212 pending_req = pending_reqs + pending_ring[pending_cons]; 29.213 PENDREQ_IDX_INC(pending_cons); 29.214 @@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct 29.215 return; 29.216 29.217 bad_descriptor: 29.218 - spin_unlock(&p->page_lock); 29.219 make_response(p, req->id, req->operation, 1); 29.220 } 29.221 29.222 @@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct 29.223 if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG(); 29.224 p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL); 29.225 clear_page(p->blk_ring_base); 29.226 - SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain); 29.227 + SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p); 29.228 p->blkdev_list.next = NULL; 29.229 spin_lock_init(&p->vbd_lock); 29.230 } 29.231 @@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_str 29.232 { 29.233 ASSERT(!__on_blkdev_list(p)); 29.234 UNSHARE_PFN(virt_to_page(p->blk_ring_base)); 29.235 - free_page((unsigned long)p->blk_ring_base); 29.236 destroy_all_vbds(p); 29.237 } 29.238
30.1 --- a/xen/drivers/block/xen_vbd.c Sat Dec 20 23:39:49 2003 +0000 30.2 +++ b/xen/drivers/block/xen_vbd.c Sat Dec 20 23:41:19 2003 +0000 30.3 @@ -23,13 +23,6 @@ 30.4 extern int ide_probe_devices(xen_disk_info_t *xdi); 30.5 extern int scsi_probe_devices(xen_disk_info_t *xdi); 30.6 30.7 - 30.8 -#if 0 30.9 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) 30.10 -#else 30.11 -#define DPRINTK(_f, _a...) ((void)0) 30.12 -#endif 30.13 - 30.14 /* XXX SMH: crappy 'hash function' .. fix when care. */ 30.15 #define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1)) 30.16 30.17 @@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe) 30.18 if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) ) 30.19 { 30.20 /* Privileged domains always get access to the 'real' devices. */ 30.21 - if ( (ret = ide_probe_devices(&probe->xdi)) != 0 ) 30.22 - { 30.23 - DPRINTK("vbd_probe: error %d in probing ide devices\n", ret); 30.24 + if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) || 30.25 + ((ret = scsi_probe_devices(&probe->xdi)) != 0) ) 30.26 goto out; 30.27 - } 30.28 - if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 ) 30.29 - { 30.30 - DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret); 30.31 - goto out; 30.32 - } 30.33 } 30.34 30.35 if ( probe->domain == VBD_PROBE_ALL ) 30.36 @@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe) 30.37 { 30.38 if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) 30.39 { 30.40 - DPRINTK("vbd_probe: error %d in probing virtual devices\n", 30.41 - ret); 30.42 read_unlock_irqrestore(&tasklist_lock, flags); 30.43 goto out; 30.44 } 30.45 @@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe) 30.46 } 30.47 read_unlock_irqrestore(&tasklist_lock, flags); 30.48 } 30.49 - else 30.50 - { 30.51 - if ( (ret = vbd_probe_devices(&probe->xdi, p)) ) 30.52 - { 30.53 - DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret); 30.54 - goto out; 30.55 - } 30.56 - 30.57 - } 30.58 + else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) 30.59 + goto out; 30.60 30.61 out: 30.62 + if ( ret != 0 ) 30.63 + DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 30.64 if ( p != NULL ) 30.65 put_task_struct(p); 30.66 return ret;
31.1 --- a/xen/drivers/net/e1000/e1000_main.c Sat Dec 20 23:39:49 2003 +0000 31.2 +++ b/xen/drivers/net/e1000/e1000_main.c Sat Dec 20 23:41:19 2003 +0000 31.3 @@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, st 31.4 static void 31.5 e1000_tx_timeout(struct net_device *netdev) 31.6 { 31.7 +#if 0 31.8 struct e1000_adapter *adapter = netdev->priv; 31.9 31.10 /* Do the reset outside of interrupt context */ 31.11 - //schedule_work(&adapter->tx_timeout_task); 31.12 + schedule_work(&adapter->tx_timeout_task); 31.13 +#endif 31.14 e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN 31.15 } 31.16
32.1 --- a/xen/include/asm-i386/atomic.h Sat Dec 20 23:39:49 2003 +0000 32.2 +++ b/xen/include/asm-i386/atomic.h Sat Dec 20 23:41:19 2003 +0000 32.3 @@ -186,15 +186,6 @@ static __inline__ int atomic_add_negativ 32.4 return c; 32.5 } 32.6 32.7 -/* These are x86-specific, used by some header files */ 32.8 -#define atomic_clear_mask(mask, addr) \ 32.9 -__asm__ __volatile__(LOCK "andl %0,%1" \ 32.10 -: : "r" (~(mask)),"m" (*addr) : "memory") 32.11 - 32.12 -#define atomic_set_mask(mask, addr) \ 32.13 -__asm__ __volatile__(LOCK "orl %0,%1" \ 32.14 -: : "r" (mask),"m" (*addr) : "memory") 32.15 - 32.16 /* Atomic operations are already serializing on x86 */ 32.17 #define smp_mb__before_atomic_dec() barrier() 32.18 #define smp_mb__after_atomic_dec() barrier()
33.1 --- a/xen/include/asm-i386/flushtlb.h Sat Dec 20 23:39:49 2003 +0000 33.2 +++ b/xen/include/asm-i386/flushtlb.h Sat Dec 20 23:41:19 2003 +0000 33.3 @@ -1,40 +1,39 @@ 33.4 /****************************************************************************** 33.5 * flushtlb.h 33.6 * 33.7 - * TLB flush macros that count flushes. Counting is used to enforce 33.8 - * zero-copy safety, particularily for the network code. 33.9 - * 33.10 - * akw - Jan 21, 2003 33.11 + * TLB flushes are timestamped using a global virtual 'clock' which ticks 33.12 + * on any TLB flush on any processor. 33.13 + * 33.14 + * Copyright (c) 2003, K A Fraser 33.15 */ 33.16 33.17 -#ifndef __FLUSHTLB_H 33.18 -#define __FLUSHTLB_H 33.19 +#ifndef __FLUSHTLB_H__ 33.20 +#define __FLUSHTLB_H__ 33.21 33.22 #include <xeno/smp.h> 33.23 -#include <asm/atomic.h> 33.24 33.25 -atomic_t tlb_flush_count[NR_CPUS]; 33.26 - 33.27 -#define __write_cr3_counted(__pa) \ 33.28 - do { \ 33.29 - __asm__ __volatile__ ( \ 33.30 - "movl %0, %%cr3;" \ 33.31 - :: "r" (__pa) \ 33.32 - : "memory"); \ 33.33 - atomic_inc(&tlb_flush_count[smp_processor_id()]); \ 33.34 - } while (0) 33.35 +/* 33.36 + * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the 33.37 + * system is guaranteed to have been flushed. 33.38 + */ 33.39 +#define GLOBAL_FLUSH_PERIOD (1<<16) 33.40 33.41 -#define __flush_tlb_counted() \ 33.42 - do { \ 33.43 - unsigned int tmpreg; \ 33.44 - \ 33.45 - __asm__ __volatile__( \ 33.46 - "movl %%cr3, %0; # flush TLB \n" \ 33.47 - "movl %0, %%cr3; " \ 33.48 - : "=r" (tmpreg) \ 33.49 - :: "memory"); \ 33.50 - atomic_inc(&tlb_flush_count[smp_processor_id()]); \ 33.51 - } while (0) 33.52 +/* 33.53 + * '_cpu_stamp' is the current timestamp for the CPU we are testing. 33.54 + * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last 33.55 + * used for a purpose that may have caused the CPU's TLB to become tainted. 33.56 + */ 33.57 +#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \ 33.58 + (((_cpu_stamp) > (_lastuse_stamp)) || \ 33.59 + (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD))) 33.60 33.61 -#endif 33.62 - 33.63 +extern unsigned long tlbflush_mask; 33.64 +extern unsigned long tlbflush_clock; 33.65 +extern unsigned long tlbflush_time[NR_CPUS]; 33.66 + 33.67 +extern void new_tlbflush_clock_period(void); 33.68 + 33.69 +extern void write_cr3_counted(unsigned long pa); 33.70 +extern void flush_tlb_counted(void); 33.71 + 33.72 +#endif /* __FLUSHTLB_H__ */
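The rewritten flushtlb.h replaces the per-CPU flush counters with a global virtual "TLB clock": every flush on any CPU ticks the clock, each CPU records the tick of its last flush, and each frame records the tick at which it was last used in a way that could leave stale TLB entries. The standalone sketch below is an illustration of that idea only, not the hypervisor code, and it ignores the GLOBAL_FLUSH_PERIOD wrap handling; it just shows how the two timestamps decide whether a frame can be reused without forcing a flush.

#include <stdio.h>

#define NR_CPUS 4

static unsigned long tlbflush_clock = 1;           /* global virtual clock */
static unsigned long tlbflush_time[NR_CPUS];       /* tick of each CPU's last flush */

static void flush_tlb_counted(int cpu)
{
    tlbflush_time[cpu] = ++tlbflush_clock;         /* every flush ticks the clock */
}

static int frame_needs_flush(int cpu, unsigned long lastuse_stamp)
{
    /* Stale entries are possible if this CPU has not flushed since the
     * frame was last used in a TLB-tainting way. */
    return tlbflush_time[cpu] <= lastuse_stamp;
}

int main(void)
{
    unsigned long stamp = tlbflush_clock;          /* frame tainted at this tick */
    printf("before flush: need flush = %d\n", frame_needs_flush(0, stamp));
    flush_tlb_counted(0);
    printf("after flush:  need flush = %d\n", frame_needs_flush(0, stamp));
    return 0;
}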
34.1 --- a/xen/include/asm-i386/io.h Sat Dec 20 23:39:49 2003 +0000 34.2 +++ b/xen/include/asm-i386/io.h Sat Dec 20 23:41:19 2003 +0000 34.3 @@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsign 34.4 return __va(address); 34.5 } 34.6 34.7 -/* 34.8 - * Change "struct page" to physical address. 34.9 - */ 34.10 -#define page_to_phys(page) ((page - frame_table) << PAGE_SHIFT) 34.11 +#define page_to_pfn(_page) ((unsigned long)((_page) - frame_table)) 34.12 +#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT) 34.13 +#define page_to_virt(_page) phys_to_virt(page_to_phys(_page)) 34.14 34.15 extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); 34.16
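The new io.h macros make the frame-table arithmetic explicit: a page's pfn is simply its index into frame_table, and its physical address is that index shifted by PAGE_SHIFT. A quick self-contained check of that arithmetic (struct contents are a placeholder, and page_to_virt is omitted since it needs a real phys_to_virt):

#include <stdio.h>

#define PAGE_SHIFT 12

struct pfn_info { int placeholder; };              /* stand-in for the real struct */
static struct pfn_info frame_table[16];

#define page_to_pfn(_page)  ((unsigned long)((_page) - frame_table))
#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT)

int main(void)
{
    struct pfn_info *page = &frame_table[5];
    /* Index 5 in the frame table => pfn 5 => physical address 0x5000. */
    printf("pfn=%lu phys=%#lx\n", page_to_pfn(page), page_to_phys(page));
    return 0;
}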
35.1 --- a/xen/include/asm-i386/page.h Sat Dec 20 23:39:49 2003 +0000 35.2 +++ b/xen/include/asm-i386/page.h Sat Dec 20 23:41:19 2003 +0000 35.3 @@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } 35.4 extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE]; 35.5 extern void paging_init(void); 35.6 35.7 -#define __flush_tlb() __flush_tlb_counted() 35.8 +#define __flush_tlb() flush_tlb_counted() 35.9 35.10 /* Flush global pages as well. */ 35.11 35.12 @@ -111,10 +111,10 @@ extern void paging_init(void); 35.13 } while (0) 35.14 35.15 35.16 -#define __flush_tlb_all() \ 35.17 +#define __flush_tlb_pge() \ 35.18 do { \ 35.19 __pge_off(); \ 35.20 - __flush_tlb_counted(); \ 35.21 + flush_tlb_counted(); \ 35.22 __pge_on(); \ 35.23 } while (0) 35.24
36.1 --- a/xen/include/asm-i386/pgalloc.h Sat Dec 20 23:39:49 2003 +0000 36.2 +++ b/xen/include/asm-i386/pgalloc.h Sat Dec 20 23:41:19 2003 +0000 36.3 @@ -47,28 +47,24 @@ 36.4 36.5 #ifndef CONFIG_SMP 36.6 36.7 -#define flush_tlb() __flush_tlb() 36.8 -#define flush_tlb_all() __flush_tlb_all() 36.9 -#define local_flush_tlb() __flush_tlb() 36.10 -#define flush_tlb_cpu(_cpu) __flush_tlb() 36.11 +#define flush_tlb() __flush_tlb() 36.12 +#define flush_tlb_all() __flush_tlb() 36.13 +#define flush_tlb_all_pge() __flush_tlb_pge() 36.14 +#define local_flush_tlb() __flush_tlb() 36.15 +#define flush_tlb_cpu(_cpu) __flush_tlb() 36.16 +#define flush_tlb_mask(_mask) __flush_tlb() 36.17 36.18 #else 36.19 36.20 #include <xeno/smp.h> 36.21 36.22 -#define flush_tlb() __flush_tlb() 36.23 -#define local_flush_tlb() __flush_tlb() 36.24 - 36.25 -extern void flush_tlb_all(void); 36.26 +extern void flush_tlb_mask(unsigned long mask); 36.27 +extern void flush_tlb_all_pge(void); 36.28 36.29 -extern void flush_tlb_others(unsigned long cpumask); 36.30 -static inline void flush_tlb_cpu(unsigned int cpu) 36.31 -{ 36.32 - if ( cpu == smp_processor_id() ) 36.33 - __flush_tlb(); 36.34 - else 36.35 - flush_tlb_others(1<<cpu); 36.36 -} 36.37 +#define flush_tlb() __flush_tlb() 36.38 +#define flush_tlb_all() flush_tlb_mask((1 << smp_num_cpus) - 1) 36.39 +#define local_flush_tlb() __flush_tlb() 36.40 +#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu)) 36.41 36.42 #endif 36.43
37.1 --- a/xen/include/asm-i386/smp.h Sat Dec 20 23:39:49 2003 +0000 37.2 +++ b/xen/include/asm-i386/smp.h Sat Dec 20 23:41:19 2003 +0000 37.3 @@ -1,15 +1,8 @@ 37.4 #ifndef __ASM_SMP_H 37.5 #define __ASM_SMP_H 37.6 37.7 -#ifndef __ASSEMBLY__ 37.8 #include <xeno/config.h> 37.9 #include <asm/ptrace.h> 37.10 -#include <asm/fixmap.h> 37.11 -#include <asm/bitops.h> 37.12 -#include <asm/mpspec.h> 37.13 -#include <asm/io_apic.h> 37.14 -#include <asm/apic.h> 37.15 -#endif 37.16 37.17 #ifdef CONFIG_SMP 37.18 #define TARGET_CPUS cpu_online_map 37.19 @@ -18,8 +11,6 @@ 37.20 #endif 37.21 37.22 #ifdef CONFIG_SMP 37.23 -#ifndef __ASSEMBLY__ 37.24 - 37.25 /* 37.26 * Private routines/data 37.27 */ 37.28 @@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id); 37.29 37.30 #define smp_processor_id() (current->processor) 37.31 37.32 +#include <asm/fixmap.h> 37.33 +#include <asm/apic.h> 37.34 + 37.35 static __inline int hard_smp_processor_id(void) 37.36 { 37.37 /* we don't want to mark this access volatile - bad code generation */ 37.38 @@ -86,7 +80,5 @@ static __inline int logical_smp_processo 37.39 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 37.40 } 37.41 37.42 -#endif /* !__ASSEMBLY__ */ 37.43 - 37.44 #endif 37.45 #endif
38.1 --- a/xen/include/asm-i386/spinlock.h Sat Dec 20 23:39:49 2003 +0000 38.2 +++ b/xen/include/asm-i386/spinlock.h Sat Dec 20 23:41:19 2003 +0000 38.3 @@ -1,11 +1,10 @@ 38.4 #ifndef __ASM_SPINLOCK_H 38.5 #define __ASM_SPINLOCK_H 38.6 38.7 +#include <xeno/config.h> 38.8 +#include <xeno/lib.h> 38.9 #include <asm/atomic.h> 38.10 #include <asm/rwlock.h> 38.11 -#include <asm/page.h> 38.12 -#include <xeno/config.h> 38.13 -#include <xeno/lib.h> 38.14 38.15 #if 0 38.16 #define SPINLOCK_DEBUG 1
39.1 --- a/xen/include/asm-i386/system.h Sat Dec 20 23:39:49 2003 +0000 39.2 +++ b/xen/include/asm-i386/system.h Sat Dec 20 23:41:19 2003 +0000 39.3 @@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(vo 39.4 #define cmpxchg(ptr,o,n)\ 39.5 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ 39.6 (unsigned long)(n),sizeof(*(ptr)))) 39.7 - 39.8 + 39.9 + 39.10 +/* 39.11 + * This function causes longword _o to be changed to _n at location _p. 39.12 + * If this access causes a fault then we return 1, otherwise we return 0. 39.13 + * If no fault occurs then _o is updated to the value we saw at _p. If this 39.14 + * is the same as the initial value of _o then _n is written to location _p. 39.15 + */ 39.16 +#define cmpxchg_user(_p,_o,_n) \ 39.17 +({ \ 39.18 + int _rc; \ 39.19 + __asm__ __volatile__ ( \ 39.20 + "1: " LOCK_PREFIX "cmpxchgl %2,%3\n" \ 39.21 + "2:\n" \ 39.22 + ".section .fixup,\"ax\"\n" \ 39.23 + "3: movl $1,%1\n" \ 39.24 + " jmp 2b\n" \ 39.25 + ".previous\n" \ 39.26 + ".section __ex_table,\"a\"\n" \ 39.27 + " .align 4\n" \ 39.28 + " .long 1b,3b\n" \ 39.29 + ".previous" \ 39.30 + : "=a" (_o), "=r" (_rc) \ 39.31 + : "q" (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \ 39.32 + : "memory"); \ 39.33 + _rc; \ 39.34 +}) 39.35 + 39.36 /* 39.37 * Force strict CPU ordering. 39.38 * And yes, this is required on UP too when we're talking
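cmpxchg_user() pairs LOCK CMPXCHG with an exception-table fixup so that a faulting guest pointer makes the macro return 1 rather than oops the hypervisor. The fixup cannot be reproduced in plain user space, but the compare-and-exchange contract can be modelled with a GCC builtin; as in the macro, the caller's expected value is updated to whatever was actually seen, and the new value is stored only if they matched. A sketch under those assumptions:

#include <stdio.h>

static int cmpxchg_user_model(unsigned long *p, unsigned long *o, unsigned long n)
{
    /* The real macro returns 1 only if the access faulted; it cannot here. */
    *o = __sync_val_compare_and_swap(p, *o, n);
    return 0;
}

int main(void)
{
    unsigned long word = 5, o = 5, prev = o;

    if ( (cmpxchg_user_model(&word, &o, 6) == 0) && (o == prev) )
        printf("exchanged: word=%lu\n", word);      /* word is now 6 */

    prev = o = 99;                                  /* stale expectation */
    if ( (cmpxchg_user_model(&word, &o, 7) == 0) && (o != prev) )
        printf("lost the race: saw %lu, word stays %lu\n", o, word);

    return 0;
}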
40.1 --- a/xen/include/hypervisor-ifs/dom0_ops.h Sat Dec 20 23:39:49 2003 +0000 40.2 +++ b/xen/include/hypervisor-ifs/dom0_ops.h Sat Dec 20 23:41:19 2003 +0000 40.3 @@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st 40.4 { 40.5 /* IN variables. */ 40.6 unsigned long pfn; /* Machine page frame number to query. */ 40.7 + unsigned int domain; /* To which domain does the frame belong? */ 40.8 /* OUT variables. */ 40.9 - unsigned int domain; /* To which domain does the frame belong? */ 40.10 enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type? */ 40.11 } dom0_getpageframeinfo_t; 40.12
41.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h Sat Dec 20 23:39:49 2003 +0000 41.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h Sat Dec 20 23:41:19 2003 +0000 41.3 @@ -125,9 +125,9 @@ 41.4 * which shifts the least bits out. 41.5 */ 41.6 /* A normal page-table update request. */ 41.7 -#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is VA. */ 41.8 +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ 41.9 /* DOM0 can make entirely unchecked updates which do not affect refcnts. */ 41.10 -#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is VA. */ 41.11 +#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is MA. */ 41.12 /* Update an entry in the machine->physical mapping table. */ 41.13 #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ 41.14 /* An extended command. */
42.1 --- a/xen/include/xeno/config.h Sat Dec 20 23:39:49 2003 +0000 42.2 +++ b/xen/include/xeno/config.h Sat Dec 20 23:41:19 2003 +0000 42.3 @@ -145,6 +145,13 @@ 42.4 42.5 #define capable(_c) 0 42.6 42.7 +#ifndef NDEBUG 42.8 +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ 42.9 + __FILE__, __LINE__, ## _a) 42.10 +#else 42.11 +#define DPRINTK(_f, _a...) ((void)0) 42.12 +#endif 42.13 + 42.14 #ifndef __ASSEMBLY__ 42.15 42.16 #include <xeno/compiler.h>
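The DPRINTK macro added to config.h is a GNU-style named-variadic macro; the '##' pastes away the trailing comma when no extra arguments are supplied. A trivial standalone demo, with printf standing in for printk:

#include <stdio.h>

#ifndef NDEBUG
#define DPRINTK(_f, _a...) printf("(file=%s, line=%d) " _f, \
                                  __FILE__, __LINE__, ## _a)
#else
#define DPRINTK(_f, _a...) ((void)0)
#endif

int main(void)
{
    DPRINTK("no extra arguments\n");        /* '##' removes the dangling comma */
    DPRINTK("bad pfn %08lx\n", 0x12345UL);  /* formatted like the call sites above */
    return 0;
}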
43.1 --- a/xen/include/xeno/mm.h Sat Dec 20 23:39:49 2003 +0000 43.2 +++ b/xen/include/xeno/mm.h Sat Dec 20 23:41:19 2003 +0000 43.3 @@ -3,34 +3,35 @@ 43.4 #define __XENO_MM_H__ 43.5 43.6 #include <xeno/config.h> 43.7 +#include <xeno/list.h> 43.8 +#include <xeno/spinlock.h> 43.9 +#include <xeno/perfc.h> 43.10 +#include <xeno/sched.h> 43.11 + 43.12 +#include <asm/pgalloc.h> 43.13 #include <asm/atomic.h> 43.14 #include <asm/desc.h> 43.15 -#include <xeno/list.h> 43.16 +#include <asm/flushtlb.h> 43.17 +#include <asm/io.h> 43.18 + 43.19 #include <hypervisor-ifs/hypervisor-if.h> 43.20 -#include <xeno/spinlock.h> 43.21 43.22 -/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */ 43.23 +/* 43.24 + * These are for compatibility with calls to the Linux memory allocators. 43.25 + */ 43.26 43.27 -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */ 43.28 #define __GFP_DMA 0x01 43.29 - 43.30 -/* Action modifiers - doesn't change the zoning */ 43.31 +#define GFP_DMA __GFP_DMA 43.32 #define __GFP_WAIT 0x10 /* Can wait and reschedule? */ 43.33 #define __GFP_HIGH 0x20 /* Should access emergency pools? */ 43.34 #define __GFP_IO 0x40 /* Can start low memory physical IO? */ 43.35 #define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ 43.36 #define __GFP_FS 0x100 /* Can call down to low-level FS? */ 43.37 - 43.38 #define GFP_ATOMIC (__GFP_HIGH) 43.39 -#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) 43.40 +#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \ 43.41 + __GFP_HIGHIO | __GFP_FS) 43.42 43.43 -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some 43.44 - platforms, used as appropriate on others */ 43.45 - 43.46 -#define GFP_DMA __GFP_DMA 43.47 - 43.48 - 43.49 -/****************************************************************************** 43.50 +/* 43.51 * The following is for page_alloc.c. 43.52 */ 43.53 43.54 @@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int o 43.55 #define free_page(_p) (__free_pages(_p,0)) 43.56 43.57 43.58 -/****************************************************************************** 43.59 - * The following is the array of page info. One entry per page owned 43.60 - * by the hypervisor, indexed from `mem_map', just like Linux. 43.61 - * 43.62 - * 12.11.02. We no longer use struct page or mem_map, these are replaced 43.63 - * with struct pfn_info and frame_table respectively. Boris Dragovic 43.64 +/* 43.65 + * Per-page-frame information. 43.66 */ 43.67 43.68 -typedef struct pfn_info { 43.69 - struct list_head list; /* ->mapping has some page lists. */ 43.70 - unsigned long flags; /* atomic flags. */ 43.71 - unsigned long tot_count; /* Total domain usage count. */ 43.72 - unsigned long type_count; /* pagetable/dir, or domain-writeable refs. */ 43.73 -} frame_table_t; 43.74 - 43.75 -#define get_page_tot(p) ((p)->tot_count++) 43.76 -#define put_page_tot(p) \ 43.77 - ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; }) 43.78 -#define page_tot_count(p) ((p)->tot_count) 43.79 -#define set_page_tot_count(p,v) ((p)->tot_count = v) 43.80 - 43.81 -#define get_page_type(p) ((p)->type_count++) 43.82 -#define put_page_type(p) \ 43.83 - ({ ASSERT((p)->type_count != 0); --(p)->type_count; }) 43.84 -#define page_type_count(p) ((p)->type_count) 43.85 -#define set_page_type_count(p,v) ((p)->type_count = v) 43.86 +struct pfn_info 43.87 +{ 43.88 + /* Each frame can be threaded onto a doubly-linked list. 
*/ 43.89 + struct list_head list; 43.90 + /* The following possible uses are context-dependent. */ 43.91 + union { 43.92 + /* Page is in use and not a zombie: we keep a pointer to its owner. */ 43.93 + struct task_struct *domain; 43.94 + /* Page is not currently allocated: mask of possibly-tainted TLBs. */ 43.95 + unsigned long cpu_mask; 43.96 + /* Page is a zombie: this word currently has no use. */ 43.97 + unsigned long _unused; 43.98 + } u; 43.99 + /* Reference count and various PGC_xxx flags and fields. */ 43.100 + unsigned long count_and_flags; 43.101 + /* Type reference count and various PGT_xxx flags and fields. */ 43.102 + unsigned long type_and_flags; 43.103 + /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ 43.104 + unsigned long tlbflush_timestamp; 43.105 +}; 43.106 43.107 -#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */ 43.108 -/* hypervisor flags (domain == 0) */ 43.109 -#define PG_slab 24 43.110 -/* domain flags (domain != 0) */ 43.111 -/* 43.112 - * NB. The following page types are MUTUALLY EXCLUSIVE. 43.113 - * At most one can be true at any point, and 'type_count' counts how many 43.114 - * references exist of the current type. A change in type can only occur 43.115 - * when type_count == 0. 43.116 - */ 43.117 -#define PG_type_mask (15<<24) /* bits 24-27 */ 43.118 -#define PGT_none (0<<24) /* no special uses of this page */ 43.119 -#define PGT_l1_page_table (1<<24) /* using this page as an L1 page table? */ 43.120 -#define PGT_l2_page_table (2<<24) /* using this page as an L2 page table? */ 43.121 -#define PGT_l3_page_table (3<<24) /* using this page as an L3 page table? */ 43.122 -#define PGT_l4_page_table (4<<24) /* using this page as an L4 page table? */ 43.123 -#define PGT_gdt_page (5<<24) /* using this page in a GDT? */ 43.124 -#define PGT_ldt_page (6<<24) /* using this page in an LDT? */ 43.125 -#define PGT_writeable_page (7<<24) /* has writable mappings of this page? */ 43.126 + /* The following page types are MUTUALLY EXCLUSIVE. */ 43.127 +#define PGT_none (0<<29) /* no special uses of this page */ 43.128 +#define PGT_l1_page_table (1<<29) /* using this page as an L1 page table? */ 43.129 +#define PGT_l2_page_table (2<<29) /* using this page as an L2 page table? */ 43.130 +#define PGT_l3_page_table (3<<29) /* using this page as an L3 page table? */ 43.131 +#define PGT_l4_page_table (4<<29) /* using this page as an L4 page table? */ 43.132 +#define PGT_gdt_page (5<<29) /* using this page in a GDT? */ 43.133 +#define PGT_ldt_page (6<<29) /* using this page in an LDT? */ 43.134 +#define PGT_writeable_page (7<<29) /* has writable mappings of this page? */ 43.135 +#define PGT_type_mask (7<<29) /* Bits 29-31. */ 43.136 + /* Has this page been validated for use as its current type? */ 43.137 +#define _PGT_validated 28 43.138 +#define PGT_validated (1<<_PGT_validated) 43.139 + /* 28-bit count of uses of this frame as its current type. */ 43.140 +#define PGT_count_mask ((1<<28)-1) 43.141 43.142 -/* 43.143 - * This bit indicates that the TLB must be flushed when the type count of this 43.144 - * frame drops to zero. This is needed on current x86 processors only for 43.145 - * frames which have guestos-accessible writeable mappings. In this case we 43.146 - * must prevent stale TLB entries allowing the frame to be written if it used 43.147 - * for a page table, for example. 43.148 - * 43.149 - * We have this bit because the writeable type is actually also used to pin a 43.150 - * page when it is used as a disk read buffer. 
This doesn't require a TLB flush 43.151 - * because the frame never has a mapping in the TLB. 43.152 - */ 43.153 -#define PG_need_flush (1<<28) 43.154 + /* The owner of this page is dead: 'u.domain' is no longer valid. */ 43.155 +#define _PGC_zombie 31 43.156 +#define PGC_zombie (1<<_PGC_zombie) 43.157 + /* For safety, force a TLB flush when this page's type changes. */ 43.158 +#define _PGC_tlb_flush_on_type_change 30 43.159 +#define PGC_tlb_flush_on_type_change (1<<_PGC_tlb_flush_on_type_change) 43.160 + /* Owning guest has pinned this page to its current type? */ 43.161 +#define _PGC_guest_pinned 29 43.162 +#define PGC_guest_pinned (1<<_PGC_guest_pinned) 43.163 + /* Cleared when the owning guest 'frees' this page. */ 43.164 +#define _PGC_allocated 28 43.165 +#define PGC_allocated (1<<_PGC_allocated) 43.166 + /* 28-bit count of references to this frame. */ 43.167 +#define PGC_count_mask ((1<<28)-1) 43.168 43.169 -/* 43.170 - * This bit indicates that the guest OS has pinned the page to its current 43.171 - * type. For page tables this can avoid the frame scanning and reference-count 43.172 - * updates that occur when the type count falls to zero. 43.173 - */ 43.174 -#define PG_guest_pinned (1<<29) 43.175 +/* We trust the slab allocator in slab.c, and our use of it. */ 43.176 +#define PageSlab(page) (1) 43.177 +#define PageSetSlab(page) ((void)0) 43.178 +#define PageClearSlab(page) ((void)0) 43.179 + 43.180 +#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS) 43.181 43.182 -#define PageSlab(page) test_bit(PG_slab, &(page)->flags) 43.183 -#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) 43.184 -#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) 43.185 - 43.186 -#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ 43.187 - do { \ 43.188 - (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \ 43.189 - set_page_tot_count((_pfn), 2); \ 43.190 - set_page_type_count((_pfn), 2); \ 43.191 +#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ 43.192 + do { \ 43.193 + (_pfn)->u.domain = (_dom); \ 43.194 + wmb(); /* install valid domain ptr before updating refcnt. 
*/ \ 43.195 + (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \ 43.196 + (_pfn)->type_and_flags = PGT_writeable_page | PGT_validated | 1; \ 43.197 } while ( 0 ) 43.198 43.199 -#define UNSHARE_PFN(_pfn) \ 43.200 - do { \ 43.201 - (_pfn)->flags = 0; \ 43.202 - set_page_tot_count((_pfn), 0); \ 43.203 - set_page_type_count((_pfn), 0); \ 43.204 - } while ( 0 ) 43.205 +#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn) 43.206 43.207 -/* The array of struct pfn_info, 43.208 - * free pfn list and number of free pfns in the free list 43.209 - */ 43.210 -extern frame_table_t * frame_table; 43.211 +extern struct pfn_info *frame_table; 43.212 extern unsigned long frame_table_size; 43.213 extern struct list_head free_list; 43.214 extern spinlock_t free_list_lock; 43.215 @@ -140,6 +126,180 @@ extern unsigned int free_pfns; 43.216 extern unsigned long max_page; 43.217 void init_frametable(unsigned long nr_pages); 43.218 43.219 +struct pfn_info *alloc_domain_page(struct task_struct *p); 43.220 +void free_domain_page(struct pfn_info *page); 43.221 + 43.222 +int alloc_page_type(struct pfn_info *page, unsigned int type); 43.223 +void free_page_type(struct pfn_info *page, unsigned int type); 43.224 + 43.225 +static inline void put_page(struct pfn_info *page) 43.226 +{ 43.227 + unsigned long nx, x, y = page->count_and_flags; 43.228 + 43.229 + do { 43.230 + x = y; 43.231 + nx = x - 1; 43.232 + } 43.233 + while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); 43.234 + 43.235 + if ( unlikely((nx & PGC_count_mask) == 0) ) 43.236 + free_domain_page(page); 43.237 +} 43.238 + 43.239 + 43.240 +static inline int get_page(struct pfn_info *page, 43.241 + struct task_struct *domain) 43.242 +{ 43.243 + unsigned long x, nx, y = page->count_and_flags; 43.244 + struct task_struct *p, *np = page->u.domain; 43.245 + 43.246 + do { 43.247 + x = y; 43.248 + nx = x + 1; 43.249 + p = np; 43.250 + if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ 43.251 + unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ 43.252 + unlikely(x & PGC_zombie) || /* Zombie? */ 43.253 + unlikely(p != domain) ) /* Wrong owner? */ 43.254 + { 43.255 + DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n", 43.256 + page_to_pfn(page), domain, p, x); 43.257 + return 0; 43.258 + } 43.259 + __asm__ __volatile__( 43.260 + LOCK_PREFIX "cmpxchg8b %3" 43.261 + : "=a" (np), "=d" (y), "=b" (p), 43.262 + "=m" (*(volatile unsigned long long *)(&page->u.domain)) 43.263 + : "0" (p), "1" (x), "b" (p), "c" (nx) ); 43.264 + } 43.265 + while ( unlikely(np != p) || unlikely(y != x) ); 43.266 + 43.267 + return 1; 43.268 +} 43.269 + 43.270 + 43.271 +static inline void put_page_type(struct pfn_info *page) 43.272 +{ 43.273 + unsigned long nx, x, y = page->type_and_flags; 43.274 + 43.275 + again: 43.276 + do { 43.277 + x = y; 43.278 + nx = x - 1; 43.279 + if ( unlikely((nx & PGT_count_mask) == 0) ) 43.280 + { 43.281 + page->tlbflush_timestamp = tlbflush_clock; 43.282 + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 43.283 + likely(nx & PGT_validated) ) 43.284 + { 43.285 + /* 43.286 + * Page-table pages must be unvalidated when count is zero. The 43.287 + * 'free' is safe because the refcnt is non-zero and the 43.288 + * validated bit is clear => other ops will spin or fail. 43.289 + */ 43.290 + if ( unlikely((y = cmpxchg(&page->type_and_flags, x, 43.291 + x & ~PGT_validated)) != x) ) 43.292 + goto again; 43.293 + /* We cleared the 'valid bit' so we must do the clear up. 
*/ 43.294 + free_page_type(page, x & PGT_type_mask); 43.295 + /* Carry on as we were, but with the 'valid bit' now clear. */ 43.296 + x &= ~PGT_validated; 43.297 + nx &= ~PGT_validated; 43.298 + } 43.299 + } 43.300 + } 43.301 + while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) ); 43.302 +} 43.303 + 43.304 + 43.305 +static inline int get_page_type(struct pfn_info *page, unsigned long type) 43.306 +{ 43.307 + unsigned long nx, x, y = page->type_and_flags; 43.308 + again: 43.309 + do { 43.310 + x = y; 43.311 + nx = x + 1; 43.312 + if ( unlikely((nx & PGT_count_mask) == 0) ) 43.313 + { 43.314 + DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); 43.315 + return 0; 43.316 + } 43.317 + else if ( unlikely((x & PGT_count_mask) == 0) ) 43.318 + { 43.319 + if ( (x & PGT_type_mask) != type ) 43.320 + { 43.321 + nx &= ~(PGT_type_mask | PGT_validated); 43.322 + nx |= type; 43.323 + /* No extra validation needed for writeable pages. */ 43.324 + if ( type == PGT_writeable_page ) 43.325 + nx |= PGT_validated; 43.326 + } 43.327 + } 43.328 + else if ( unlikely((x & PGT_type_mask) != type) ) 43.329 + { 43.330 + DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n", 43.331 + x & PGT_type_mask, type, page_to_pfn(page)); 43.332 + return 0; 43.333 + } 43.334 + else if ( unlikely(!(x & PGT_validated)) ) 43.335 + { 43.336 + /* Someone else is updating validation of this page. Wait... */ 43.337 + while ( (y = page->type_and_flags) != x ) 43.338 + { 43.339 + rep_nop(); 43.340 + barrier(); 43.341 + } 43.342 + goto again; 43.343 + } 43.344 + } 43.345 + while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) ); 43.346 + 43.347 + if ( unlikely(!(nx & PGT_validated)) ) 43.348 + { 43.349 + /* Try to validate page type; drop the new reference on failure. */ 43.350 + if ( unlikely(!alloc_page_type(page, type)) ) 43.351 + { 43.352 + DPRINTK("Error while validating pfn %08lx for type %08lx\n", 43.353 + page_to_pfn(page), type); 43.354 + put_page_type(page); 43.355 + return 0; 43.356 + } 43.357 + set_bit(_PGT_validated, &page->type_and_flags); 43.358 + } 43.359 + 43.360 + return 1; 43.361 +} 43.362 + 43.363 + 43.364 +static inline void put_page_and_type(struct pfn_info *page) 43.365 +{ 43.366 + put_page_type(page); 43.367 + put_page(page); 43.368 +} 43.369 + 43.370 + 43.371 +static inline int get_page_and_type(struct pfn_info *page, 43.372 + struct task_struct *domain, 43.373 + unsigned int type) 43.374 +{ 43.375 + int rc = get_page(page, domain); 43.376 + 43.377 + if ( likely(rc) && unlikely(!get_page_type(page, type)) ) 43.378 + { 43.379 + put_page(page); 43.380 + rc = 0; 43.381 + } 43.382 + 43.383 + return rc; 43.384 +} 43.385 + 43.386 +#define ASSERT_PAGE_IS_TYPE(_p, _t) \ 43.387 + ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t)); \ 43.388 + ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0) 43.389 +#define ASSERT_PAGE_IS_DOMAIN(_p, _d) \ 43.390 + ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0); \ 43.391 + ASSERT((_p)->u.domain == (_d)) 43.392 + 43.393 int check_descriptor(unsigned long a, unsigned long b); 43.394 43.395 /*
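The new mm.h packs a 28-bit reference count plus the PGC_xxx flags into a single word (and likewise a 28-bit type count plus the PGT_xxx flags), updated with cmpxchg retry loops so that no per-domain page_lock is needed. The user-space sketch below models only the count_and_flags half of that scheme, with a GCC builtin standing in for cmpxchg; it deliberately omits the owner check (the cmpxchg8b in get_page()) and the real free path.

#include <stdio.h>

#define PGC_count_mask ((1UL << 28) - 1)            /* low 28 bits: reference count */
#define PGC_allocated  (1UL << 28)                  /* one of the flag bits above */

static unsigned long count_and_flags = PGC_allocated | 1;   /* allocated, one ref */

static int get_ref(void)
{
    unsigned long x, nx, y = count_and_flags;
    do {
        x  = y;
        nx = x + 1;
        if ( ((x & PGC_count_mask) == 0) ||         /* not allocated? */
             ((nx & PGC_count_mask) == 0) )         /* count overflow? */
            return 0;
    } while ( (y = __sync_val_compare_and_swap(&count_and_flags, x, nx)) != x );
    return 1;
}

static void put_ref(void)
{
    unsigned long x, nx, y = count_and_flags;
    do {
        x  = y;
        nx = x - 1;
    } while ( (y = __sync_val_compare_and_swap(&count_and_flags, x, nx)) != x );
    if ( (nx & PGC_count_mask) == 0 )
        printf("last reference gone -> the real code would free the page\n");
}

int main(void)
{
    if ( get_ref() )
        put_ref();
    put_ref();                                      /* drop the initial reference */
    return 0;
}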
44.1 --- a/xen/include/xeno/perfc.h Sat Dec 20 23:39:49 2003 +0000 44.2 +++ b/xen/include/xeno/perfc.h Sat Dec 20 23:41:19 2003 +0000 44.3 @@ -1,6 +1,6 @@ 44.4 -/* 44.5 - * xen performance counters 44.6 - */ 44.7 + 44.8 +#ifndef __XENO_PERFC_H__ 44.9 +#define __XENO_PERFC_H__ 44.10 44.11 #include <asm/atomic.h> 44.12 44.13 @@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters 44.14 #define perfc_addc(x,y) atomic_add((y), &perfcounters.x[smp_processor_id()]) 44.15 #define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y]) 44.16 44.17 +#endif /* __XENO_PERFC_H__ */
45.1 --- a/xen/include/xeno/perfc_defn.h Sat Dec 20 23:39:49 2003 +0000 45.2 +++ b/xen/include/xeno/perfc_defn.h Sat Dec 20 23:41:19 2003 +0000 45.3 @@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hy 45.4 PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" ) 45.5 PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" ) 45.6 PERFCOUNTER( net_rx_delivered, "net rx delivered" ) 45.7 -PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" ) 45.8 PERFCOUNTER( net_tx_transmitted, "net tx transmitted" ) 45.9 45.10 PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" )
46.1 --- a/xen/include/xeno/sched.h Sat Dec 20 23:39:49 2003 +0000 46.2 +++ b/xen/include/xeno/sched.h Sat Dec 20 23:41:19 2003 +0000 46.3 @@ -4,7 +4,6 @@ 46.4 #include <xeno/config.h> 46.5 #include <xeno/types.h> 46.6 #include <xeno/spinlock.h> 46.7 -#include <asm/page.h> 46.8 #include <asm/ptrace.h> 46.9 #include <xeno/smp.h> 46.10 #include <asm/processor.h> 46.11 @@ -16,7 +15,6 @@ 46.12 #include <xeno/time.h> 46.13 #include <xeno/ac_timer.h> 46.14 #include <xeno/delay.h> 46.15 -#include <xeno/slab.h> 46.16 46.17 #define MAX_DOMAIN_NAME 16 46.18 46.19 @@ -94,9 +92,10 @@ struct task_struct 46.20 46.21 unsigned int domain; /* domain id */ 46.22 46.23 - struct list_head pg_head; 46.24 - unsigned int tot_pages; /* number of pages currently possesed */ 46.25 - unsigned int max_pages; /* max number of pages that can be possesed */ 46.26 + spinlock_t page_list_lock; 46.27 + struct list_head page_list; 46.28 + unsigned int tot_pages; /* number of pages currently possesed */ 46.29 + unsigned int max_pages; /* max number of pages that can be possesed */ 46.30 46.31 /* scheduling */ 46.32 struct list_head run_list; 46.33 @@ -132,8 +131,6 @@ struct task_struct 46.34 46.35 /* VM */ 46.36 struct mm_struct mm; 46.37 - /* We need this lock to check page types and frob reference counts. */ 46.38 - spinlock_t page_lock; 46.39 46.40 mm_segment_t addr_limit; 46.41 46.42 @@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_ 46.43 46.44 #define STACK_SIZE PAGE_SIZE 46.45 46.46 +#include <xeno/slab.h> 46.47 + 46.48 extern kmem_cache_t *task_struct_cachep; 46.49 #define alloc_task_struct() \ 46.50 ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
47.1 --- a/xen/include/xeno/vif.h Sat Dec 20 23:39:49 2003 +0000 47.2 +++ b/xen/include/xeno/vif.h Sat Dec 20 23:41:19 2003 +0000 47.3 @@ -34,7 +34,7 @@ extern struct net_device *the_dev; 47.4 typedef struct rx_shadow_entry_st 47.5 { 47.6 unsigned short id; 47.7 - unsigned short flush_count; /* 16 bits should be enough */ 47.8 + unsigned short _pad; 47.9 unsigned long pte_ptr; 47.10 unsigned long buf_pfn; 47.11 } rx_shadow_entry_t;
48.1 --- a/xen/net/dev.c Sat Dec 20 23:39:49 2003 +0000 48.2 +++ b/xen/net/dev.c Sat Dec 20 23:41:19 2003 +0000 48.3 @@ -39,12 +39,6 @@ 48.4 #define rtnl_lock() ((void)0) 48.5 #define rtnl_unlock() ((void)0) 48.6 48.7 -#if 0 48.8 -#define DPRINTK(_f, _a...) printk(_f , ## _a) 48.9 -#else 48.10 -#define DPRINTK(_f, _a...) ((void)0) 48.11 -#endif 48.12 - 48.13 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1)) 48.14 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1)) 48.15 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1)) 48.16 @@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[N 48.17 48.18 static int get_tx_bufs(net_vif_t *vif); 48.19 48.20 -static void __make_tx_response(net_vif_t *vif, 48.21 - unsigned short id, 48.22 - unsigned char st); 48.23 +static void make_tx_response(net_vif_t *vif, 48.24 + unsigned short id, 48.25 + unsigned char st); 48.26 static void make_rx_response(net_vif_t *vif, 48.27 unsigned short id, 48.28 unsigned short size, 48.29 @@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_ 48.30 void deliver_packet(struct sk_buff *skb, net_vif_t *vif) 48.31 { 48.32 rx_shadow_entry_t *rx; 48.33 - unsigned long *ptep; 48.34 + unsigned long *ptep, pte; 48.35 struct pfn_info *old_page, *new_page, *pte_page; 48.36 unsigned int i; 48.37 unsigned short size; 48.38 unsigned char offset, status = RING_STATUS_OK; 48.39 + struct task_struct *p = vif->domain; 48.40 48.41 memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN); 48.42 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) 48.43 memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN); 48.44 48.45 - /* 48.46 - * Slightly gross: we need the page_lock so that we can do PTE checking. 48.47 - * However, we take it slightly early so that it can protect the update 48.48 - * of rx_cons. This saves us from grabbing two locks. 48.49 - */ 48.50 - spin_lock(&vif->domain->page_lock); 48.51 + spin_lock(&vif->rx_lock); 48.52 48.53 if ( (i = vif->rx_cons) == vif->rx_prod ) 48.54 { 48.55 - spin_unlock(&vif->domain->page_lock); 48.56 + spin_unlock(&vif->rx_lock); 48.57 perfc_incr(net_rx_capacity_drop); 48.58 return; 48.59 } 48.60 - rx = vif->rx_shadow_ring + i; 48.61 + rx = &vif->rx_shadow_ring[i]; 48.62 vif->rx_cons = RX_RING_INC(i); 48.63 48.64 size = (unsigned short)skb->len; 48.65 offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK); 48.66 48.67 - /* Release the page-table page. */ 48.68 - pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT); 48.69 - put_page_type(pte_page); 48.70 - put_page_tot(pte_page); 48.71 - 48.72 - old_page = frame_table + rx->buf_pfn; 48.73 + pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT]; 48.74 + old_page = &frame_table[rx->buf_pfn]; 48.75 new_page = skb->pf; 48.76 48.77 ptep = map_domain_mem(rx->pte_ptr); 48.78 48.79 - if ( (*ptep & _PAGE_PRESENT) ) 48.80 + new_page->u.domain = p; 48.81 + wmb(); /* make dom ptr visible before updating refcnt. */ 48.82 + spin_lock(&p->page_list_lock); 48.83 + list_add(&new_page->list, &p->page_list); 48.84 + new_page->count_and_flags = PGC_allocated | 2; 48.85 + spin_unlock(&p->page_list_lock); 48.86 + get_page_type(new_page, PGT_writeable_page); 48.87 + set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags); 48.88 + wmb(); /* Get type count and set flush bit before updating PTE. 
*/ 48.89 + 48.90 + pte = *ptep; 48.91 + if ( unlikely(pte & _PAGE_PRESENT) || 48.92 + unlikely(cmpxchg(ptep, pte, 48.93 + (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | 48.94 + ((new_page - frame_table) << PAGE_SHIFT))) != pte ) 48.95 { 48.96 - /* Bail out if the PTE has been reused under our feet. */ 48.97 - list_add(&old_page->list, &vif->domain->pg_head); 48.98 - old_page->flags = vif->domain->domain; 48.99 unmap_domain_mem(ptep); 48.100 - spin_unlock(&vif->domain->page_lock); 48.101 status = RING_STATUS_BAD_PAGE; 48.102 goto out; 48.103 } 48.104 48.105 - /* Give the new page to the domain, marking it writeable. */ 48.106 - set_page_type_count(new_page, 1); 48.107 - set_page_tot_count(new_page, 1); 48.108 - new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush; 48.109 - list_add(&new_page->list, &vif->domain->pg_head); 48.110 - 48.111 - /* Patch the PTE to map the new page as writeable. */ 48.112 machine_to_phys_mapping[new_page - frame_table] 48.113 - = machine_to_phys_mapping[old_page - frame_table]; 48.114 - *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | 48.115 - (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK); 48.116 + = machine_to_phys_mapping[old_page - frame_table]; 48.117 48.118 unmap_domain_mem(ptep); 48.119 48.120 - spin_unlock(&vif->domain->page_lock); 48.121 - 48.122 /* Our skbuff now points at the guest's old frame. */ 48.123 skb->pf = old_page; 48.124 48.125 /* Updates must happen before releasing the descriptor. */ 48.126 smp_wmb(); 48.127 48.128 - /* 48.129 - * NB. The remote flush here should be safe, as we hold no locks. The 48.130 - * network driver that called us should also have no nasty locks. 48.131 - */ 48.132 - if ( rx->flush_count == (unsigned short) 48.133 - atomic_read(&tlb_flush_count[vif->domain->processor]) ) 48.134 - { 48.135 - perfc_incr(net_rx_tlbflush); 48.136 - flush_tlb_cpu(vif->domain->processor); 48.137 - } 48.138 - 48.139 perfc_incr(net_rx_delivered); 48.140 48.141 /* record this so they can be billed */ 48.142 @@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb, 48.143 vif->total_bytes_received += size; 48.144 48.145 out: 48.146 + put_page_and_type(pte_page); 48.147 make_rx_response(vif, rx->id, size, status, offset); 48.148 + spin_unlock(&vif->rx_lock); 48.149 } 48.150 48.151 /** 48.152 @@ -785,8 +761,8 @@ static void net_tx_action(unsigned long 48.153 skb->mac.raw = skb->data; 48.154 skb->guest_id = tx->id; 48.155 48.156 - skb_shinfo(skb)->frags[0].page = frame_table + 48.157 - (tx->payload >> PAGE_SHIFT); 48.158 + skb_shinfo(skb)->frags[0].page = 48.159 + &frame_table[tx->payload >> PAGE_SHIFT]; 48.160 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN; 48.161 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK; 48.162 skb_shinfo(skb)->nr_frags = 1; 48.163 @@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buf 48.164 48.165 vif = skb->src_vif; 48.166 48.167 - spin_lock(&vif->domain->page_lock); 48.168 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ ) 48.169 - put_page_tot(skb_shinfo(skb)->frags[i].page); 48.170 - spin_unlock(&vif->domain->page_lock); 48.171 + put_page(skb_shinfo(skb)->frags[i].page); 48.172 48.173 if ( skb->skb_type == SKB_NODATA ) 48.174 kmem_cache_free(net_header_cachep, skb->head); 48.175 @@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buf 48.176 skb_shinfo(skb)->nr_frags = 0; 48.177 48.178 spin_lock(&vif->tx_lock); 48.179 - __make_tx_response(vif, skb->guest_id, RING_STATUS_OK); 48.180 + make_tx_response(vif, skb->guest_id, RING_STATUS_OK); 
48.181 spin_unlock(&vif->tx_lock); 48.182 48.183 /* 48.184 @@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif) 48.185 if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) ) 48.186 { 48.187 DPRINTK("Bad packet size: %d\n", tx.size); 48.188 - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.189 + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.190 continue; 48.191 } 48.192 48.193 @@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif) 48.194 vif->remaining_credit -= tx.size; 48.195 48.196 /* No crossing a page boundary as the payload mustn't fragment. */ 48.197 - if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 48.198 + if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) ) 48.199 { 48.200 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 48.201 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size); 48.202 - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.203 + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.204 continue; 48.205 } 48.206 48.207 buf_pfn = tx.addr >> PAGE_SHIFT; 48.208 buf_page = frame_table + buf_pfn; 48.209 - spin_lock(&p->page_lock); 48.210 - if ( (buf_pfn >= max_page) || 48.211 - ((buf_page->flags & PG_domain_mask) != p->domain) ) 48.212 + if ( unlikely(buf_pfn >= max_page) || 48.213 + unlikely(!get_page(buf_page, p)) ) 48.214 { 48.215 DPRINTK("Bad page frame\n"); 48.216 - spin_unlock(&p->page_lock); 48.217 - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.218 + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.219 continue; 48.220 } 48.221 48.222 @@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif) 48.223 init_tx_header(vif, g_data, tx.size, the_dev)); 48.224 if ( protocol == 0 ) 48.225 { 48.226 - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.227 - goto tx_unmap_and_continue; 48.228 + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.229 + goto cleanup_and_continue; 48.230 } 48.231 48.232 target = net_get_target_vif(g_data, tx.size, vif); 48.233 @@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif) 48.234 /* Local delivery */ 48.235 if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL ) 48.236 { 48.237 - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.238 + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); 48.239 put_vif(target); 48.240 - goto tx_unmap_and_continue; 48.241 + goto cleanup_and_continue; 48.242 } 48.243 48.244 skb->src_vif = vif; 48.245 @@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif) 48.246 if ( netif_rx(skb) == NET_RX_DROP ) 48.247 kfree_skb(skb); 48.248 48.249 - __make_tx_response(vif, tx.id, RING_STATUS_OK); 48.250 + make_tx_response(vif, tx.id, RING_STATUS_OK); 48.251 } 48.252 else if ( (target == VIF_PHYS) || IS_PRIV(p) ) 48.253 { 48.254 @@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif) 48.255 kmem_cache_alloc(net_header_cachep, GFP_KERNEL); 48.256 if ( vif->tx_shadow_ring[j].header == NULL ) 48.257 { 48.258 - __make_tx_response(vif, tx.id, RING_STATUS_OK); 48.259 - goto tx_unmap_and_continue; 48.260 + make_tx_response(vif, tx.id, RING_STATUS_OK); 48.261 + goto cleanup_and_continue; 48.262 } 48.263 48.264 memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN); 48.265 vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN; 48.266 - get_page_tot(buf_page); 48.267 + buf_page = NULL; /* hand off our page reference */ 48.268 j = TX_RING_INC(j); 48.269 } 48.270 else 48.271 { 48.272 - __make_tx_response(vif, tx.id, RING_STATUS_DROPPED); 48.273 + make_tx_response(vif, tx.id, RING_STATUS_DROPPED); 48.274 } 
48.275 48.276 - tx_unmap_and_continue: 48.277 + cleanup_and_continue: 48.278 + if ( buf_page != NULL ) 48.279 + put_page(buf_page); 48.280 unmap_domain_mem(g_data); 48.281 - spin_unlock(&p->page_lock); 48.282 } 48.283 48.284 /* 48.285 @@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif) 48.286 } 48.287 48.288 48.289 -static long get_bufs_from_vif(net_vif_t *vif) 48.290 +static void get_rx_bufs(net_vif_t *vif) 48.291 { 48.292 - net_ring_t *shared_rings; 48.293 - net_idx_t *shared_idxs; 48.294 + struct task_struct *p = vif->domain; 48.295 + net_ring_t *shared_rings = vif->shared_rings; 48.296 + net_idx_t *shared_idxs = vif->shared_idxs; 48.297 unsigned int i, j; 48.298 rx_req_entry_t rx; 48.299 unsigned long pte_pfn, buf_pfn; 48.300 struct pfn_info *pte_page, *buf_page; 48.301 - struct task_struct *p = vif->domain; 48.302 - unsigned long *ptep; 48.303 - 48.304 - shared_idxs = vif->shared_idxs; 48.305 - shared_rings = vif->shared_rings; 48.306 - 48.307 - /* 48.308 - * PHASE 1 -- TRANSMIT RING 48.309 - */ 48.310 + unsigned long *ptep, pte; 48.311 48.312 - if ( get_tx_bufs(vif) ) 48.313 - { 48.314 - add_to_net_schedule_list_tail(vif); 48.315 - maybe_schedule_tx_action(); 48.316 - } 48.317 - 48.318 - /* 48.319 - * PHASE 2 -- RECEIVE RING 48.320 - */ 48.321 + spin_lock(&vif->rx_lock); 48.322 48.323 /* 48.324 * Collect up new receive buffers. We collect up to the guest OS's new 48.325 @@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t 48.326 { 48.327 rx = shared_rings->rx_ring[i].req; 48.328 48.329 - pte_pfn = rx.addr >> PAGE_SHIFT; 48.330 - pte_page = frame_table + pte_pfn; 48.331 + pte_pfn = rx.addr >> PAGE_SHIFT; 48.332 + pte_page = &frame_table[pte_pfn]; 48.333 48.334 - spin_lock(&p->page_lock); 48.335 - if ( (pte_pfn >= max_page) || 48.336 - ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != 48.337 - (PGT_l1_page_table | p->domain)) ) 48.338 + /* The address passed down must be to a valid PTE. */ 48.339 + if ( unlikely(pte_pfn >= max_page) || 48.340 + unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) ) 48.341 { 48.342 DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n", 48.343 - p->domain, pte_pfn, max_page, pte_page->flags); 48.344 - spin_unlock(&p->page_lock); 48.345 + p->domain, pte_pfn, max_page, pte_page->type_and_flags); 48.346 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); 48.347 continue; 48.348 } 48.349 - 48.350 + 48.351 ptep = map_domain_mem(rx.addr); 48.352 - 48.353 - if ( !(*ptep & _PAGE_PRESENT) ) 48.354 + pte = *ptep; 48.355 + 48.356 + /* We must be passed a valid writeable mapping to swizzle. */ 48.357 + if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != 48.358 + (_PAGE_PRESENT|_PAGE_RW)) || 48.359 + unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) ) 48.360 { 48.361 - DPRINTK("Invalid PTE passed down (not present)\n"); 48.362 + DPRINTK("Invalid PTE passed down (not present or changing)\n"); 48.363 + put_page_and_type(pte_page); 48.364 + make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); 48.365 + goto rx_unmap_and_continue; 48.366 + } 48.367 + 48.368 + buf_pfn = pte >> PAGE_SHIFT; 48.369 + buf_page = &frame_table[buf_pfn]; 48.370 + 48.371 + /* 48.372 + * The page must belong to the correct domain, and must be mapped 48.373 + * just once as a writeable page. 
48.374 + */ 48.375 + if ( unlikely(buf_page->u.domain != p) || 48.376 + unlikely(!test_and_clear_bit(_PGC_allocated, 48.377 + &buf_page->count_and_flags)) || 48.378 + unlikely(cmpxchg(&buf_page->type_and_flags, 48.379 + PGT_writeable_page|PGT_validated|1, 48.380 + 0) != (PGT_writeable_page|PGT_validated|1)) ) 48.381 + { 48.382 + DPRINTK("Bad domain or page mapped writeable more than once.\n"); 48.383 + if ( buf_page->u.domain == p ) 48.384 + set_bit(_PGC_allocated, &buf_page->count_and_flags); 48.385 + if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) != 48.386 + (pte & ~_PAGE_PRESENT)) ) 48.387 + put_page_and_type(buf_page); 48.388 + make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); 48.389 + goto rx_unmap_and_continue; 48.390 + } 48.391 + 48.392 + /* 48.393 + * Now ensure that we can take the last references to this page. 48.394 + * The final count should be 2, because of PGC_allocated. 48.395 + */ 48.396 + if ( unlikely(cmpxchg(&buf_page->count_and_flags, 48.397 + PGC_tlb_flush_on_type_change | 2, 0) != 48.398 + (PGC_tlb_flush_on_type_change | 2)) ) 48.399 + { 48.400 + DPRINTK("Page held more than once\n"); 48.401 + /* Leave the page unmapped at 'ptep'. Stoopid domain! */ 48.402 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); 48.403 goto rx_unmap_and_continue; 48.404 } 48.405 48.406 - buf_pfn = *ptep >> PAGE_SHIFT; 48.407 - buf_page = frame_table + buf_pfn; 48.408 + /* Remove from the domain's allocation list. */ 48.409 + spin_lock(&p->page_list_lock); 48.410 + list_del(&buf_page->list); 48.411 + spin_unlock(&p->page_list_lock); 48.412 48.413 - if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) != 48.414 - (PGT_writeable_page | p->domain)) || 48.415 - (page_tot_count(buf_page) != 1) ) 48.416 - { 48.417 - DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n", 48.418 - page_type_count(buf_page), page_tot_count(buf_page), 48.419 - buf_page->flags); 48.420 - make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); 48.421 - goto rx_unmap_and_continue; 48.422 - } 48.423 - 48.424 - /* 48.425 - * The pte they passed was good, so take it away from them. We also 48.426 - * lock down the page-table page, so it doesn't go away. 
48.427 - */ 48.428 - get_page_type(pte_page); 48.429 - get_page_tot(pte_page); 48.430 - *ptep &= ~_PAGE_PRESENT; 48.431 - buf_page->flags = 0; 48.432 - set_page_type_count(buf_page, 0); 48.433 - set_page_tot_count(buf_page, 0); 48.434 - list_del(&buf_page->list); 48.435 - 48.436 - vif->rx_shadow_ring[j].id = rx.id; 48.437 - vif->rx_shadow_ring[j].pte_ptr = rx.addr; 48.438 - vif->rx_shadow_ring[j].buf_pfn = buf_pfn; 48.439 - vif->rx_shadow_ring[j].flush_count = (unsigned short) 48.440 - atomic_read(&tlb_flush_count[smp_processor_id()]); 48.441 + vif->rx_shadow_ring[j].id = rx.id; 48.442 + vif->rx_shadow_ring[j].pte_ptr = rx.addr; 48.443 + vif->rx_shadow_ring[j].buf_pfn = buf_pfn; 48.444 j = RX_RING_INC(j); 48.445 48.446 rx_unmap_and_continue: 48.447 unmap_domain_mem(ptep); 48.448 - spin_unlock(&p->page_lock); 48.449 } 48.450 48.451 vif->rx_req_cons = i; 48.452 @@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t 48.453 vif->rx_prod = j; 48.454 } 48.455 48.456 + spin_unlock(&vif->rx_lock); 48.457 +} 48.458 + 48.459 + 48.460 +static long get_bufs_from_vif(net_vif_t *vif) 48.461 +{ 48.462 + if ( get_tx_bufs(vif) ) 48.463 + { 48.464 + add_to_net_schedule_list_tail(vif); 48.465 + maybe_schedule_tx_action(); 48.466 + } 48.467 + 48.468 + get_rx_bufs(vif); 48.469 + 48.470 return 0; 48.471 } 48.472 48.473 @@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t 48.474 long flush_bufs_for_vif(net_vif_t *vif) 48.475 { 48.476 int i; 48.477 - unsigned long *pte; 48.478 + unsigned long *ptep, pte; 48.479 struct pfn_info *page; 48.480 struct task_struct *p = vif->domain; 48.481 rx_shadow_entry_t *rx; 48.482 @@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif) 48.483 net_idx_t *shared_idxs = vif->shared_idxs; 48.484 48.485 /* Return any outstanding receive buffers to the guest OS. */ 48.486 - spin_lock(&p->page_lock); 48.487 + spin_lock(&vif->rx_lock); 48.488 for ( i = vif->rx_req_cons; 48.489 (i != shared_idxs->rx_req_prod) && 48.490 (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 48.491 @@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif) 48.492 { 48.493 rx = &vif->rx_shadow_ring[i]; 48.494 48.495 - /* Release the page-table page. */ 48.496 - page = frame_table + (rx->pte_ptr >> PAGE_SHIFT); 48.497 - put_page_type(page); 48.498 - put_page_tot(page); 48.499 - 48.500 /* Give the buffer page back to the domain. */ 48.501 - page = frame_table + rx->buf_pfn; 48.502 - list_add(&page->list, &p->pg_head); 48.503 - page->flags = vif->domain->domain; 48.504 + page = &frame_table[rx->buf_pfn]; 48.505 + spin_lock(&p->page_list_lock); 48.506 + list_add(&page->list, &p->page_list); 48.507 + page->count_and_flags = PGC_allocated | 2; 48.508 + spin_unlock(&p->page_list_lock); 48.509 + get_page_type(page, PGT_writeable_page); 48.510 + set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags); 48.511 + wmb(); 48.512 48.513 /* Patch up the PTE if it hasn't changed under our feet. 
*/ 48.514 - pte = map_domain_mem(rx->pte_ptr); 48.515 - if ( !(*pte & _PAGE_PRESENT) ) 48.516 - { 48.517 - *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) | 48.518 - _PAGE_RW | _PAGE_PRESENT; 48.519 - page->flags |= PGT_writeable_page | PG_need_flush; 48.520 - set_page_type_count(page, 1); 48.521 - set_page_tot_count(page, 1); 48.522 - } 48.523 - unmap_domain_mem(pte); 48.524 + ptep = map_domain_mem(rx->pte_ptr); 48.525 + pte = *ptep; 48.526 + if ( unlikely(pte & _PAGE_PRESENT) || 48.527 + unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) | 48.528 + (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT) 48.529 + != pte) ) 48.530 + put_page_and_type(page); 48.531 + unmap_domain_mem(ptep); 48.532 + 48.533 + put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]); 48.534 48.535 make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0); 48.536 } 48.537 vif->rx_cons = i; 48.538 - spin_unlock(&p->page_lock); 48.539 + spin_unlock(&vif->rx_lock); 48.540 48.541 /* 48.542 * Flush pending transmit buffers. The guest may still have to wait for 48.543 @@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif) 48.544 (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 48.545 i = TX_RING_INC(i) ) 48.546 { 48.547 - __make_tx_response(vif, shared_rings->tx_ring[i].req.id, 48.548 + make_tx_response(vif, shared_rings->tx_ring[i].req.id, 48.549 RING_STATUS_DROPPED); 48.550 } 48.551 vif->tx_req_cons = i; 48.552 @@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop) 48.553 } 48.554 48.555 48.556 -static void __make_tx_response(net_vif_t *vif, 48.557 - unsigned short id, 48.558 - unsigned char st) 48.559 +static void make_tx_response(net_vif_t *vif, 48.560 + unsigned short id, 48.561 + unsigned char st) 48.562 { 48.563 unsigned int pos; 48.564 tx_resp_entry_t *resp; 48.565 @@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t 48.566 rx_resp_entry_t *resp; 48.567 48.568 /* Place on the response ring for the relevant domain. */ 48.569 - spin_lock(&vif->rx_lock); 48.570 pos = vif->rx_resp_prod; 48.571 resp = &vif->shared_rings->rx_ring[pos].resp; 48.572 resp->id = id; 48.573 @@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t 48.574 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET); 48.575 guest_event_notify(cpu_mask); 48.576 } 48.577 - spin_unlock(&vif->rx_lock); 48.578 } 48.579 48.580
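The reworked dev.c receive path swaps pages between Xen and the guest by swizzling the guest's PTE with cmpxchg, so the new mapping is installed only if the entry is still not-present and unchanged since it was read; otherwise the packet is flagged RING_STATUS_BAD_PAGE. A small stand-alone model of that check-and-swap on a PTE-sized word (flag values borrowed from x86, helper name made up):

#include <stdio.h>

#define _PAGE_PRESENT 0x001UL
#define _PAGE_RW      0x002UL
#define PAGE_MASK     (~0xfffUL)

/* Hypothetical helper: install new_frame into *ptep only if the PTE is
 * currently not-present and has not changed since we read it. */
static int swizzle_pte(unsigned long *ptep, unsigned long new_frame)
{
    unsigned long pte = *ptep;
    if ( pte & _PAGE_PRESENT )
        return 0;                                   /* guest re-mapped it: bail */
    return __sync_bool_compare_and_swap(
        ptep, pte,
        (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | (new_frame & PAGE_MASK));
}

int main(void)
{
    unsigned long pte = 0x060;                      /* not present, some flag bits */
    int ok = swizzle_pte(&pte, 0x5000);
    printf("swizzled=%d pte=%#lx\n", ok, pte);      /* swizzled=1 pte=0x5063 */
    return 0;
}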
49.1 --- a/xen/net/skbuff.c Sat Dec 20 23:39:49 2003 +0000 49.2 +++ b/xen/net/skbuff.c Sat Dec 20 23:41:19 2003 +0000 49.3 @@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool( 49.4 49.5 static inline u8 *alloc_skb_data_page(struct sk_buff *skb) 49.6 { 49.7 - struct list_head *list_ptr; 49.8 - struct pfn_info *pf; 49.9 - unsigned long flags; 49.10 - 49.11 - spin_lock_irqsave(&free_list_lock, flags); 49.12 - 49.13 - if (!free_pfns) return NULL; 49.14 - 49.15 - list_ptr = free_list.next; 49.16 - pf = list_entry(list_ptr, struct pfn_info, list); 49.17 - pf->flags = 0; 49.18 - list_del(&pf->list); 49.19 - free_pfns--; 49.20 - 49.21 - spin_unlock_irqrestore(&free_list_lock, flags); 49.22 - 49.23 + struct pfn_info *pf; 49.24 + if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) ) 49.25 + return NULL; 49.26 skb->pf = pf; 49.27 return (u8 *)((pf - frame_table) << PAGE_SHIFT); 49.28 } 49.29 49.30 static inline void dealloc_skb_data_page(struct sk_buff *skb) 49.31 { 49.32 - struct pfn_info *pf; 49.33 + struct pfn_info *pf = skb->pf; 49.34 unsigned long flags; 49.35 - 49.36 - pf = skb->pf; 49.37 - 49.38 spin_lock_irqsave(&free_list_lock, flags); 49.39 - 49.40 - pf->flags = 0; 49.41 - set_page_type_count(pf, 0); 49.42 - set_page_tot_count(pf, 0); 49.43 list_add(&pf->list, &free_list); 49.44 free_pfns++; 49.45 - 49.46 spin_unlock_irqrestore(&free_list_lock, flags); 49.47 49.48 }
50.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c Sat Dec 20 23:39:49 2003 +0000 50.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c Sat Dec 20 23:41:19 2003 +0000 50.3 @@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void) 50.4 pte = update_debug_queue[i].ptep; 50.5 if ( pte == NULL ) continue; 50.6 update_debug_queue[i].ptep = NULL; 50.7 - update.ptr = pte; 50.8 + update.ptr = virt_to_machine(pte); 50.9 update.val = update_debug_queue[i].pteval; 50.10 HYPERVISOR_mmu_update(&update, 1); 50.11 } 50.12 @@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsig 50.13 pgd = pgd_offset_k(va); 50.14 pmd = pmd_offset(pgd, va); 50.15 pte = pte_offset(pmd, va); 50.16 - update.ptr = pte; 50.17 + update.ptr = virt_to_machine(pte); 50.18 pteval = *(unsigned long *)pte; 50.19 update.val = pteval & ~_PAGE_PRESENT; 50.20 HYPERVISOR_mmu_update(&update, 1); 50.21 @@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(v 50.22 #if MMU_UPDATE_DEBUG > 0 50.23 DEBUG_allow_pt_reads(); 50.24 #endif 50.25 - queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx); 50.26 + queue_multicall2(__HYPERVISOR_mmu_update, 50.27 + (unsigned long)update_queue, 50.28 + idx); 50.29 idx = 0; 50.30 } 50.31 spin_unlock_irqrestore(&update_lock, flags); 50.32 @@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, u 50.33 #if MMU_UPDATE_DEBUG > 0 50.34 DEBUG_disallow_pt_read((unsigned long)ptr); 50.35 #endif 50.36 - update_queue[idx].ptr = (unsigned long)ptr; 50.37 + update_queue[idx].ptr = virt_to_machine(ptr); 50.38 update_queue[idx].val = val; 50.39 increment_index(); 50.40 spin_unlock_irqrestore(&update_lock, flags); 50.41 @@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, u 50.42 { 50.43 unsigned long flags; 50.44 spin_lock_irqsave(&update_lock, flags); 50.45 - update_queue[idx].ptr = (unsigned long)ptr; 50.46 + update_queue[idx].ptr = virt_to_machine(ptr); 50.47 update_queue[idx].val = val; 50.48 increment_index(); 50.49 spin_unlock_irqrestore(&update_lock, flags);
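These hypervisor.c hunks follow from the hypervisor-if.h change above: mmu_update_t.ptr is now a machine address, so xenolinux must translate each PTE pointer with virt_to_machine() before queueing the update. The real translation goes virtual -> pseudo-physical -> machine via the phys_to_machine table; the toy below models only that last step, with a made-up four-entry table, to show how the page frame is remapped while the in-page offset is preserved.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Hypothetical pseudo-physical -> machine frame table, one entry per page. */
static unsigned long phys_to_machine_mapping[4] = { 0x9000, 0x2000, 0x7000, 0x4000 };

static unsigned long pa_to_ma(unsigned long pa)
{
    /* Swap the frame, keep the byte offset within the page. */
    return (phys_to_machine_mapping[pa >> PAGE_SHIFT] & PAGE_MASK) | (pa & ~PAGE_MASK);
}

int main(void)
{
    unsigned long pte_pa = (2UL << PAGE_SHIFT) | 0x18;          /* PTE in pseudo-phys page 2 */
    printf("pa=%#lx -> ma=%#lx\n", pte_pa, pa_to_ma(pte_pa));   /* 0x2018 -> 0x7018 */
    return 0;
}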
51.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c Sat Dec 20 23:39:49 2003 +0000 51.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c Sat Dec 20 23:41:19 2003 +0000 51.3 @@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigne 51.4 } 51.5 pte = pte_offset(pmd, vaddr); 51.6 51.7 -#if 0 /* Not in Xen, since this breaks clear_fixmap. */ 51.8 - if (pte_val(*pte)) 51.9 - pte_ERROR(*pte); 51.10 -#endif 51.11 - 51.12 - /* We queue directly, avoiding hidden phys->machine translation. */ 51.13 - queue_l1_entry_update(pte, phys | pgprot_val(prot)); 51.14 + if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) ) 51.15 + queue_unchecked_mmu_update(pte, phys | pgprot_val(prot)); 51.16 + else 51.17 + queue_l1_entry_update(pte, phys | pgprot_val(prot)); 51.18 51.19 /* 51.20 * It's enough to flush this one mapping. 51.21 @@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses i 51.22 printk("Invalid __set_fixmap\n"); 51.23 return; 51.24 } 51.25 - set_pte_phys(address, phys, 51.26 - __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags))); 51.27 + set_pte_phys(address, phys, flags); 51.28 } 51.29 51.30 void clear_fixmap(enum fixed_addresses idx)
52.1 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c Sat Dec 20 23:39:49 2003 +0000 52.2 +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c Sat Dec 20 23:41:19 2003 +0000 52.3 @@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long ma 52.4 */ 52.5 nrpages = size >> PAGE_SHIFT; 52.6 if (nrpages > NR_FIX_BTMAPS) 52.7 - return NULL; 52.8 + return NULL; 52.9 52.10 /* 52.11 * Ok, go for it.. 52.12 */ 52.13 idx = FIX_BTMAP_BEGIN; 52.14 while (nrpages > 0) { 52.15 - set_fixmap(idx, machine_addr); 52.16 + __set_fixmap(idx, machine_addr, 52.17 + __pgprot(__PAGE_KERNEL|_PAGE_IO)); 52.18 machine_addr += PAGE_SIZE; 52.19 --idx; 52.20 --nrpages;