ia64/xen-unstable

changeset 5663:f6e7c967212e

Add hook in get_user_pages to allow lookups of foreign mapped pages.

Direct IO to userspace (e.g. with libaio) needs to map user virtual addresses
down to page structs. This patch adds a new vma flag (VM_FOREIGN) to tell
get_user_pages that there are foreign frames in the vma. If VM_FOREIGN is set,
vm_private_data points to a map of struct page pointers, one per page of the
vma, indicating the physical page underpinning each vaddr.

After a fair bit of discussion with Keir, this seems to be the least
intrusive way to allow this sort of lookup. If this solves things, we
can pull the VM_FOREIGN clause out and make it a no-op on non-Xen
arches, in the same way that the gate_area check above it is.
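
Illustrative sketch (not part of this changeset): a backend driver exporting
foreign frames to user space might flag its vma and publish the page map along
these lines. struct foo_dev, foo->pages and foo_mmap() are hypothetical names;
only VM_FOREIGN and the use of vm_private_data come from this patch.

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct foo_dev {                        /* hypothetical per-device state */
            struct page **pages;            /* one entry per page of the vma */
    };

    static int foo_mmap(struct file *file, struct vm_area_struct *vma)
    {
            struct foo_dev *foo = file->private_data;
            unsigned long nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

            foo->pages = kmalloc(nr * sizeof(struct page *), GFP_KERNEL);
            if (foo->pages == NULL)
                    return -ENOMEM;
            memset(foo->pages, 0, nr * sizeof(struct page *));

            /* Entries are filled in as foreign frames are mapped into the
             * vma; the get_user_pages() hook below consults this table. */
            vma->vm_flags |= VM_FOREIGN;
            vma->vm_private_data = foo->pages;
            return 0;
    }
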
author akw27@arcadians.cl.cam.ac.uk
date Mon Jul 04 15:31:47 2005 +0000 (2005-07-04)
parents f8acd354e129
children 9b1866006aea
files linux-2.6.11-xen-sparse/include/linux/mm.h linux-2.6.11-xen-sparse/mm/memory.c
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/linux-2.6.11-xen-sparse/include/linux/mm.h	Mon Jul 04 15:31:47 2005 +0000
     1.3 @@ -0,0 +1,865 @@
     1.4 +#ifndef _LINUX_MM_H
     1.5 +#define _LINUX_MM_H
     1.6 +
     1.7 +#include <linux/sched.h>
     1.8 +#include <linux/errno.h>
     1.9 +
    1.10 +#ifdef __KERNEL__
    1.11 +
    1.12 +#include <linux/config.h>
    1.13 +#include <linux/gfp.h>
    1.14 +#include <linux/list.h>
    1.15 +#include <linux/mmzone.h>
    1.16 +#include <linux/rbtree.h>
    1.17 +#include <linux/prio_tree.h>
    1.18 +#include <linux/fs.h>
    1.19 +
    1.20 +struct mempolicy;
    1.21 +struct anon_vma;
    1.22 +
    1.23 +#ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
    1.24 +extern unsigned long max_mapnr;
    1.25 +#endif
    1.26 +
    1.27 +extern unsigned long num_physpages;
    1.28 +extern void * high_memory;
    1.29 +extern unsigned long vmalloc_earlyreserve;
    1.30 +extern int page_cluster;
    1.31 +
    1.32 +#ifdef CONFIG_SYSCTL
    1.33 +extern int sysctl_legacy_va_layout;
    1.34 +#else
    1.35 +#define sysctl_legacy_va_layout 0
    1.36 +#endif
    1.37 +
    1.38 +#include <asm/page.h>
    1.39 +#include <asm/pgtable.h>
    1.40 +#include <asm/processor.h>
    1.41 +#include <asm/atomic.h>
    1.42 +
    1.43 +#ifndef MM_VM_SIZE
    1.44 +#define MM_VM_SIZE(mm)	((TASK_SIZE + PGDIR_SIZE - 1) & PGDIR_MASK)
    1.45 +#endif
    1.46 +
    1.47 +#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
    1.48 +
    1.49 +/*
    1.50 + * Linux kernel virtual memory manager primitives.
    1.51 + * The idea being to have a "virtual" mm in the same way
    1.52 + * we have a virtual fs - giving a cleaner interface to the
    1.53 + * mm details, and allowing different kinds of memory mappings
    1.54 + * (from shared memory to executable loading to arbitrary
    1.55 + * mmap() functions).
    1.56 + */
    1.57 +
    1.58 +/*
     1.59 + * This struct defines a VM memory area. There is one of these
    1.60 + * per VM-area/task.  A VM area is any part of the process virtual memory
    1.61 + * space that has a special rule for the page-fault handlers (ie a shared
    1.62 + * library, the executable area etc).
    1.63 + */
    1.64 +struct vm_area_struct {
    1.65 +	struct mm_struct * vm_mm;	/* The address space we belong to. */
    1.66 +	unsigned long vm_start;		/* Our start address within vm_mm. */
    1.67 +	unsigned long vm_end;		/* The first byte after our end address
    1.68 +					   within vm_mm. */
    1.69 +
    1.70 +	/* linked list of VM areas per task, sorted by address */
    1.71 +	struct vm_area_struct *vm_next;
    1.72 +
    1.73 +	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
    1.74 +	unsigned long vm_flags;		/* Flags, listed below. */
    1.75 +
    1.76 +	struct rb_node vm_rb;
    1.77 +
    1.78 +	/*
    1.79 +	 * For areas with an address space and backing store,
    1.80 +	 * linkage into the address_space->i_mmap prio tree, or
    1.81 +	 * linkage to the list of like vmas hanging off its node, or
    1.82 +	 * linkage of vma in the address_space->i_mmap_nonlinear list.
    1.83 +	 */
    1.84 +	union {
    1.85 +		struct {
    1.86 +			struct list_head list;
    1.87 +			void *parent;	/* aligns with prio_tree_node parent */
    1.88 +			struct vm_area_struct *head;
    1.89 +		} vm_set;
    1.90 +
    1.91 +		struct raw_prio_tree_node prio_tree_node;
    1.92 +	} shared;
    1.93 +
    1.94 +	/*
    1.95 +	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
    1.96 +	 * list, after a COW of one of the file pages.  A MAP_SHARED vma
    1.97 +	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
    1.98 +	 * or brk vma (with NULL file) can only be in an anon_vma list.
    1.99 +	 */
   1.100 +	struct list_head anon_vma_node;	/* Serialized by anon_vma->lock */
   1.101 +	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
   1.102 +
   1.103 +	/* Function pointers to deal with this struct. */
   1.104 +	struct vm_operations_struct * vm_ops;
   1.105 +
   1.106 +	/* Information about our backing store: */
   1.107 +	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
   1.108 +					   units, *not* PAGE_CACHE_SIZE */
   1.109 +	struct file * vm_file;		/* File we map to (can be NULL). */
   1.110 +	void * vm_private_data;		/* was vm_pte (shared mem) */
   1.111 +	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
   1.112 +
   1.113 +#ifndef CONFIG_MMU
   1.114 +	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
   1.115 +#endif
   1.116 +#ifdef CONFIG_NUMA
   1.117 +	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
   1.118 +#endif
   1.119 +};
   1.120 +
   1.121 +/*
   1.122 + * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
   1.123 + * disabled, then there's a single shared list of VMAs maintained by the
   1.124 + * system, and mm's subscribe to these individually
   1.125 + */
   1.126 +struct vm_list_struct {
   1.127 +	struct vm_list_struct	*next;
   1.128 +	struct vm_area_struct	*vma;
   1.129 +};
   1.130 +
   1.131 +#ifndef CONFIG_MMU
   1.132 +extern struct rb_root nommu_vma_tree;
   1.133 +extern struct rw_semaphore nommu_vma_sem;
   1.134 +
   1.135 +extern unsigned int kobjsize(const void *objp);
   1.136 +#endif
   1.137 +
   1.138 +/*
   1.139 + * vm_flags..
   1.140 + */
   1.141 +#define VM_READ		0x00000001	/* currently active flags */
   1.142 +#define VM_WRITE	0x00000002
   1.143 +#define VM_EXEC		0x00000004
   1.144 +#define VM_SHARED	0x00000008
   1.145 +
   1.146 +#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
   1.147 +#define VM_MAYWRITE	0x00000020
   1.148 +#define VM_MAYEXEC	0x00000040
   1.149 +#define VM_MAYSHARE	0x00000080
   1.150 +
   1.151 +#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
   1.152 +#define VM_GROWSUP	0x00000200
   1.153 +#define VM_SHM		0x00000400	/* shared memory area, don't swap out */
   1.154 +#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
   1.155 +
   1.156 +#define VM_EXECUTABLE	0x00001000
   1.157 +#define VM_LOCKED	0x00002000
   1.158 +#define VM_IO           0x00004000	/* Memory mapped I/O or similar */
   1.159 +
   1.160 +					/* Used by sys_madvise() */
   1.161 +#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
   1.162 +#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
   1.163 +
   1.164 +#define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
   1.165 +#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
   1.166 +#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
   1.167 +#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
   1.168 +#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
   1.169 +#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
   1.170 +#define VM_FOREIGN      0x01000000      /* Has pages belonging to another VM */
   1.171 +
   1.172 +#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
   1.173 +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
   1.174 +#endif
   1.175 +
   1.176 +#ifdef CONFIG_STACK_GROWSUP
   1.177 +#define VM_STACK_FLAGS	(VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
   1.178 +#else
   1.179 +#define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
   1.180 +#endif
   1.181 +
   1.182 +#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
   1.183 +#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
   1.184 +#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
   1.185 +#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
   1.186 +#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
   1.187 +
   1.188 +/*
   1.189 + * mapping from the currently active vm_flags protection bits (the
   1.190 + * low four bits) to a page protection mask..
   1.191 + */
   1.192 +extern pgprot_t protection_map[16];
   1.193 +
   1.194 +
   1.195 +/*
   1.196 + * These are the virtual MM functions - opening of an area, closing and
   1.197 + * unmapping it (needed to keep files on disk up-to-date etc), pointer
   1.198 + * to the functions called when a no-page or a wp-page exception occurs. 
   1.199 + */
   1.200 +struct vm_operations_struct {
   1.201 +	void (*open)(struct vm_area_struct * area);
   1.202 +	void (*close)(struct vm_area_struct * area);
   1.203 +	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
   1.204 +	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
   1.205 +#ifdef CONFIG_NUMA
   1.206 +	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
   1.207 +	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
   1.208 +					unsigned long addr);
   1.209 +#endif
   1.210 +};
   1.211 +
   1.212 +struct mmu_gather;
   1.213 +struct inode;
   1.214 +
   1.215 +#ifdef ARCH_HAS_ATOMIC_UNSIGNED
   1.216 +typedef unsigned page_flags_t;
   1.217 +#else
   1.218 +typedef unsigned long page_flags_t;
   1.219 +#endif
   1.220 +
   1.221 +/*
   1.222 + * Each physical page in the system has a struct page associated with
   1.223 + * it to keep track of whatever it is we are using the page for at the
   1.224 + * moment. Note that we have no way to track which tasks are using
   1.225 + * a page.
   1.226 + */
   1.227 +struct page {
   1.228 +	page_flags_t flags;		/* Atomic flags, some possibly
   1.229 +					 * updated asynchronously */
   1.230 +	atomic_t _count;		/* Usage count, see below. */
   1.231 +	atomic_t _mapcount;		/* Count of ptes mapped in mms,
   1.232 +					 * to show when page is mapped
   1.233 +					 * & limit reverse map searches.
   1.234 +					 */
   1.235 +	unsigned long private;		/* Mapping-private opaque data:
   1.236 +					 * usually used for buffer_heads
   1.237 +					 * if PagePrivate set; used for
   1.238 +					 * swp_entry_t if PageSwapCache
   1.239 +					 * When page is free, this indicates
   1.240 +					 * order in the buddy system.
   1.241 +					 */
   1.242 +	struct address_space *mapping;	/* If low bit clear, points to
   1.243 +					 * inode address_space, or NULL.
   1.244 +					 * If page mapped as anonymous
   1.245 +					 * memory, low bit is set, and
   1.246 +					 * it points to anon_vma object:
   1.247 +					 * see PAGE_MAPPING_ANON below.
   1.248 +					 */
   1.249 +	pgoff_t index;			/* Our offset within mapping. */
   1.250 +	struct list_head lru;		/* Pageout list, eg. active_list
   1.251 +					 * protected by zone->lru_lock !
   1.252 +					 */
   1.253 +	/*
   1.254 +	 * On machines where all RAM is mapped into kernel address space,
   1.255 +	 * we can simply calculate the virtual address. On machines with
   1.256 +	 * highmem some memory is mapped into kernel virtual memory
   1.257 +	 * dynamically, so we need a place to store that address.
   1.258 +	 * Note that this field could be 16 bits on x86 ... ;)
   1.259 +	 *
   1.260 +	 * Architectures with slow multiplication can define
   1.261 +	 * WANT_PAGE_VIRTUAL in asm/page.h
   1.262 +	 */
   1.263 +#if defined(WANT_PAGE_VIRTUAL)
   1.264 +	void *virtual;			/* Kernel virtual address (NULL if
   1.265 +					   not kmapped, ie. highmem) */
   1.266 +#endif /* WANT_PAGE_VIRTUAL */
   1.267 +};
   1.268 +
   1.269 +/*
   1.270 + * FIXME: take this include out, include page-flags.h in
   1.271 + * files which need it (119 of them)
   1.272 + */
   1.273 +#include <linux/page-flags.h>
   1.274 +
   1.275 +/*
   1.276 + * Methods to modify the page usage count.
   1.277 + *
   1.278 + * What counts for a page usage:
   1.279 + * - cache mapping   (page->mapping)
   1.280 + * - private data    (page->private)
   1.281 + * - page mapped in a task's page tables, each mapping
   1.282 + *   is counted separately
   1.283 + *
   1.284 + * Also, many kernel routines increase the page count before a critical
   1.285 + * routine so they can be sure the page doesn't go away from under them.
   1.286 + *
   1.287 + * Since 2.6.6 (approx), a free page has ->_count = -1.  This is so that we
   1.288 + * can use atomic_add_negative(-1, page->_count) to detect when the page
   1.289 + * becomes free and so that we can also use atomic_inc_and_test to atomically
   1.290 + * detect when we just tried to grab a ref on a page which some other CPU has
   1.291 + * already deemed to be freeable.
   1.292 + *
   1.293 + * NO code should make assumptions about this internal detail!  Use the provided
   1.294 + * macros which retain the old rules: page_count(page) == 0 is a free page.
   1.295 + */
   1.296 +
   1.297 +/*
   1.298 + * Drop a ref, return true if the logical refcount fell to zero (the page has
   1.299 + * no users)
   1.300 + */
   1.301 +#define put_page_testzero(p)				\
   1.302 +	({						\
   1.303 +		BUG_ON(page_count(p) == 0);		\
   1.304 +		atomic_add_negative(-1, &(p)->_count);	\
   1.305 +	})
   1.306 +
   1.307 +/*
   1.308 + * Grab a ref, return true if the page previously had a logical refcount of
   1.309 + * zero.  ie: returns true if we just grabbed an already-deemed-to-be-free page
   1.310 + */
   1.311 +#define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
   1.312 +
   1.313 +#define set_page_count(p,v) 	atomic_set(&(p)->_count, v - 1)
   1.314 +#define __put_page(p)		atomic_dec(&(p)->_count)
   1.315 +
   1.316 +extern void FASTCALL(__page_cache_release(struct page *));
   1.317 +
   1.318 +#ifdef CONFIG_HUGETLB_PAGE
   1.319 +
   1.320 +static inline int page_count(struct page *p)
   1.321 +{
   1.322 +	if (PageCompound(p))
   1.323 +		p = (struct page *)p->private;
   1.324 +	return atomic_read(&(p)->_count) + 1;
   1.325 +}
   1.326 +
   1.327 +static inline void get_page(struct page *page)
   1.328 +{
   1.329 +	if (unlikely(PageCompound(page)))
   1.330 +		page = (struct page *)page->private;
   1.331 +	atomic_inc(&page->_count);
   1.332 +}
   1.333 +
   1.334 +void put_page(struct page *page);
   1.335 +
   1.336 +#else		/* CONFIG_HUGETLB_PAGE */
   1.337 +
   1.338 +#define page_count(p)		(atomic_read(&(p)->_count) + 1)
   1.339 +
   1.340 +static inline void get_page(struct page *page)
   1.341 +{
   1.342 +	atomic_inc(&page->_count);
   1.343 +}
   1.344 +
   1.345 +static inline void put_page(struct page *page)
   1.346 +{
   1.347 +	if (!PageReserved(page) && put_page_testzero(page))
   1.348 +		__page_cache_release(page);
   1.349 +}
   1.350 +
   1.351 +#endif		/* CONFIG_HUGETLB_PAGE */
   1.352 +
   1.353 +/*
   1.354 + * Multiple processes may "see" the same page. E.g. for untouched
   1.355 + * mappings of /dev/null, all processes see the same page full of
   1.356 + * zeroes, and text pages of executables and shared libraries have
   1.357 + * only one copy in memory, at most, normally.
   1.358 + *
   1.359 + * For the non-reserved pages, page_count(page) denotes a reference count.
   1.360 + *   page_count() == 0 means the page is free.
   1.361 + *   page_count() == 1 means the page is used for exactly one purpose
   1.362 + *   (e.g. a private data page of one process).
   1.363 + *
   1.364 + * A page may be used for kmalloc() or anyone else who does a
   1.365 + * __get_free_page(). In this case the page_count() is at least 1, and
   1.366 + * all other fields are unused but should be 0 or NULL. The
   1.367 + * management of this page is the responsibility of the one who uses
   1.368 + * it.
   1.369 + *
   1.370 + * The other pages (we may call them "process pages") are completely
   1.371 + * managed by the Linux memory manager: I/O, buffers, swapping etc.
   1.372 + * The following discussion applies only to them.
   1.373 + *
   1.374 + * A page may belong to an inode's memory mapping. In this case,
   1.375 + * page->mapping is the pointer to the inode, and page->index is the
   1.376 + * file offset of the page, in units of PAGE_CACHE_SIZE.
   1.377 + *
   1.378 + * A page contains an opaque `private' member, which belongs to the
   1.379 + * page's address_space.  Usually, this is the address of a circular
   1.380 + * list of the page's disk buffers.
   1.381 + *
   1.382 + * For pages belonging to inodes, the page_count() is the number of
   1.383 + * attaches, plus 1 if `private' contains something, plus one for
   1.384 + * the page cache itself.
   1.385 + *
   1.386 + * All pages belonging to an inode are in these doubly linked lists:
   1.387 + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages;
   1.388 + * using the page->list list_head. These fields are also used for
    1.389 + * freelist management (when page_count()==0).
   1.390 + *
   1.391 + * There is also a per-mapping radix tree mapping index to the page
   1.392 + * in memory if present. The tree is rooted at mapping->root.  
   1.393 + *
   1.394 + * All process pages can do I/O:
   1.395 + * - inode pages may need to be read from disk,
   1.396 + * - inode pages which have been modified and are MAP_SHARED may need
   1.397 + *   to be written to disk,
   1.398 + * - private pages which have been modified may need to be swapped out
   1.399 + *   to swap space and (later) to be read back into memory.
   1.400 + */
   1.401 +
   1.402 +/*
   1.403 + * The zone field is never updated after free_area_init_core()
   1.404 + * sets it, so none of the operations on it need to be atomic.
   1.405 + * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total,
   1.406 + * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits.
   1.407 + */
   1.408 +#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT)
   1.409 +#define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
   1.410 +
   1.411 +static inline unsigned long page_zonenum(struct page *page)
   1.412 +{
   1.413 +	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
   1.414 +}
   1.415 +static inline unsigned long page_to_nid(struct page *page)
   1.416 +{
   1.417 +	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
   1.418 +}
   1.419 +
   1.420 +struct zone;
   1.421 +extern struct zone *zone_table[];
   1.422 +
   1.423 +static inline struct zone *page_zone(struct page *page)
   1.424 +{
   1.425 +	return zone_table[page->flags >> NODEZONE_SHIFT];
   1.426 +}
   1.427 +
   1.428 +static inline void set_page_zone(struct page *page, unsigned long nodezone_num)
   1.429 +{
   1.430 +	page->flags &= ~(~0UL << NODEZONE_SHIFT);
   1.431 +	page->flags |= nodezone_num << NODEZONE_SHIFT;
   1.432 +}
   1.433 +
   1.434 +#ifndef CONFIG_DISCONTIGMEM
   1.435 +/* The array of struct pages - for discontigmem use pgdat->lmem_map */
   1.436 +extern struct page *mem_map;
   1.437 +#endif
   1.438 +
   1.439 +static inline void *lowmem_page_address(struct page *page)
   1.440 +{
   1.441 +	return __va(page_to_pfn(page) << PAGE_SHIFT);
   1.442 +}
   1.443 +
   1.444 +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
   1.445 +#define HASHED_PAGE_VIRTUAL
   1.446 +#endif
   1.447 +
   1.448 +#if defined(WANT_PAGE_VIRTUAL)
   1.449 +#define page_address(page) ((page)->virtual)
   1.450 +#define set_page_address(page, address)			\
   1.451 +	do {						\
   1.452 +		(page)->virtual = (address);		\
   1.453 +	} while(0)
   1.454 +#define page_address_init()  do { } while(0)
   1.455 +#endif
   1.456 +
   1.457 +#if defined(HASHED_PAGE_VIRTUAL)
   1.458 +void *page_address(struct page *page);
   1.459 +void set_page_address(struct page *page, void *virtual);
   1.460 +void page_address_init(void);
   1.461 +#endif
   1.462 +
   1.463 +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
   1.464 +#define page_address(page) lowmem_page_address(page)
   1.465 +#define set_page_address(page, address)  do { } while(0)
   1.466 +#define page_address_init()  do { } while(0)
   1.467 +#endif
   1.468 +
   1.469 +/*
   1.470 + * On an anonymous page mapped into a user virtual memory area,
   1.471 + * page->mapping points to its anon_vma, not to a struct address_space;
   1.472 + * with the PAGE_MAPPING_ANON bit set to distinguish it.
   1.473 + *
   1.474 + * Please note that, confusingly, "page_mapping" refers to the inode
   1.475 + * address_space which maps the page from disk; whereas "page_mapped"
   1.476 + * refers to user virtual address space into which the page is mapped.
   1.477 + */
   1.478 +#define PAGE_MAPPING_ANON	1
   1.479 +
   1.480 +extern struct address_space swapper_space;
   1.481 +static inline struct address_space *page_mapping(struct page *page)
   1.482 +{
   1.483 +	struct address_space *mapping = page->mapping;
   1.484 +
   1.485 +	if (unlikely(PageSwapCache(page)))
   1.486 +		mapping = &swapper_space;
   1.487 +	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
   1.488 +		mapping = NULL;
   1.489 +	return mapping;
   1.490 +}
   1.491 +
   1.492 +static inline int PageAnon(struct page *page)
   1.493 +{
   1.494 +	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
   1.495 +}
   1.496 +
   1.497 +/*
   1.498 + * Return the pagecache index of the passed page.  Regular pagecache pages
   1.499 + * use ->index whereas swapcache pages use ->private
   1.500 + */
   1.501 +static inline pgoff_t page_index(struct page *page)
   1.502 +{
   1.503 +	if (unlikely(PageSwapCache(page)))
   1.504 +		return page->private;
   1.505 +	return page->index;
   1.506 +}
   1.507 +
   1.508 +/*
   1.509 + * The atomic page->_mapcount, like _count, starts from -1:
   1.510 + * so that transitions both from it and to it can be tracked,
   1.511 + * using atomic_inc_and_test and atomic_add_negative(-1).
   1.512 + */
   1.513 +static inline void reset_page_mapcount(struct page *page)
   1.514 +{
   1.515 +	atomic_set(&(page)->_mapcount, -1);
   1.516 +}
   1.517 +
   1.518 +static inline int page_mapcount(struct page *page)
   1.519 +{
   1.520 +	return atomic_read(&(page)->_mapcount) + 1;
   1.521 +}
   1.522 +
   1.523 +/*
   1.524 + * Return true if this page is mapped into pagetables.
   1.525 + */
   1.526 +static inline int page_mapped(struct page *page)
   1.527 +{
   1.528 +	return atomic_read(&(page)->_mapcount) >= 0;
   1.529 +}
   1.530 +
   1.531 +/*
   1.532 + * Error return values for the *_nopage functions
   1.533 + */
   1.534 +#define NOPAGE_SIGBUS	(NULL)
   1.535 +#define NOPAGE_OOM	((struct page *) (-1))
   1.536 +
   1.537 +/*
   1.538 + * Different kinds of faults, as returned by handle_mm_fault().
   1.539 + * Used to decide whether a process gets delivered SIGBUS or
   1.540 + * just gets major/minor fault counters bumped up.
   1.541 + */
   1.542 +#define VM_FAULT_OOM	(-1)
   1.543 +#define VM_FAULT_SIGBUS	0
   1.544 +#define VM_FAULT_MINOR	1
   1.545 +#define VM_FAULT_MAJOR	2
   1.546 +
   1.547 +#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
   1.548 +
   1.549 +extern void show_free_areas(void);
   1.550 +
   1.551 +#ifdef CONFIG_SHMEM
   1.552 +struct page *shmem_nopage(struct vm_area_struct *vma,
   1.553 +			unsigned long address, int *type);
   1.554 +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new);
   1.555 +struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
   1.556 +					unsigned long addr);
   1.557 +int shmem_lock(struct file *file, int lock, struct user_struct *user);
   1.558 +#else
   1.559 +#define shmem_nopage filemap_nopage
   1.560 +#define shmem_lock(a, b, c) 	({0;})	/* always in memory, no need to lock */
   1.561 +#define shmem_set_policy(a, b)	(0)
   1.562 +#define shmem_get_policy(a, b)	(NULL)
   1.563 +#endif
   1.564 +struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
   1.565 +
   1.566 +int shmem_zero_setup(struct vm_area_struct *);
   1.567 +
   1.568 +static inline int can_do_mlock(void)
   1.569 +{
   1.570 +	if (capable(CAP_IPC_LOCK))
   1.571 +		return 1;
   1.572 +	if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
   1.573 +		return 1;
   1.574 +	return 0;
   1.575 +}
   1.576 +extern int user_shm_lock(size_t, struct user_struct *);
   1.577 +extern void user_shm_unlock(size_t, struct user_struct *);
   1.578 +
   1.579 +/*
   1.580 + * Parameter block passed down to zap_pte_range in exceptional cases.
   1.581 + */
   1.582 +struct zap_details {
   1.583 +	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
   1.584 +	struct address_space *check_mapping;	/* Check page->mapping if set */
   1.585 +	pgoff_t	first_index;			/* Lowest page->index to unmap */
   1.586 +	pgoff_t last_index;			/* Highest page->index to unmap */
   1.587 +	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
   1.588 +	unsigned long break_addr;		/* Where unmap_vmas stopped */
   1.589 +	unsigned long truncate_count;		/* Compare vm_truncate_count */
   1.590 +};
   1.591 +
   1.592 +void zap_page_range(struct vm_area_struct *vma, unsigned long address,
   1.593 +		unsigned long size, struct zap_details *);
   1.594 +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
   1.595 +		struct vm_area_struct *start_vma, unsigned long start_addr,
   1.596 +		unsigned long end_addr, unsigned long *nr_accounted,
   1.597 +		struct zap_details *);
   1.598 +void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
   1.599 +int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
   1.600 +			struct vm_area_struct *vma);
   1.601 +int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
   1.602 +			unsigned long size, pgprot_t prot);
   1.603 +void unmap_mapping_range(struct address_space *mapping,
   1.604 +		loff_t const holebegin, loff_t const holelen, int even_cows);
   1.605 +
   1.606 +static inline void unmap_shared_mapping_range(struct address_space *mapping,
   1.607 +		loff_t const holebegin, loff_t const holelen)
   1.608 +{
   1.609 +	unmap_mapping_range(mapping, holebegin, holelen, 0);
   1.610 +}
   1.611 +
   1.612 +extern int vmtruncate(struct inode * inode, loff_t offset);
   1.613 +extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
   1.614 +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
   1.615 +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
   1.616 +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
   1.617 +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
   1.618 +extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
   1.619 +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
   1.620 +extern int make_pages_present(unsigned long addr, unsigned long end);
   1.621 +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
   1.622 +void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
   1.623 +
   1.624 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
   1.625 +		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
   1.626 +
   1.627 +int __set_page_dirty_buffers(struct page *page);
   1.628 +int __set_page_dirty_nobuffers(struct page *page);
   1.629 +int redirty_page_for_writepage(struct writeback_control *wbc,
   1.630 +				struct page *page);
   1.631 +int FASTCALL(set_page_dirty(struct page *page));
   1.632 +int set_page_dirty_lock(struct page *page);
   1.633 +int clear_page_dirty_for_io(struct page *page);
   1.634 +
   1.635 +extern unsigned long do_mremap(unsigned long addr,
   1.636 +			       unsigned long old_len, unsigned long new_len,
   1.637 +			       unsigned long flags, unsigned long new_addr);
   1.638 +
   1.639 +/*
   1.640 + * Prototype to add a shrinker callback for ageable caches.
   1.641 + * 
   1.642 + * These functions are passed a count `nr_to_scan' and a gfpmask.  They should
   1.643 + * scan `nr_to_scan' objects, attempting to free them.
   1.644 + *
    1.645 + * The callback must return the number of objects which remain in the cache.
   1.646 + *
    1.647 + * The callback will be passed nr_to_scan == 0 when the VM is querying the
   1.648 + * cache size, so a fastpath for that case is appropriate.
   1.649 + */
   1.650 +typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
   1.651 +
   1.652 +/*
   1.653 + * Add an aging callback.  The int is the number of 'seeks' it takes
   1.654 + * to recreate one of the objects that these functions age.
   1.655 + */
   1.656 +
   1.657 +#define DEFAULT_SEEKS 2
   1.658 +struct shrinker;
   1.659 +extern struct shrinker *set_shrinker(int, shrinker_t);
   1.660 +extern void remove_shrinker(struct shrinker *shrinker);
   1.661 +
   1.662 +/*
   1.663 + * On a two-level or three-level page table, this ends up being trivial. Thus
   1.664 + * the inlining and the symmetry break with pte_alloc_map() that does all
   1.665 + * of this out-of-line.
   1.666 + */
   1.667 +/*
   1.668 + * The following ifdef needed to get the 4level-fixup.h header to work.
   1.669 + * Remove it when 4level-fixup.h has been removed.
   1.670 + */
   1.671 +#ifdef CONFIG_MMU
   1.672 +#ifndef __ARCH_HAS_4LEVEL_HACK 
   1.673 +static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
   1.674 +{
   1.675 +	if (pgd_none(*pgd))
   1.676 +		return __pud_alloc(mm, pgd, address);
   1.677 +	return pud_offset(pgd, address);
   1.678 +}
   1.679 +
   1.680 +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
   1.681 +{
   1.682 +	if (pud_none(*pud))
   1.683 +		return __pmd_alloc(mm, pud, address);
   1.684 +	return pmd_offset(pud, address);
   1.685 +}
   1.686 +#endif
   1.687 +#endif /* CONFIG_MMU */
   1.688 +
   1.689 +extern void free_area_init(unsigned long * zones_size);
   1.690 +extern void free_area_init_node(int nid, pg_data_t *pgdat,
   1.691 +	unsigned long * zones_size, unsigned long zone_start_pfn, 
   1.692 +	unsigned long *zholes_size);
   1.693 +extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
   1.694 +extern void mem_init(void);
   1.695 +extern void show_mem(void);
   1.696 +extern void si_meminfo(struct sysinfo * val);
   1.697 +extern void si_meminfo_node(struct sysinfo *val, int nid);
   1.698 +
   1.699 +/* prio_tree.c */
   1.700 +void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
   1.701 +void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
   1.702 +void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
   1.703 +struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
   1.704 +	struct prio_tree_iter *iter);
   1.705 +
   1.706 +#define vma_prio_tree_foreach(vma, iter, root, begin, end)	\
   1.707 +	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
   1.708 +		(vma = vma_prio_tree_next(vma, iter)); )
   1.709 +
   1.710 +static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
   1.711 +					struct list_head *list)
   1.712 +{
   1.713 +	vma->shared.vm_set.parent = NULL;
   1.714 +	list_add_tail(&vma->shared.vm_set.list, list);
   1.715 +}
   1.716 +
   1.717 +/* mmap.c */
   1.718 +extern int __vm_enough_memory(long pages, int cap_sys_admin);
   1.719 +extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
   1.720 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
   1.721 +extern struct vm_area_struct *vma_merge(struct mm_struct *,
   1.722 +	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
   1.723 +	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
   1.724 +	struct mempolicy *);
   1.725 +extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
   1.726 +extern int split_vma(struct mm_struct *,
   1.727 +	struct vm_area_struct *, unsigned long addr, int new_below);
   1.728 +extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
   1.729 +extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
   1.730 +	struct rb_node **, struct rb_node *);
   1.731 +extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
   1.732 +	unsigned long addr, unsigned long len, pgoff_t pgoff);
   1.733 +extern void exit_mmap(struct mm_struct *);
   1.734 +
   1.735 +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
   1.736 +
   1.737 +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
   1.738 +	unsigned long len, unsigned long prot,
   1.739 +	unsigned long flag, unsigned long pgoff);
   1.740 +
   1.741 +static inline unsigned long do_mmap(struct file *file, unsigned long addr,
   1.742 +	unsigned long len, unsigned long prot,
   1.743 +	unsigned long flag, unsigned long offset)
   1.744 +{
   1.745 +	unsigned long ret = -EINVAL;
   1.746 +	if ((offset + PAGE_ALIGN(len)) < offset)
   1.747 +		goto out;
   1.748 +	if (!(offset & ~PAGE_MASK))
   1.749 +		ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
   1.750 +out:
   1.751 +	return ret;
   1.752 +}
   1.753 +
   1.754 +extern int do_munmap(struct mm_struct *, unsigned long, size_t);
   1.755 +
   1.756 +extern unsigned long do_brk(unsigned long, unsigned long);
   1.757 +
   1.758 +/* filemap.c */
   1.759 +extern unsigned long page_unuse(struct page *);
   1.760 +extern void truncate_inode_pages(struct address_space *, loff_t);
   1.761 +
   1.762 +/* generic vm_area_ops exported for stackable file systems */
   1.763 +extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
   1.764 +extern int filemap_populate(struct vm_area_struct *, unsigned long,
   1.765 +		unsigned long, pgprot_t, unsigned long, int);
   1.766 +
   1.767 +/* mm/page-writeback.c */
   1.768 +int write_one_page(struct page *page, int wait);
   1.769 +
   1.770 +/* readahead.c */
   1.771 +#define VM_MAX_READAHEAD	128	/* kbytes */
   1.772 +#define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
   1.773 +#define VM_MAX_CACHE_HIT    	256	/* max pages in a row in cache before
   1.774 +					 * turning readahead off */
   1.775 +
   1.776 +int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
   1.777 +			unsigned long offset, unsigned long nr_to_read);
   1.778 +int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
   1.779 +			unsigned long offset, unsigned long nr_to_read);
   1.780 +unsigned long  page_cache_readahead(struct address_space *mapping,
   1.781 +			  struct file_ra_state *ra,
   1.782 +			  struct file *filp,
   1.783 +			  unsigned long offset,
   1.784 +			  unsigned long size);
   1.785 +void handle_ra_miss(struct address_space *mapping, 
   1.786 +		    struct file_ra_state *ra, pgoff_t offset);
   1.787 +unsigned long max_sane_readahead(unsigned long nr);
   1.788 +
   1.789 +/* Do stack extension */
   1.790 +extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
   1.791 +
   1.792 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
   1.793 +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
   1.794 +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
   1.795 +					     struct vm_area_struct **pprev);
   1.796 +
   1.797 +/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
   1.798 +   NULL if none.  Assume start_addr < end_addr. */
   1.799 +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
   1.800 +{
   1.801 +	struct vm_area_struct * vma = find_vma(mm,start_addr);
   1.802 +
   1.803 +	if (vma && end_addr <= vma->vm_start)
   1.804 +		vma = NULL;
   1.805 +	return vma;
   1.806 +}
   1.807 +
   1.808 +static inline unsigned long vma_pages(struct vm_area_struct *vma)
   1.809 +{
   1.810 +	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
   1.811 +}
   1.812 +
   1.813 +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
   1.814 +
   1.815 +extern struct page * vmalloc_to_page(void *addr);
   1.816 +extern unsigned long vmalloc_to_pfn(void *addr);
   1.817 +extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
   1.818 +		int write);
   1.819 +extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
   1.820 +int remap_pfn_range(struct vm_area_struct *, unsigned long,
   1.821 +		unsigned long, unsigned long, pgprot_t);
   1.822 +/* Allow arch override for mapping of device and I/O (non-RAM) pages. */
   1.823 +#ifndef io_remap_pfn_range
   1.824 +#define io_remap_pfn_range remap_pfn_range
   1.825 +#endif
   1.826 +
   1.827 +#ifdef CONFIG_PROC_FS
   1.828 +void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
   1.829 +#else
   1.830 +static inline void __vm_stat_account(struct mm_struct *mm,
   1.831 +			unsigned long flags, struct file *file, long pages)
   1.832 +{
   1.833 +}
   1.834 +#endif /* CONFIG_PROC_FS */
   1.835 +
   1.836 +static inline void vm_stat_account(struct vm_area_struct *vma)
   1.837 +{
   1.838 +	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
   1.839 +							vma_pages(vma));
   1.840 +}
   1.841 +
   1.842 +static inline void vm_stat_unaccount(struct vm_area_struct *vma)
   1.843 +{
   1.844 +	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
   1.845 +							-vma_pages(vma));
   1.846 +}
   1.847 +
   1.848 +/* update per process rss and vm hiwater data */
   1.849 +extern void update_mem_hiwater(void);
   1.850 +
   1.851 +#ifndef CONFIG_DEBUG_PAGEALLOC
   1.852 +static inline void
   1.853 +kernel_map_pages(struct page *page, int numpages, int enable)
   1.854 +{
   1.855 +}
   1.856 +#endif
   1.857 +
   1.858 +extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
   1.859 +#ifdef	__HAVE_ARCH_GATE_AREA
   1.860 +int in_gate_area_no_task(unsigned long addr);
   1.861 +int in_gate_area(struct task_struct *task, unsigned long addr);
   1.862 +#else
   1.863 +int in_gate_area_no_task(unsigned long addr);
   1.864 +#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
   1.865 +#endif	/* __HAVE_ARCH_GATE_AREA */
   1.866 +
   1.867 +#endif /* __KERNEL__ */
   1.868 +#endif /* _LINUX_MM_H */
     2.1 --- a/linux-2.6.11-xen-sparse/mm/memory.c	Sun Jul 03 22:36:48 2005 +0000
     2.2 +++ b/linux-2.6.11-xen-sparse/mm/memory.c	Mon Jul 04 15:31:47 2005 +0000
     2.3 @@ -907,6 +907,24 @@ int get_user_pages(struct task_struct *t
     2.4  			continue;
     2.5  		}
     2.6  
     2.7 +                if (vma && (vma->vm_flags & VM_FOREIGN))
     2.8 +                {
     2.9 +                    struct page **map = vma->vm_private_data;
    2.10 +                    int offset = (start - vma->vm_start) >> PAGE_SHIFT;
    2.11 +
    2.12 +                    if (map[offset] != NULL) {
    2.13 +                        if (pages) {
    2.14 +                            pages[i] = map[offset];
    2.15 +                        } 
    2.16 +                        if (vmas) 
    2.17 +                            vmas[i] = vma;
    2.18 +                        i++;
    2.19 +                        start += PAGE_SIZE;
    2.20 +                        len--;
    2.21 +                        continue;
    2.22 +                    }
    2.23 +                }
    2.24 +
    2.25  		if (!vma || (vma->vm_flags & VM_IO)
    2.26  				|| !(flags & vma->vm_flags))
    2.27  			return i ? : -EFAULT;
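
Usage sketch (illustrative, not part of the patch): with the hunk above
applied, the direct-I/O path that motivated this change is just the ordinary
2.6.11 get_user_pages() call; for addresses inside a VM_FOREIGN vma the pages
are returned from the private map rather than from a page-table walk.
foo_map_user_buffer() is a hypothetical helper.

    #include <linux/mm.h>
    #include <linux/rwsem.h>
    #include <linux/sched.h>

    static int foo_map_user_buffer(unsigned long uaddr, int nr_pages,
                                   struct page **pages)
    {
            int got;

            down_read(&current->mm->mmap_sem);
            got = get_user_pages(current, current->mm, uaddr, nr_pages,
                                 1 /* write */, 0 /* force */, pages, NULL);
            up_read(&current->mm->mmap_sem);

            return got;     /* pages resolved, or -EFAULT if none */
    }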