ia64/linux-2.6.18-xen.hg: include/linux/mm.h @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits of blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
  the Linux dom0 command line, rather than being spawned in response
  to XenStore events. This is handy for debugging, makes blktap
  generally easier to work with, and is a step toward a generic
  user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
  request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
  allocations on the block data path, and IO retry logic to protect
  guests from transient block device failures. This has been tested
  and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
  tested, and represents a very mature implementation of the VHD
  image format.

* No more duplication of mechanism with blkback. The blktap kernel
  module has changed dramatically from the original blktap. Blkback
  is now always used to talk to Xen guests; blktap just presents a
  Linux gendisk that blkback can export. This is done while
  preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents 906b14e26f2f

#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/capability.h>

#ifdef __KERNEL__

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/prio_tree.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/debug_locks.h>

struct mempolicy;
struct anon_vma;

#ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
#endif

extern unsigned long num_physpages;
extern void * high_memory;
extern unsigned long vmalloc_earlyreserve;
extern int page_cluster;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>

#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, listed below. */

	struct rb_node vm_rb;

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap prio tree, or
	 * linkage to the list of like vmas hanging off its node, or
	 * linkage of vma in the address_space->i_mmap_nonlinear list.
	 */
	union {
		struct {
			struct list_head list;
			void *parent;	/* aligns with prio_tree_node parent */
			struct vm_area_struct *head;
		} vm_set;

		struct raw_prio_tree_node prio_tree_node;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.  A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_node;	/* Serialized by anon_vma->lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	struct vm_operations_struct * vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */
	unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
};

/*
 * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
 * disabled, then there's a single shared list of VMAs maintained by the
 * system, and mm's subscribe to these individually
 */
struct vm_list_struct {
	struct vm_list_struct	*next;
	struct vm_area_struct	*vma;
};

#ifndef CONFIG_MMU
extern struct rb_root nommu_vma_tree;
extern struct rw_semaphore nommu_vma_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags..
 */
#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#define VM_GROWSUP	0x00000200
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */

#define VM_EXECUTABLE	0x00001000
#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000	/* Memory mapped I/O or similar */

/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_RESERVED	0x00080000	/* Count as reserved_vm like IO */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
#ifdef CONFIG_XEN
#define VM_FOREIGN	0x04000000	/* Has pages belonging to another VM */
struct vm_foreign_map {
	struct page **map;
};
#endif
#define VM_ALWAYSDUMP	0x08000000	/* Always include in core dumps */

#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK_FLAGS	(VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#else
#define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#endif

#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask..
 */
extern pgprot_t protection_map[16];

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
	/* Area-specific function for clearing the PTE at @ptep. Returns the
	 * original value of @ptep. */
	pte_t (*zap_pte)(struct vm_area_struct *vma,
			 unsigned long addr, pte_t *ptep, int is_fullmm);

	/* called before close() to indicate no more pages should be mapped */
	void (*unmap)(struct vm_area_struct *area);

#ifdef CONFIG_NUMA
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		const nodemask_t *to, unsigned long flags);
#endif
};
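
/*
 * Illustrative sketch (hypothetical driver code, not part of this header):
 * a minimal set of vm_operations serving pages from a driver-private page
 * array via ->nopage.  Names such as mydrv_vm_nopage, mydrv_pages and
 * mydrv_nr_pages are invented for the example.
 *
 *	static struct page *mydrv_vm_nopage(struct vm_area_struct *area,
 *					    unsigned long address, int *type)
 *	{
 *		unsigned long pgoff = area->vm_pgoff +
 *				((address - area->vm_start) >> PAGE_SHIFT);
 *		struct page *page;
 *
 *		if (pgoff >= mydrv_nr_pages)
 *			return NOPAGE_SIGBUS;
 *		page = mydrv_pages[pgoff];
 *		get_page(page);
 *		if (type)
 *			*type = VM_FAULT_MINOR;
 *		return page;
 *	}
 *
 *	static struct vm_operations_struct mydrv_vm_ops = {
 *		.nopage	= mydrv_vm_nopage,
 *	};
 *
 * The fault path takes over the returned reference, so ->nopage must hand
 * back a page with an elevated count, as filemap_nopage() does.
 */
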
struct mmu_gather;
struct inode;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page.
 */
struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	atomic_t _count;		/* Usage count, see below. */
	atomic_t _mapcount;		/* Count of ptes mapped in mms,
					 * to show when page is mapped
					 * & limit reverse map searches.
					 */
	union {
	    struct {
		unsigned long private;		/* Mapping-private opaque data:
						 * usually used for buffer_heads
						 * if PagePrivate set; used for
						 * swp_entry_t if PageSwapCache;
						 * indicates order in the buddy
						 * system if PG_buddy is set.
						 */
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below.
						 */
	    };
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	    spinlock_t ptl;
#endif
	};
	pgoff_t index;			/* Our offset within mapping. */
	struct list_head lru;		/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 */
	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
};

#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))

/*
 * FIXME: take this include out, include page-flags.h in
 * files which need it (119 of them)
 */
#include <linux/page-flags.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the logical refcount fell to zero (the page has
 * no users)
 */
static inline int put_page_testzero(struct page *page)
{
	BUG_ON(atomic_read(&page->_count) == 0);
	return atomic_dec_and_test(&page->_count);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 */
static inline int get_page_unless_zero(struct page *page)
{
	return atomic_inc_not_zero(&page->_count);
}

extern void FASTCALL(__page_cache_release(struct page *));

static inline int page_count(struct page *page)
{
	if (unlikely(PageCompound(page)))
		page = (struct page *)page_private(page);
	return atomic_read(&page->_count);
}

static inline void get_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		page = (struct page *)page_private(page);
	atomic_inc(&page->_count);
}

/*
 * Setup the page count before being freed into the page allocator for
 * the first time (boot or memory hotplug)
 */
static inline void init_page_count(struct page *page)
{
	atomic_set(&page->_count, 1);
}

void put_page(struct page *page);
void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);
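
/*
 * Illustrative sketch (usage pattern, not a definition from this header):
 * the usual way these helpers are combined is to pin a page for as long
 * as it is being touched,
 *
 *	get_page(page);		take a reference so the page cannot be freed
 *	... use the page, possibly sleeping ...
 *	put_page(page);		drop it; the final put releases the page
 *				back to the page allocator
 *
 * while get_page_unless_zero() is the speculative variant for callers,
 * such as reclaim, that only hold a bare pointer and must not resurrect
 * a page whose reference count has already reached zero.
 */
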
/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() == 1 means the page is used for exactly one purpose
 *   (e.g. a private data page of one process).
 *
 * A page may be used for kmalloc() or anyone else who does a
 * __get_free_page(). In this case the page_count() is at least 1, and
 * all other fields are unused but should be 0 or NULL. The
 * management of this page is the responsibility of the one who uses
 * it.
 *
 * The other pages (we may call them "process pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A page may belong to an inode's memory mapping. In this case,
 * page->mapping is the pointer to the inode, and page->index is the
 * file offset of the page, in units of PAGE_CACHE_SIZE.
 *
 * A page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular
 * list of the page's disk buffers.
 *
 * For pages belonging to inodes, the page_count() is the number of
 * attaches, plus 1 if `private' contains something, plus one for
 * the page cache itself.
 *
 * Instead of keeping dirty/clean pages in per address-space lists, we instead
 * now tag pages as dirty/under writeback in the radix tree.
 *
 * There is also a per-mapping radix tree mapping index to the page
 * in memory if present. The tree is rooted at mapping->root.
 *
 * All process pages can do I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written to disk,
 * - private pages which have been modified may need to be swapped out
 *   to swap space and (later) to be read back into memory.
 */

/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/*
 * page->flags layout:
 *
 * There are three possibilities for how page->flags get
 * laid out.  The first is for the normal case, without
 * sparsemem.  The second is for sparsemem when there is
 * plenty of space for node and section.  The last is when
 * we have run out of space and have to fall back to an
 * alternate (slower) way of determining the node.
 *
 *        No sparsemem: |       NODE     | ZONE | ... | FLAGS |
 * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
 *   no space for node: | SECTION |     ZONE    | ... | FLAGS |
 */
#ifdef CONFIG_SPARSEMEM
#define SECTIONS_WIDTH		SECTIONS_SHIFT
#else
#define SECTIONS_WIDTH		0
#endif

#define ZONES_WIDTH		ZONES_SHIFT

#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
#define NODES_WIDTH		NODES_SHIFT
#else
#define NODES_WIDTH		0
#endif

/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)

/*
 * We are going to use the flags for the page to node mapping if its in
 * there.  This includes the case where there is no node, so it is implicit.
 */
#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)

#ifndef PFN_SECTION_SHIFT
#define PFN_SECTION_SHIFT 0
#endif

/*
 * Define the bit shifts to access each section.  For non-existant
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
#if FLAGS_HAS_NODE
#define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#else
#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#endif
#define ZONETABLE_PGSHIFT	ZONES_PGSHIFT

#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#endif

#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
#define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
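
/*
 * Worked example (illustrative only; the real widths depend on the
 * architecture and configuration): on a 64-bit build without sparsemem,
 * SECTIONS_WIDTH is 0, so with, say, NODES_SHIFT == 6 and ZONES_SHIFT == 2
 * the fields pack as
 *
 *	NODES_PGOFF = 64 - 0 - 6 = 58	node id in bits 63..58
 *	ZONES_PGOFF = 58 - 2     = 56	zone index in bits 57..56
 *
 * leaving the low bits of page->flags free for the PG_* flag bits; the
 * accessors below simply shift by the *_PGSHIFT values and mask with the
 * *_MASK values to get the fields back out.
 */
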
static inline unsigned long page_zonenum(struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

struct zone;
extern struct zone *zone_table[];

static inline int page_zone_id(struct page *page)
{
	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
}
static inline struct zone *page_zone(struct page *page)
{
	return zone_table[page_zone_id(page)];
}

static inline unsigned long page_to_nid(struct page *page)
{
	if (FLAGS_HAS_NODE)
		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
	else
		return page_zone(page)->zone_pgdat->node_id;
}
static inline unsigned long page_to_section(struct page *page)
{
	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}

static inline void set_page_zone(struct page *page, unsigned long zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_section(struct page *page, unsigned long section)
{
	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

static inline void set_page_links(struct page *page, unsigned long zone,
	unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
	set_page_section(page, pfn_to_section_nr(pfn));
}

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
#endif

static __always_inline void *lowmem_page_address(struct page *page)
{
	return __va(page_to_pfn(page) << PAGE_SHIFT);
}

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
#define page_address(page) ((page)->virtual)
#define set_page_address(page, address)			\
	do {						\
		(page)->virtual = (address);		\
	} while(0)
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 */
#define PAGE_MAPPING_ANON	1

extern struct address_space swapper_space;
static inline struct address_space *page_mapping(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (unlikely(PageSwapCache(page)))
		mapping = &swapper_space;
	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
		mapping = NULL;
	return mapping;
}

static inline int PageAnon(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
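
/*
 * Illustrative sketch of the encoding described above (the assignment
 * lives in the rmap code, not in this header): when an anonymous page is
 * added to an anon_vma, its ->mapping is set to a tagged pointer, roughly
 *
 *	page->mapping = (struct address_space *)
 *			((unsigned long)anon_vma + PAGE_MAPPING_ANON);
 *
 * so PageAnon() only needs to test the low bit, and page_mapping()
 * deliberately returns NULL for such pages rather than handing out the
 * anon_vma pointer as if it were an address_space.
 */
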
/*
 * Return the pagecache index of the passed page.  Regular pagecache pages
 * use ->index whereas swapcache pages use ->private
 */
static inline pgoff_t page_index(struct page *page)
{
	if (unlikely(PageSwapCache(page)))
		return page_private(page);
	return page->index;
}

/*
 * The atomic page->_mapcount, like _count, starts from -1:
 * so that transitions both from it and to it can be tracked,
 * using atomic_inc_and_test and atomic_add_negative(-1).
 */
static inline void reset_page_mapcount(struct page *page)
{
	atomic_set(&(page)->_mapcount, -1);
}

static inline int page_mapcount(struct page *page)
{
	return atomic_read(&(page)->_mapcount) + 1;
}

/*
 * Return true if this page is mapped into pagetables.
 */
static inline int page_mapped(struct page *page)
{
	return atomic_read(&(page)->_mapcount) >= 0;
}

/*
 * Error return values for the *_nopage functions
 */
#define NOPAGE_SIGBUS	(NULL)
#define NOPAGE_OOM	((struct page *) (-1))

/*
 * Different kinds of faults, as returned by handle_mm_fault().
 * Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 */
#define VM_FAULT_OOM	0x00
#define VM_FAULT_SIGBUS	0x01
#define VM_FAULT_MINOR	0x02
#define VM_FAULT_MAJOR	0x03

/*
 * Special case for get_user_pages.
 * Must be in a distinct bit from the above VM_FAULT_ flags.
 */
#define VM_FAULT_WRITE	0x10

#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

extern void show_free_areas(void);

#ifdef CONFIG_SHMEM
struct page *shmem_nopage(struct vm_area_struct *vma,
			unsigned long address, int *type);
int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new);
struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					unsigned long addr);
int shmem_lock(struct file *file, int lock, struct user_struct *user);
#else
#define shmem_nopage filemap_nopage

static inline int shmem_lock(struct file *file, int lock,
			     struct user_struct *user)
{
	return 0;
}

static inline int shmem_set_policy(struct vm_area_struct *vma,
				   struct mempolicy *new)
{
	return 0;
}

static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
						 unsigned long addr)
{
	return NULL;
}
#endif
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);

int shmem_zero_setup(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern unsigned long shmem_get_unmapped_area(struct file *file,
					     unsigned long addr,
					     unsigned long len,
					     unsigned long pgoff,
					     unsigned long flags);
#endif

static inline int can_do_mlock(void)
{
	if (capable(CAP_IPC_LOCK))
		return 1;
	if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
		return 1;
	return 0;
}
extern int user_shm_lock(size_t, struct user_struct *);
extern void user_shm_unlock(size_t, struct user_struct *);

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
	struct address_space *check_mapping;	/* Check page->mapping if set */
	pgoff_t	first_index;			/* Lowest page->index to unmap */
	pgoff_t last_index;			/* Highest page->index to unmap */
	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
	unsigned long truncate_count;		/* Compare vm_truncate_count */
};

struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *);
unsigned long unmap_vmas(struct mmu_gather **tlb,
		struct vm_area_struct *start_vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
		unsigned long end, unsigned long floor, unsigned long ceiling);
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
		unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma);
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
			unsigned long size, pgprot_t prot);
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows);

static inline void unmap_shared_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen)
{
	unmap_mapping_range(mapping, holebegin, holelen, 0);
}

extern int vmtruncate(struct inode * inode, loff_t offset);
extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);

#ifdef CONFIG_MMU
extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
			unsigned long address, int write_access);

static inline int handle_mm_fault(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			int write_access)
{
	return __handle_mm_fault(mm, vma, address, write_access) &
				(~VM_FAULT_WRITE);
}
#else
static inline int handle_mm_fault(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			int write_access)
{
	/* should never happen if there's no MMU */
	BUG();
	return VM_FAULT_SIGBUS;
}
#endif
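
/*
 * Illustrative sketch (simplified from the architecture page-fault
 * handlers, not defined here): callers typically dispatch on the
 * handle_mm_fault() return value like this,
 *
 *	switch (handle_mm_fault(mm, vma, address, write)) {
 *	case VM_FAULT_MINOR:
 *		current->min_flt++;
 *		break;
 *	case VM_FAULT_MAJOR:
 *		current->maj_flt++;
 *		break;
 *	case VM_FAULT_SIGBUS:
 *		goto do_sigbus;
 *	case VM_FAULT_OOM:
 *		goto out_of_memory;
 *	default:
 *		BUG();
 *	}
 *
 * which is why the OOM/SIGBUS/MINOR/MAJOR values above must stay distinct.
 */
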
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);

int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
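
/*
 * Illustrative sketch (hypothetical caller, not part of this header):
 * pinning one user page for writing and releasing it again.  uaddr is an
 * example user virtual address; mmap_sem must be held across the call:
 *
 *	struct page *page;
 *	int ret;
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
 *			     1, 1, 0, &page, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (ret == 1) {
 *		... access the page, e.g. via kmap() ...
 *		set_page_dirty_lock(page);
 *		put_page(page);
 *	}
 *
 * Every page successfully returned carries a reference that the caller
 * must drop with put_page() when done.
 */
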
void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);

int __set_page_dirty_buffers(struct page *page);
int __set_page_dirty_nobuffers(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
				struct page *page);
int FASTCALL(set_page_dirty(struct page *page));
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);

extern unsigned long do_mremap(unsigned long addr,
			       unsigned long old_len, unsigned long new_len,
			       unsigned long flags, unsigned long new_addr);

/*
 * Prototype to add a shrinker callback for ageable caches.
 *
 * These functions are passed a count `nr_to_scan' and a gfpmask.  They should
 * scan `nr_to_scan' objects, attempting to free them.
 *
 * The callback must return the number of objects which remain in the cache.
 *
 * The callback will be passed nr_to_scan == 0 when the VM is querying the
 * cache size, so a fastpath for that case is appropriate.
 */
typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask);

/*
 * Add an aging callback.  The int is the number of 'seeks' it takes
 * to recreate one of the objects that these functions age.
 */

#define DEFAULT_SEEKS 2
struct shrinker;
extern struct shrinker *set_shrinker(int, shrinker_t);
extern void remove_shrinker(struct shrinker *shrinker);
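
/*
 * Illustrative sketch (hypothetical cache, not part of this header): a
 * shrinker callback following the protocol described above, where
 * mycache_count and mycache_prune() are invented names:
 *
 *	static int mycache_shrink(int nr_to_scan, gfp_t gfp_mask)
 *	{
 *		if (nr_to_scan)
 *			mycache_prune(nr_to_scan, gfp_mask);
 *		return mycache_count;	objects left in the cache
 *	}
 *
 * registered once with
 *
 *	shrinker = set_shrinker(DEFAULT_SEEKS, mycache_shrink);
 *
 * and torn down again with remove_shrinker(shrinker).
 */
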
extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));

int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);

/*
 * The following ifdef needed to get the 4level-fixup.h header to work.
 * Remove it when 4level-fixup.h has been removed.
 */
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
		NULL: pud_offset(pgd, address);
}

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
		NULL: pmd_offset(pud, address);
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/*
 * We tuck a spinlock to guard each pagetable page into its struct page,
 * at page->private, with BUILD_BUG_ON to make sure that this will not
 * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
 * When freeing, reset page->mapping so free_pages_check won't complain.
 */
#define __pte_lockptr(page)	&((page)->ptl)
#define pte_lock_init(_page)	do {					\
	spin_lock_init(__pte_lockptr(_page));				\
} while (0)
#define pte_lock_deinit(page)	((page)->mapping = NULL)
#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 */
#define pte_lock_init(page)	do {} while (0)
#define pte_lock_deinit(page)	do {} while (0)
#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */

#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
({							\
	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
	pte_t *__pte = pte_offset_map(pmd, address);	\
	*(ptlp) = __ptl;				\
	spin_lock(__ptl);				\
	__pte;						\
})

#define pte_unmap_unlock(pte, ptl)	do {		\
	spin_unlock(ptl);				\
	pte_unmap(pte);					\
} while (0)

#define pte_alloc_map(mm, pmd, address)			\
	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
		NULL: pte_offset_map(pmd, address))

#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))

#define pte_alloc_kernel(pmd, address)			\
	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
		NULL: pte_offset_kernel(pmd, address))
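
/*
 * Illustrative sketch (simplified, in the style of the pte walkers in
 * mm/memory.c rather than defined here): the usual pattern for touching
 * a single pte under the correct page-table lock is
 *
 *	spinlock_t *ptl;
 *	pte_t *pte;
 *
 *	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 *	if (pte_present(*pte)) {
 *		... examine or modify the pte ...
 *	}
 *	pte_unmap_unlock(pte, ptl);
 *
 * pte_offset_map_lock() resolves to either the per-page ptl or
 * mm->page_table_lock depending on CONFIG_SPLIT_PTLOCK_CPUS, so callers
 * never need to know which lock is actually in use.
 */
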
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, pg_data_t *pgdat,
	unsigned long * zones_size, unsigned long zone_start_pfn,
	unsigned long *zholes_size);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
extern void setup_per_zone_pages_min(void);
extern void mem_init(void);
extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
#else
static inline void setup_per_cpu_pageset(void) {}
#endif

/* prio_tree.c */
void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
	struct prio_tree_iter *iter);

#define vma_prio_tree_foreach(vma, iter, root, begin, end)	\
	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
		(vma = vma_prio_tree_next(vma, iter)); )
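
/*
 * Illustrative sketch (simplified from the reverse-map users of this
 * macro, not defined here): walking every vma that maps a given range of
 * pages of a file, with the mapping's i_mmap lock held:
 *
 *	struct vm_area_struct *vma;
 *	struct prio_tree_iter iter;
 *
 *	spin_lock(&mapping->i_mmap_lock);
 *	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
 *			      first_pgoff, last_pgoff) {
 *		... vma maps at least one page in [first_pgoff, last_pgoff] ...
 *	}
 *	spin_unlock(&mapping->i_mmap_lock);
 *
 * first_pgoff and last_pgoff are example bounds in PAGE_SIZE units.
 */
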
static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
					struct list_head *list)
{
	vma->shared.vm_set.parent = NULL;
	list_add_tail(&vma->shared.vm_set.list, list);
}

/* mmap.c */
extern int __vm_enough_memory(long pages, int cap_sys_admin);
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
extern struct vm_area_struct *vma_merge(struct mm_struct *,
	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
	struct mempolicy *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
	struct vm_area_struct *, unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
	struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
	unsigned long addr, unsigned long len, pgoff_t pgoff);
extern void exit_mmap(struct mm_struct *);
extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff);

static inline unsigned long do_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	unsigned long ret = -EINVAL;
	if ((offset + PAGE_ALIGN(len)) < offset)
		goto out;
	if (!(offset & ~PAGE_MASK))
		ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
out:
	return ret;
}

extern int do_munmap(struct mm_struct *, unsigned long, size_t);

extern unsigned long do_brk(unsigned long, unsigned long);

/* filemap.c */
extern unsigned long page_unuse(struct page *);
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
				       loff_t lstart, loff_t lend);

/* generic vm_area_ops exported for stackable file systems */
extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
extern int filemap_populate(struct vm_area_struct *, unsigned long,
		unsigned long, pgprot_t, unsigned long, int);

/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);

/* readahead.c */
#define VM_MAX_READAHEAD	128	/* kbytes */
#define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
#define VM_MAX_CACHE_HIT	256	/* max pages in a row in cache before
					 * turning readahead off */

int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read);
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read);
unsigned long page_cache_readahead(struct address_space *mapping,
			  struct file_ra_state *ra,
			  struct file *filp,
			  pgoff_t offset,
			  unsigned long size);
void handle_ra_miss(struct address_space *mapping,
		    struct file_ra_state *ra, pgoff_t offset);
unsigned long max_sane_readahead(unsigned long nr);

/* Do stack extension */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
#ifdef CONFIG_IA64
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#endif

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
					     struct vm_area_struct **pprev);

/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
   NULL if none.  Assume start_addr < end_addr. */
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
{
	struct vm_area_struct * vma = find_vma(mm,start_addr);

	if (vma && end_addr <= vma->vm_start)
		vma = NULL;
	return vma;
}

static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}

1032 struct page *vmalloc_to_page(void *addr);
1033 unsigned long vmalloc_to_pfn(void *addr);
1034 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
1035 unsigned long pfn, unsigned long size, pgprot_t);
1036 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
1038 struct page *follow_page(struct vm_area_struct *, unsigned long address,
1039 unsigned int foll_flags);
1040 #define FOLL_WRITE 0x01 /* check pte is writable */
1041 #define FOLL_TOUCH 0x02 /* mark page accessed */
1042 #define FOLL_GET 0x04 /* do get_page on page */
1043 #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
1045 #ifdef CONFIG_XEN
1046 typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
1047 void *data);
1048 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
1049 unsigned long size, pte_fn_t fn, void *data);
1050 #endif
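
/*
 * Illustrative sketch (hypothetical callback, not part of this header):
 * apply_to_page_range() walks, and if necessary allocates, the page
 * tables covering [address, address + size) and calls the callback on
 * every pte with the page-table lock held; this is how the Xen code
 * batches pte updates.  A trivial caller might look like
 *
 *	static int count_present(pte_t *pte, struct page *pmd_page,
 *				 unsigned long addr, void *data)
 *	{
 *		if (pte_present(*pte))
 *			(*(unsigned long *)data)++;
 *		return 0;
 *	}
 *
 *	unsigned long present = 0;
 *	apply_to_page_range(mm, start, nr_pages * PAGE_SIZE,
 *			    count_present, &present);
 *
 * A nonzero return from the callback aborts the walk and is propagated
 * back to the caller of apply_to_page_range().
 */
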
#ifdef CONFIG_PROC_FS
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
#else
static inline void vm_stat_account(struct mm_struct *mm,
			unsigned long flags, struct file *file, long pages)
{
}
#endif /* CONFIG_PROC_FS */

#ifndef CONFIG_DEBUG_PAGEALLOC
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (!PageHighMem(page) && !enable)
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
}
#endif

extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
#ifdef __HAVE_ARCH_GATE_AREA
int in_gate_area_no_task(unsigned long addr);
int in_gate_area(struct task_struct *task, unsigned long addr);
#else
int in_gate_area_no_task(unsigned long addr);
#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
#endif	/* __HAVE_ARCH_GATE_AREA */

/* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
#define OOM_DISABLE -17

int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages);
void drop_pagecache(void);
void drop_slab(void);

#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char *arch_vma_name(struct vm_area_struct *vma);

#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */