ia64/linux-2.6.18-xen.hg

mm/filemap.c @ 907:cad6f60f0506

Transcendent memory ("tmem") for Linux

Tmem, when called from a tmem-capable (paravirtualized) guest, makes
use of otherwise unutilized ("fallow") memory to create and manage
pools of pages that can be accessed from the guest either as
"ephemeral" pages or as "persistent" pages. In either case, the pages
are not directly addressable by the guest; they can only be copied to
and from the pools via the tmem interface. Ephemeral pages are a nice
place for a guest to put recently evicted clean pages that it might
need again; these pages can be reclaimed synchronously by Xen for
other guests or other uses. Persistent pages are a nice place for a
guest to put "swap" pages to avoid sending them to disk. These pages
retain their data for as long as the guest lives, but they count
against the guest's memory allocation.
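
As a rough illustration of those two semantics, the sketch below uses
hypothetical names (tmem_put_page, tmem_get_page, EPHEMERAL_POOL,
PERSISTENT_POOL) as stand-ins for the real put/get hypercall wrappers;
it is not code from this patch.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical pool handles and hypercall wrappers, for illustration only. */
#define EPHEMERAL_POOL		0
#define PERSISTENT_POOL		1
extern int tmem_put_page(int pool, pgoff_t index, struct page *page);
extern int tmem_get_page(int pool, pgoff_t index, struct page *page);

/* Ephemeral: copy an evicted clean page into the pool.  Xen may reclaim
 * the pool page at any moment, so the put is best-effort and the data
 * must still be recoverable from disk. */
static void cache_evicted_page(struct page *page)
{
	(void)tmem_put_page(EPHEMERAL_POOL, page->index, page);
}

/* On a later read, try the ephemeral pool first; a miss is normal and
 * simply falls back to the filesystem's readpage path. */
static int read_page_with_precache(struct file *filp, struct page *page)
{
	if (tmem_get_page(EPHEMERAL_POOL, page->index, page) == 0)
		return 0;		/* hit: disk I/O avoided */
	return page->mapping->a_ops->readpage(filp, page);
}

/* Persistent: copy a would-be swap page into the pool.  The data is kept
 * for the life of the guest, but the pool page counts against the guest's
 * memory allocation. */
static int stash_swap_page(struct page *page)
{
	return tmem_put_page(PERSISTENT_POOL, page->index, page);
}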

This patch contains the Linux paravirtualization changes that
complement the tmem Xen patch (xen-unstable c/s 19646). It
implements "precache" (ext3 only as of now), "preswap", and
limited "shared precache" (ocfs2 only as of now) support.
CONFIG options are required to turn the support on (but in this
patch they default to "y"). If the underlying Xen does not have
tmem support, or has it turned off, this is sensed early so that
nearly all hypercalls are avoided.
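
A minimal sketch of that early-sensing pattern is shown below. The
names (tmem_probe, tmem_enabled, example_precache_hook) are
assumptions for illustration only; the real guest-side hooks added by
this patch, such as precache_put() and precache_flush(), appear in
__remove_from_page_cache() further down.

#include <linux/fs.h>
#include <linux/mm.h>

/* All names below are illustrative assumptions, not symbols from the patch. */
extern int tmem_probe(void);	/* asks Xen once whether tmem is usable */

static int tmem_enabled = -1;	/* -1 = not probed yet */

static inline int tmem_available(void)
{
	if (tmem_enabled < 0)
		tmem_enabled = tmem_probe();	/* at most one probe hypercall */
	return tmem_enabled;			/* afterwards: plain memory read */
}

/* Shape of a guest-side hook: bail out before building the hypercall, so a
 * non-tmem Xen costs little more than one predictable branch per call. */
static void example_precache_hook(struct address_space *mapping,
				  unsigned long index, struct page *page)
{
	if (!tmem_available())
		return;			/* Xen lacks tmem or has it turned off */
	/* ... otherwise marshal mapping/index/page and issue the hypercall ... */
}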

Lots of useful prose about tmem can be found at
http://oss.oracle.com/projects/tmem

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 18 10:24:18 2009 +0100
parents 831230e53067

line source
1 /*
2 * linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/compiler.h>
15 #include <linux/fs.h>
16 #include <linux/uaccess.h>
17 #include <linux/aio.h>
18 #include <linux/capability.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/mman.h>
23 #include <linux/pagemap.h>
24 #include <linux/file.h>
25 #include <linux/uio.h>
26 #include <linux/hash.h>
27 #include <linux/writeback.h>
28 #include <linux/pagevec.h>
29 #include <linux/blkdev.h>
30 #include <linux/security.h>
31 #include <linux/syscalls.h>
32 #include <linux/cpuset.h>
33 #include <linux/precache.h>
34 #include "filemap.h"
35 #include "internal.h"
37 /*
38 * FIXME: remove all knowledge of the buffer layer from the core VM
39 */
40 #include <linux/buffer_head.h> /* for generic_osync_inode */
42 #include <asm/mman.h>
44 static ssize_t
45 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
46 loff_t offset, unsigned long nr_segs);
48 /*
49 * Shared mappings implemented 30.11.1994. It's not fully working yet,
50 * though.
51 *
52 * Shared mappings now work. 15.8.1995 Bruno.
53 *
54 * finished 'unifying' the page and buffer cache and SMP-threaded the
55 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
56 *
57 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
58 */
60 /*
61 * Lock ordering:
62 *
63 * ->i_mmap_lock (vmtruncate)
64 * ->private_lock (__free_pte->__set_page_dirty_buffers)
65 * ->swap_lock (exclusive_swap_page, others)
66 * ->mapping->tree_lock
67 *
68 * ->i_mutex
69 * ->i_mmap_lock (truncate->unmap_mapping_range)
70 *
71 * ->mmap_sem
72 * ->i_mmap_lock
73 * ->page_table_lock or pte_lock (various, mainly in memory.c)
74 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
75 *
76 * ->mmap_sem
77 * ->lock_page (access_process_vm)
78 *
79 * ->mmap_sem
80 * ->i_mutex (msync)
81 *
82 * ->i_mutex
83 * ->i_alloc_sem (various)
84 *
85 * ->inode_lock
86 * ->sb_lock (fs/fs-writeback.c)
87 * ->mapping->tree_lock (__sync_single_inode)
88 *
89 * ->i_mmap_lock
90 * ->anon_vma.lock (vma_adjust)
91 *
92 * ->anon_vma.lock
93 * ->page_table_lock or pte_lock (anon_vma_prepare and various)
94 *
95 * ->page_table_lock or pte_lock
96 * ->swap_lock (try_to_unmap_one)
97 * ->private_lock (try_to_unmap_one)
98 * ->tree_lock (try_to_unmap_one)
99 * ->zone.lru_lock (follow_page->mark_page_accessed)
100 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
101 * ->private_lock (page_remove_rmap->set_page_dirty)
102 * ->tree_lock (page_remove_rmap->set_page_dirty)
103 * ->inode_lock (page_remove_rmap->set_page_dirty)
104 * ->inode_lock (zap_pte_range->set_page_dirty)
105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
106 *
107 * ->task->proc_lock
108 * ->dcache_lock (proc_pid_lookup)
109 */
111 /*
112 * Remove a page from the page cache and free it. Caller has to make
113 * sure the page is locked and that nobody else uses it - or that usage
114 * is safe. The caller must hold a write_lock on the mapping's tree_lock.
115 */
116 void __remove_from_page_cache(struct page *page)
117 {
118 struct address_space *mapping = page->mapping;
120 /*
121 * if we're uptodate, flush out into the precache, otherwise
122 * invalidate any existing precache entries. We can't leave
123 * stale data around in the precache once our page is gone
124 */
125 if (PageUptodate(page))
126 precache_put(page->mapping, page->index, page);
127 else
128 precache_flush(page->mapping, page->index);
130 radix_tree_delete(&mapping->page_tree, page->index);
131 page->mapping = NULL;
132 mapping->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES);
134 }
136 void remove_from_page_cache(struct page *page)
137 {
138 struct address_space *mapping = page->mapping;
140 BUG_ON(!PageLocked(page));
142 write_lock_irq(&mapping->tree_lock);
143 __remove_from_page_cache(page);
144 write_unlock_irq(&mapping->tree_lock);
145 }
147 static int sync_page(void *word)
148 {
149 struct address_space *mapping;
150 struct page *page;
152 page = container_of((unsigned long *)word, struct page, flags);
154 /*
155 * page_mapping() is being called without PG_locked held.
156 * Some knowledge of the state and use of the page is used to
157 * reduce the requirements down to a memory barrier.
158 * The danger here is of a stale page_mapping() return value
159 * indicating a struct address_space different from the one it's
160 * associated with when it is associated with one.
161 * After smp_mb(), it's either the correct page_mapping() for
162 * the page, or an old page_mapping() and the page's own
163 * page_mapping() has gone NULL.
164 * The ->sync_page() address_space operation must tolerate
165 * page_mapping() going NULL. By an amazing coincidence,
166 * this comes about because none of the users of the page
167 * in the ->sync_page() methods make essential use of the
168 * page_mapping(), merely passing the page down to the backing
169 * device's unplug functions when it's non-NULL, which in turn
170 * ignore it for all cases but swap, where only page_private(page) is
171 * of interest. When page_mapping() does go NULL, the entire
172 * call stack gracefully ignores the page and returns.
173 * -- wli
174 */
175 smp_mb();
176 mapping = page_mapping(page);
177 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
178 mapping->a_ops->sync_page(page);
179 io_schedule();
180 return 0;
181 }
183 /**
184 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
185 * @mapping: address space structure to write
186 * @start: offset in bytes where the range starts
187 * @end: offset in bytes where the range ends (inclusive)
188 * @sync_mode: enable synchronous operation
189 *
190 * Start writeback against all of a mapping's dirty pages that lie
191 * within the byte offsets <start, end> inclusive.
192 *
193 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
194 * opposed to a regular memory cleansing writeback. The difference between
195 * these two operations is that if a dirty page/buffer is encountered, it must
196 * be waited upon, and not just skipped over.
197 */
198 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
199 loff_t end, int sync_mode)
200 {
201 int ret;
202 struct writeback_control wbc = {
203 .sync_mode = sync_mode,
204 .nr_to_write = mapping->nrpages * 2,
205 .range_start = start,
206 .range_end = end,
207 };
209 if (!mapping_cap_writeback_dirty(mapping))
210 return 0;
212 ret = do_writepages(mapping, &wbc);
213 return ret;
214 }
216 static inline int __filemap_fdatawrite(struct address_space *mapping,
217 int sync_mode)
218 {
219 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
220 }
222 int filemap_fdatawrite(struct address_space *mapping)
223 {
224 return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
225 }
226 EXPORT_SYMBOL(filemap_fdatawrite);
228 static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
229 loff_t end)
230 {
231 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
232 }
234 /**
235 * filemap_flush - mostly a non-blocking flush
236 * @mapping: target address_space
237 *
238 * This is a mostly non-blocking flush. Not suitable for data-integrity
239 * purposes - I/O may not be started against all dirty pages.
240 */
241 int filemap_flush(struct address_space *mapping)
242 {
243 return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
244 }
245 EXPORT_SYMBOL(filemap_flush);
247 /**
248 * wait_on_page_writeback_range - wait for writeback to complete
249 * @mapping: target address_space
250 * @start: beginning page index
251 * @end: ending page index
252 *
253 * Wait for writeback to complete against pages indexed by start->end
254 * inclusive
255 */
256 int wait_on_page_writeback_range(struct address_space *mapping,
257 pgoff_t start, pgoff_t end)
258 {
259 struct pagevec pvec;
260 int nr_pages;
261 int ret = 0;
262 pgoff_t index;
264 if (end < start)
265 return 0;
267 pagevec_init(&pvec, 0);
268 index = start;
269 while ((index <= end) &&
270 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
271 PAGECACHE_TAG_WRITEBACK,
272 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
273 unsigned i;
275 for (i = 0; i < nr_pages; i++) {
276 struct page *page = pvec.pages[i];
278 /* until radix tree lookup accepts end_index */
279 if (page->index > end)
280 continue;
282 wait_on_page_writeback(page);
283 if (PageError(page))
284 ret = -EIO;
285 }
286 pagevec_release(&pvec);
287 cond_resched();
288 }
290 /* Check for outstanding write errors */
291 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
292 ret = -ENOSPC;
293 if (test_and_clear_bit(AS_EIO, &mapping->flags))
294 ret = -EIO;
296 return ret;
297 }
299 /**
300 * sync_page_range - write and wait on all pages in the passed range
301 * @inode: target inode
302 * @mapping: target address_space
303 * @pos: beginning offset in pages to write
304 * @count: number of bytes to write
305 *
306 * Write and wait upon all the pages in the passed range. This is a "data
307 * integrity" operation. It waits upon in-flight writeout before starting and
308 * waiting upon new writeout. If there was an IO error, return it.
309 *
310 * We need to re-take i_mutex during the generic_osync_inode list walk because
311 * it is otherwise livelockable.
312 */
313 int sync_page_range(struct inode *inode, struct address_space *mapping,
314 loff_t pos, loff_t count)
315 {
316 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
317 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
318 int ret;
320 if (!mapping_cap_writeback_dirty(mapping) || !count)
321 return 0;
322 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
323 if (ret == 0) {
324 mutex_lock(&inode->i_mutex);
325 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
326 mutex_unlock(&inode->i_mutex);
327 }
328 if (ret == 0)
329 ret = wait_on_page_writeback_range(mapping, start, end);
330 return ret;
331 }
332 EXPORT_SYMBOL(sync_page_range);
334 /**
335 * sync_page_range_nolock
336 * @inode: target inode
337 * @mapping: target address_space
338 * @pos: beginning offset in pages to write
339 * @count: number of bytes to write
340 *
341 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
342 * as it forces O_SYNC writers to different parts of the same file
343 * to be serialised right until io completion.
344 */
345 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
346 loff_t pos, loff_t count)
347 {
348 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
349 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
350 int ret;
352 if (!mapping_cap_writeback_dirty(mapping) || !count)
353 return 0;
354 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
355 if (ret == 0)
356 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
357 if (ret == 0)
358 ret = wait_on_page_writeback_range(mapping, start, end);
359 return ret;
360 }
361 EXPORT_SYMBOL(sync_page_range_nolock);
363 /**
364 * filemap_fdatawait - wait for all under-writeback pages to complete
365 * @mapping: address space structure to wait for
366 *
367 * Walk the list of under-writeback pages of the given address space
368 * and wait for all of them.
369 */
370 int filemap_fdatawait(struct address_space *mapping)
371 {
372 loff_t i_size = i_size_read(mapping->host);
374 if (i_size == 0)
375 return 0;
377 return wait_on_page_writeback_range(mapping, 0,
378 (i_size - 1) >> PAGE_CACHE_SHIFT);
379 }
380 EXPORT_SYMBOL(filemap_fdatawait);
382 int filemap_write_and_wait(struct address_space *mapping)
383 {
384 int err = 0;
386 if (mapping->nrpages) {
387 err = filemap_fdatawrite(mapping);
388 /*
389 * Even if the above returned error, the pages may be
390 * written partially (e.g. -ENOSPC), so we wait for it.
391 * But the -EIO is special case, it may indicate the worst
392 * thing (e.g. bug) happened, so we avoid waiting for it.
393 */
394 if (err != -EIO) {
395 int err2 = filemap_fdatawait(mapping);
396 if (!err)
397 err = err2;
398 }
399 }
400 return err;
401 }
402 EXPORT_SYMBOL(filemap_write_and_wait);
404 /**
405 * filemap_write_and_wait_range - write out & wait on a file range
406 * @mapping: the address_space for the pages
407 * @lstart: offset in bytes where the range starts
408 * @lend: offset in bytes where the range ends (inclusive)
409 *
410 * Write out and wait upon file offsets lstart->lend, inclusive.
411 *
412 * Note that `lend' is inclusive (describes the last byte to be written) so
413 * that this function can be used to write to the very end-of-file (end = -1).
414 */
415 int filemap_write_and_wait_range(struct address_space *mapping,
416 loff_t lstart, loff_t lend)
417 {
418 int err = 0;
420 if (mapping->nrpages) {
421 err = __filemap_fdatawrite_range(mapping, lstart, lend,
422 WB_SYNC_ALL);
423 /* See comment of filemap_write_and_wait() */
424 if (err != -EIO) {
425 int err2 = wait_on_page_writeback_range(mapping,
426 lstart >> PAGE_CACHE_SHIFT,
427 lend >> PAGE_CACHE_SHIFT);
428 if (!err)
429 err = err2;
430 }
431 }
432 return err;
433 }
435 /**
436 * add_to_page_cache - add newly allocated pagecache pages
437 * @page: page to add
438 * @mapping: the page's address_space
439 * @offset: page index
440 * @gfp_mask: page allocation mode
441 *
442 * This function is used to add newly allocated pagecache pages;
443 * the page is new, so we can just run SetPageLocked() against it.
444 * The other page state flags were set by rmqueue().
445 *
446 * This function does not add the page to the LRU. The caller must do that.
447 */
448 int add_to_page_cache(struct page *page, struct address_space *mapping,
449 pgoff_t offset, gfp_t gfp_mask)
450 {
451 int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
453 if (error == 0) {
454 write_lock_irq(&mapping->tree_lock);
455 error = radix_tree_insert(&mapping->page_tree, offset, page);
456 if (!error) {
457 page_cache_get(page);
458 SetPageLocked(page);
459 page->mapping = mapping;
460 page->index = offset;
461 mapping->nrpages++;
462 __inc_zone_page_state(page, NR_FILE_PAGES);
463 }
464 write_unlock_irq(&mapping->tree_lock);
465 radix_tree_preload_end();
466 }
467 return error;
468 }
469 EXPORT_SYMBOL(add_to_page_cache);
471 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
472 pgoff_t offset, gfp_t gfp_mask)
473 {
474 int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
475 if (ret == 0)
476 lru_cache_add(page);
477 return ret;
478 }
480 #ifdef CONFIG_NUMA
481 struct page *page_cache_alloc(struct address_space *x)
482 {
483 if (cpuset_do_page_mem_spread()) {
484 int n = cpuset_mem_spread_node();
485 return alloc_pages_node(n, mapping_gfp_mask(x), 0);
486 }
487 return alloc_pages(mapping_gfp_mask(x), 0);
488 }
489 EXPORT_SYMBOL(page_cache_alloc);
491 struct page *page_cache_alloc_cold(struct address_space *x)
492 {
493 if (cpuset_do_page_mem_spread()) {
494 int n = cpuset_mem_spread_node();
495 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
496 }
497 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
498 }
499 EXPORT_SYMBOL(page_cache_alloc_cold);
500 #endif
502 /*
503 * In order to wait for pages to become available there must be
504 * waitqueues associated with pages. By using a hash table of
505 * waitqueues where the bucket discipline is to maintain all
506 * waiters on the same queue and wake all when any of the pages
507 * become available, and for the woken contexts to check to be
508 * sure the appropriate page became available, this saves space
509 * at a cost of "thundering herd" phenomena during rare hash
510 * collisions.
511 */
512 static wait_queue_head_t *page_waitqueue(struct page *page)
513 {
514 const struct zone *zone = page_zone(page);
516 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
517 }
519 static inline void wake_up_page(struct page *page, int bit)
520 {
521 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
522 }
524 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
525 {
526 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
528 if (test_bit(bit_nr, &page->flags))
529 __wait_on_bit(page_waitqueue(page), &wait, sync_page,
530 TASK_UNINTERRUPTIBLE);
531 }
532 EXPORT_SYMBOL(wait_on_page_bit);
534 /**
535 * unlock_page - unlock a locked page
536 * @page: the page
537 *
538 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
539 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
540 * mechanism between PageLocked pages and PageWriteback pages is shared.
541 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
542 *
543 * The first mb is necessary to safely close the critical section opened by the
544 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
545 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
546 * parallel wait_on_page_locked()).
547 */
548 void fastcall unlock_page(struct page *page)
549 {
550 smp_mb__before_clear_bit();
551 if (!TestClearPageLocked(page))
552 BUG();
553 smp_mb__after_clear_bit();
554 wake_up_page(page, PG_locked);
555 }
556 EXPORT_SYMBOL(unlock_page);
558 /**
559 * end_page_writeback - end writeback against a page
560 * @page: the page
561 */
562 void end_page_writeback(struct page *page)
563 {
564 if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
565 if (!test_clear_page_writeback(page))
566 BUG();
567 }
568 smp_mb__after_clear_bit();
569 wake_up_page(page, PG_writeback);
570 }
571 EXPORT_SYMBOL(end_page_writeback);
573 /**
574 * __lock_page - get a lock on the page, assuming we need to sleep to get it
575 * @page: the page to lock
576 *
577 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
578 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
579 * chances are that on the second loop, the block layer's plug list is empty,
580 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
581 */
582 void fastcall __lock_page(struct page *page)
583 {
584 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
586 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
587 TASK_UNINTERRUPTIBLE);
588 }
589 EXPORT_SYMBOL(__lock_page);
591 /**
592 * find_get_page - find and get a page reference
593 * @mapping: the address_space to search
594 * @offset: the page index
595 *
596 * A rather lightweight function, finding and getting a reference to a
597 * hashed page atomically.
598 */
599 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
600 {
601 struct page *page;
603 read_lock_irq(&mapping->tree_lock);
604 page = radix_tree_lookup(&mapping->page_tree, offset);
605 if (page)
606 page_cache_get(page);
607 read_unlock_irq(&mapping->tree_lock);
608 return page;
609 }
610 EXPORT_SYMBOL(find_get_page);
612 /**
613 * find_trylock_page - find and lock a page
614 * @mapping: the address_space to search
615 * @offset: the page index
616 *
617 * Same as find_get_page(), but trylock it instead of incrementing the count.
618 */
619 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
620 {
621 struct page *page;
623 read_lock_irq(&mapping->tree_lock);
624 page = radix_tree_lookup(&mapping->page_tree, offset);
625 if (page && TestSetPageLocked(page))
626 page = NULL;
627 read_unlock_irq(&mapping->tree_lock);
628 return page;
629 }
630 EXPORT_SYMBOL(find_trylock_page);
632 /**
633 * find_lock_page - locate, pin and lock a pagecache page
634 * @mapping: the address_space to search
635 * @offset: the page index
636 *
637 * Locates the desired pagecache page, locks it, increments its reference
638 * count and returns its address.
639 *
640 * Returns zero if the page was not present. find_lock_page() may sleep.
641 */
642 struct page *find_lock_page(struct address_space *mapping,
643 unsigned long offset)
644 {
645 struct page *page;
647 read_lock_irq(&mapping->tree_lock);
648 repeat:
649 page = radix_tree_lookup(&mapping->page_tree, offset);
650 if (page) {
651 page_cache_get(page);
652 if (TestSetPageLocked(page)) {
653 read_unlock_irq(&mapping->tree_lock);
654 __lock_page(page);
655 read_lock_irq(&mapping->tree_lock);
657 /* Has the page been truncated while we slept? */
658 if (unlikely(page->mapping != mapping ||
659 page->index != offset)) {
660 unlock_page(page);
661 page_cache_release(page);
662 goto repeat;
663 }
664 }
665 }
666 read_unlock_irq(&mapping->tree_lock);
667 return page;
668 }
669 EXPORT_SYMBOL(find_lock_page);
671 /**
672 * find_or_create_page - locate or add a pagecache page
673 * @mapping: the page's address_space
674 * @index: the page's index into the mapping
675 * @gfp_mask: page allocation mode
676 *
677 * Locates a page in the pagecache. If the page is not present, a new page
678 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
679 * LRU list. The returned page is locked and has its reference count
680 * incremented.
681 *
682 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
683 * allocation!
684 *
685 * find_or_create_page() returns the desired page's address, or zero on
686 * memory exhaustion.
687 */
688 struct page *find_or_create_page(struct address_space *mapping,
689 unsigned long index, gfp_t gfp_mask)
690 {
691 struct page *page, *cached_page = NULL;
692 int err;
693 repeat:
694 page = find_lock_page(mapping, index);
695 if (!page) {
696 if (!cached_page) {
697 cached_page = alloc_page(gfp_mask);
698 if (!cached_page)
699 return NULL;
700 }
701 err = add_to_page_cache_lru(cached_page, mapping,
702 index, gfp_mask);
703 if (!err) {
704 page = cached_page;
705 cached_page = NULL;
706 } else if (err == -EEXIST)
707 goto repeat;
708 }
709 if (cached_page)
710 page_cache_release(cached_page);
711 return page;
712 }
713 EXPORT_SYMBOL(find_or_create_page);
715 /**
716 * find_get_pages - gang pagecache lookup
717 * @mapping: The address_space to search
718 * @start: The starting page index
719 * @nr_pages: The maximum number of pages
720 * @pages: Where the resulting pages are placed
721 *
722 * find_get_pages() will search for and return a group of up to
723 * @nr_pages pages in the mapping. The pages are placed at @pages.
724 * find_get_pages() takes a reference against the returned pages.
725 *
726 * The search returns a group of mapping-contiguous pages with ascending
727 * indexes. There may be holes in the indices due to not-present pages.
728 *
729 * find_get_pages() returns the number of pages which were found.
730 */
731 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
732 unsigned int nr_pages, struct page **pages)
733 {
734 unsigned int i;
735 unsigned int ret;
737 read_lock_irq(&mapping->tree_lock);
738 ret = radix_tree_gang_lookup(&mapping->page_tree,
739 (void **)pages, start, nr_pages);
740 for (i = 0; i < ret; i++)
741 page_cache_get(pages[i]);
742 read_unlock_irq(&mapping->tree_lock);
743 return ret;
744 }
746 /**
747 * find_get_pages_contig - gang contiguous pagecache lookup
748 * @mapping: The address_space to search
749 * @index: The starting page index
750 * @nr_pages: The maximum number of pages
751 * @pages: Where the resulting pages are placed
752 *
753 * find_get_pages_contig() works exactly like find_get_pages(), except
754 * that the returned number of pages are guaranteed to be contiguous.
755 *
756 * find_get_pages_contig() returns the number of pages which were found.
757 */
758 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
759 unsigned int nr_pages, struct page **pages)
760 {
761 unsigned int i;
762 unsigned int ret;
764 read_lock_irq(&mapping->tree_lock);
765 ret = radix_tree_gang_lookup(&mapping->page_tree,
766 (void **)pages, index, nr_pages);
767 for (i = 0; i < ret; i++) {
768 if (pages[i]->mapping == NULL || pages[i]->index != index)
769 break;
771 page_cache_get(pages[i]);
772 index++;
773 }
774 read_unlock_irq(&mapping->tree_lock);
775 return i;
776 }
778 /**
779 * find_get_pages_tag - find and return pages that match @tag
780 * @mapping: the address_space to search
781 * @index: the starting page index
782 * @tag: the tag index
783 * @nr_pages: the maximum number of pages
784 * @pages: where the resulting pages are placed
785 *
786 * Like find_get_pages, except we only return pages which are tagged with
787 * @tag. We update @index to index the next page for the traversal.
788 */
789 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
790 int tag, unsigned int nr_pages, struct page **pages)
791 {
792 unsigned int i;
793 unsigned int ret;
795 read_lock_irq(&mapping->tree_lock);
796 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
797 (void **)pages, *index, nr_pages, tag);
798 for (i = 0; i < ret; i++)
799 page_cache_get(pages[i]);
800 if (ret)
801 *index = pages[ret - 1]->index + 1;
802 read_unlock_irq(&mapping->tree_lock);
803 return ret;
804 }
806 /**
807 * grab_cache_page_nowait - returns locked page at given index in given cache
808 * @mapping: target address_space
809 * @index: the page index
810 *
811 * Same as grab_cache_page, but do not wait if the page is unavailable.
812 * This is intended for speculative data generators, where the data can
813 * be regenerated if the page couldn't be grabbed. This routine should
814 * be safe to call while holding the lock for another page.
815 *
816 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
817 * and deadlock against the caller's locked page.
818 */
819 struct page *
820 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
821 {
822 struct page *page = find_get_page(mapping, index);
823 gfp_t gfp_mask;
825 if (page) {
826 if (!TestSetPageLocked(page))
827 return page;
828 page_cache_release(page);
829 return NULL;
830 }
831 gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
832 page = alloc_pages(gfp_mask, 0);
833 if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
834 page_cache_release(page);
835 page = NULL;
836 }
837 return page;
838 }
839 EXPORT_SYMBOL(grab_cache_page_nowait);
841 /*
842 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
843 * a _large_ part of the i/o request. Imagine the worst scenario:
844 *
845 * ---R__________________________________________B__________
846 * ^ reading here ^ bad block(assume 4k)
847 *
848 * read(R) => miss => readahead(R...B) => media error => frustrating retries
849 * => failing the whole request => read(R) => read(R+1) =>
850 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
851 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
852 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
853 *
854 * It is going insane. Fix it by quickly scaling down the readahead size.
855 */
856 static void shrink_readahead_size_eio(struct file *filp,
857 struct file_ra_state *ra)
858 {
859 if (!ra->ra_pages)
860 return;
862 ra->ra_pages /= 4;
863 }
865 /**
866 * do_generic_mapping_read - generic file read routine
867 * @mapping: address_space to be read
868 * @_ra: file's readahead state
869 * @filp: the file to read
870 * @ppos: current file position
871 * @desc: read_descriptor
872 * @actor: read method
873 *
874 * This is a generic file read routine, and uses the
875 * mapping->a_ops->readpage() function for the actual low-level stuff.
876 *
877 * This is really ugly. But the goto's actually try to clarify some
878 * of the logic when it comes to error handling etc.
879 *
880 * Note the struct file* is only passed for the use of readpage.
881 * It may be NULL.
882 */
883 void do_generic_mapping_read(struct address_space *mapping,
884 struct file_ra_state *_ra,
885 struct file *filp,
886 loff_t *ppos,
887 read_descriptor_t *desc,
888 read_actor_t actor)
889 {
890 struct inode *inode = mapping->host;
891 unsigned long index;
892 unsigned long end_index;
893 unsigned long offset;
894 unsigned long last_index;
895 unsigned long next_index;
896 unsigned long prev_index;
897 loff_t isize;
898 struct page *cached_page;
899 int error;
900 struct file_ra_state ra = *_ra;
902 cached_page = NULL;
903 index = *ppos >> PAGE_CACHE_SHIFT;
904 next_index = index;
905 prev_index = ra.prev_page;
906 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
907 offset = *ppos & ~PAGE_CACHE_MASK;
909 isize = i_size_read(inode);
910 if (!isize)
911 goto out;
913 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
914 for (;;) {
915 struct page *page;
916 unsigned long nr, ret;
918 /* nr is the maximum number of bytes to copy from this page */
919 nr = PAGE_CACHE_SIZE;
920 if (index >= end_index) {
921 if (index > end_index)
922 goto out;
923 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
924 if (nr <= offset) {
925 goto out;
926 }
927 }
928 nr = nr - offset;
930 cond_resched();
931 if (index == next_index)
932 next_index = page_cache_readahead(mapping, &ra, filp,
933 index, last_index - index);
935 find_page:
936 page = find_get_page(mapping, index);
937 if (unlikely(page == NULL)) {
938 handle_ra_miss(mapping, &ra, index);
939 goto no_cached_page;
940 }
941 if (!PageUptodate(page))
942 goto page_not_up_to_date;
943 page_ok:
945 /* If users can be writing to this page using arbitrary
946 * virtual addresses, take care about potential aliasing
947 * before reading the page on the kernel side.
948 */
949 if (mapping_writably_mapped(mapping))
950 flush_dcache_page(page);
952 /*
953 * When (part of) the same page is read multiple times
954 * in succession, only mark it as accessed the first time.
955 */
956 if (prev_index != index)
957 mark_page_accessed(page);
958 prev_index = index;
960 /*
961 * Ok, we have the page, and it's up-to-date, so
962 * now we can copy it to user space...
963 *
964 * The actor routine returns how many bytes were actually used..
965 * NOTE! This may not be the same as how much of a user buffer
966 * we filled up (we may be padding etc), so we can only update
967 * "pos" here (the actor routine has to update the user buffer
968 * pointers and the remaining count).
969 */
970 ret = actor(desc, page, offset, nr);
971 offset += ret;
972 index += offset >> PAGE_CACHE_SHIFT;
973 offset &= ~PAGE_CACHE_MASK;
975 page_cache_release(page);
976 if (ret == nr && desc->count)
977 continue;
978 goto out;
980 page_not_up_to_date:
981 /* Get exclusive access to the page ... */
982 lock_page(page);
984 /* Did it get unhashed before we got the lock? */
985 if (!page->mapping) {
986 unlock_page(page);
987 page_cache_release(page);
988 continue;
989 }
991 /* Did somebody else fill it already? */
992 if (PageUptodate(page)) {
993 unlock_page(page);
994 goto page_ok;
995 }
997 readpage:
998 /* Start the actual read. The read will unlock the page. */
999 error = mapping->a_ops->readpage(filp, page);
1001 if (unlikely(error)) {
1002 if (error == AOP_TRUNCATED_PAGE) {
1003 page_cache_release(page);
1004 goto find_page;
1005 }
1006 goto readpage_error;
1007 }
1009 if (!PageUptodate(page)) {
1010 lock_page(page);
1011 if (!PageUptodate(page)) {
1012 if (page->mapping == NULL) {
1013 /*
1014 * invalidate_inode_pages got it
1015 */
1016 unlock_page(page);
1017 page_cache_release(page);
1018 goto find_page;
1019 }
1020 unlock_page(page);
1021 error = -EIO;
1022 shrink_readahead_size_eio(filp, &ra);
1023 goto readpage_error;
1024 }
1025 unlock_page(page);
1026 }
1028 /*
1029 * i_size must be checked after we have done ->readpage.
1031 * Checking i_size after the readpage allows us to calculate
1032 * the correct value for "nr", which means the zero-filled
1033 * part of the page is not copied back to userspace (unless
1034 * another truncate extends the file - this is desired though).
1035 */
1036 isize = i_size_read(inode);
1037 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1038 if (unlikely(!isize || index > end_index)) {
1039 page_cache_release(page);
1040 goto out;
1041 }
1043 /* nr is the maximum number of bytes to copy from this page */
1044 nr = PAGE_CACHE_SIZE;
1045 if (index == end_index) {
1046 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1047 if (nr <= offset) {
1048 page_cache_release(page);
1049 goto out;
1050 }
1051 }
1052 nr = nr - offset;
1053 goto page_ok;
1055 readpage_error:
1056 /* UHHUH! A synchronous read error occurred. Report it */
1057 desc->error = error;
1058 page_cache_release(page);
1059 goto out;
1061 no_cached_page:
1062 /*
1063 * Ok, it wasn't cached, so we need to create a new
1064 * page..
1065 */
1066 if (!cached_page) {
1067 cached_page = page_cache_alloc_cold(mapping);
1068 if (!cached_page) {
1069 desc->error = -ENOMEM;
1070 goto out;
1071 }
1072 }
1073 error = add_to_page_cache_lru(cached_page, mapping,
1074 index, GFP_KERNEL);
1075 if (error) {
1076 if (error == -EEXIST)
1077 goto find_page;
1078 desc->error = error;
1079 goto out;
1080 }
1081 page = cached_page;
1082 cached_page = NULL;
1083 goto readpage;
1084 }
1086 out:
1087 *_ra = ra;
1089 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1090 if (cached_page)
1091 page_cache_release(cached_page);
1092 if (filp)
1093 file_accessed(filp);
1094 }
1095 EXPORT_SYMBOL(do_generic_mapping_read);
1097 int file_read_actor(read_descriptor_t *desc, struct page *page,
1098 unsigned long offset, unsigned long size)
1099 {
1100 char *kaddr;
1101 unsigned long left, count = desc->count;
1103 if (size > count)
1104 size = count;
1106 /*
1107 * Faults on the destination of a read are common, so do it before
1108 * taking the kmap.
1109 */
1110 if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 left = __copy_to_user_inatomic(desc->arg.buf,
1113 kaddr + offset, size);
1114 kunmap_atomic(kaddr, KM_USER0);
1115 if (left == 0)
1116 goto success;
1117 }
1119 /* Do it the slow way */
1120 kaddr = kmap(page);
1121 left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1122 kunmap(page);
1124 if (left) {
1125 size -= left;
1126 desc->error = -EFAULT;
1127 }
1128 success:
1129 desc->count = count - size;
1130 desc->written += size;
1131 desc->arg.buf += size;
1132 return size;
1133 }
1135 /**
1136 * __generic_file_aio_read - generic filesystem read routine
1137 * @iocb: kernel I/O control block
1138 * @iov: io vector request
1139 * @nr_segs: number of segments in the iovec
1140 * @ppos: current file position
1142 * This is the "read()" routine for all filesystems
1143 * that can use the page cache directly.
1144 */
1145 ssize_t
1146 __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1147 unsigned long nr_segs, loff_t *ppos)
1149 struct file *filp = iocb->ki_filp;
1150 ssize_t retval;
1151 unsigned long seg;
1152 size_t count;
1154 count = 0;
1155 for (seg = 0; seg < nr_segs; seg++) {
1156 const struct iovec *iv = &iov[seg];
1158 /*
1159 * If any segment has a negative length, or the cumulative
1160 * length ever wraps negative then return -EINVAL.
1161 */
1162 count += iv->iov_len;
1163 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1164 return -EINVAL;
1165 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1166 continue;
1167 if (seg == 0)
1168 return -EFAULT;
1169 nr_segs = seg;
1170 count -= iv->iov_len; /* This segment is no good */
1171 break;
1174 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1175 if (filp->f_flags & O_DIRECT) {
1176 loff_t pos = *ppos, size;
1177 struct address_space *mapping;
1178 struct inode *inode;
1180 mapping = filp->f_mapping;
1181 inode = mapping->host;
1182 retval = 0;
1183 if (!count)
1184 goto out; /* skip atime */
1185 size = i_size_read(inode);
1186 if (pos < size) {
1187 retval = generic_file_direct_IO(READ, iocb,
1188 iov, pos, nr_segs);
1189 if (retval > 0 && !is_sync_kiocb(iocb))
1190 retval = -EIOCBQUEUED;
1191 if (retval > 0)
1192 *ppos = pos + retval;
1194 file_accessed(filp);
1195 goto out;
1198 retval = 0;
1199 if (count) {
1200 for (seg = 0; seg < nr_segs; seg++) {
1201 read_descriptor_t desc;
1203 desc.written = 0;
1204 desc.arg.buf = iov[seg].iov_base;
1205 desc.count = iov[seg].iov_len;
1206 if (desc.count == 0)
1207 continue;
1208 desc.error = 0;
1209 do_generic_file_read(filp,ppos,&desc,file_read_actor);
1210 retval += desc.written;
1211 if (desc.error) {
1212 retval = retval ?: desc.error;
1213 break;
1217 out:
1218 return retval;
1220 EXPORT_SYMBOL(__generic_file_aio_read);
1222 ssize_t
1223 generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1225 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1227 BUG_ON(iocb->ki_pos != pos);
1228 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1230 EXPORT_SYMBOL(generic_file_aio_read);
1232 ssize_t
1233 generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1235 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1236 struct kiocb kiocb;
1237 ssize_t ret;
1239 init_sync_kiocb(&kiocb, filp);
1240 ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1241 if (-EIOCBQUEUED == ret)
1242 ret = wait_on_sync_kiocb(&kiocb);
1243 return ret;
1245 EXPORT_SYMBOL(generic_file_read);
1247 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1249 ssize_t written;
1250 unsigned long count = desc->count;
1251 struct file *file = desc->arg.data;
1253 if (size > count)
1254 size = count;
1256 written = file->f_op->sendpage(file, page, offset,
1257 size, &file->f_pos, size<count);
1258 if (written < 0) {
1259 desc->error = written;
1260 written = 0;
1262 desc->count = count - written;
1263 desc->written += written;
1264 return written;
1267 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1268 size_t count, read_actor_t actor, void *target)
1270 read_descriptor_t desc;
1272 if (!count)
1273 return 0;
1275 desc.written = 0;
1276 desc.count = count;
1277 desc.arg.data = target;
1278 desc.error = 0;
1280 do_generic_file_read(in_file, ppos, &desc, actor);
1281 if (desc.written)
1282 return desc.written;
1283 return desc.error;
1285 EXPORT_SYMBOL(generic_file_sendfile);
1287 static ssize_t
1288 do_readahead(struct address_space *mapping, struct file *filp,
1289 unsigned long index, unsigned long nr)
1291 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1292 return -EINVAL;
1294 force_page_cache_readahead(mapping, filp, index,
1295 max_sane_readahead(nr));
1296 return 0;
1299 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1301 ssize_t ret;
1302 struct file *file;
1304 ret = -EBADF;
1305 file = fget(fd);
1306 if (file) {
1307 if (file->f_mode & FMODE_READ) {
1308 struct address_space *mapping = file->f_mapping;
1309 unsigned long start = offset >> PAGE_CACHE_SHIFT;
1310 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1311 unsigned long len = end - start + 1;
1312 ret = do_readahead(mapping, file, start, len);
1314 fput(file);
1316 return ret;
1319 #ifdef CONFIG_MMU
1320 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1321 /**
1322 * page_cache_read - adds requested page to the page cache if not already there
1323 * @file: file to read
1324 * @offset: page index
1326 * This adds the requested page to the page cache if it isn't already there,
1327 * and schedules an I/O to read in its contents from disk.
1328 */
1329 static int fastcall page_cache_read(struct file * file, unsigned long offset)
1331 struct address_space *mapping = file->f_mapping;
1332 struct page *page;
1333 int ret;
1335 do {
1336 page = page_cache_alloc_cold(mapping);
1337 if (!page)
1338 return -ENOMEM;
1340 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1341 if (ret == 0)
1342 ret = mapping->a_ops->readpage(file, page);
1343 else if (ret == -EEXIST)
1344 ret = 0; /* losing race to add is OK */
1346 page_cache_release(page);
1348 } while (ret == AOP_TRUNCATED_PAGE);
1350 return ret;
1353 #define MMAP_LOTSAMISS (100)
1355 /**
1356 * filemap_nopage - read in file data for page fault handling
1357 * @area: the applicable vm_area
1358 * @address: target address to read in
1359 * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
1361 * filemap_nopage() is invoked via the vma operations vector for a
1362 * mapped memory region to read in file data during a page fault.
1364 * The goto's are kind of ugly, but this streamlines the normal case of having
1365 * it in the page cache, and handles the special cases reasonably without
1366 * having a lot of duplicated code.
1367 */
1368 struct page *filemap_nopage(struct vm_area_struct *area,
1369 unsigned long address, int *type)
1371 int error;
1372 struct file *file = area->vm_file;
1373 struct address_space *mapping = file->f_mapping;
1374 struct file_ra_state *ra = &file->f_ra;
1375 struct inode *inode = mapping->host;
1376 struct page *page;
1377 unsigned long size, pgoff;
1378 int did_readaround = 0, majmin = VM_FAULT_MINOR;
1380 pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1382 retry_all:
1383 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1384 if (pgoff >= size)
1385 goto outside_data_content;
1387 /* If we don't want any read-ahead, don't bother */
1388 if (VM_RandomReadHint(area))
1389 goto no_cached_page;
1391 /*
1392 * The readahead code wants to be told about each and every page
1393 * so it can build and shrink its windows appropriately
1395 * For sequential accesses, we use the generic readahead logic.
1396 */
1397 if (VM_SequentialReadHint(area))
1398 page_cache_readahead(mapping, ra, file, pgoff, 1);
1400 /*
1401 * Do we have something in the page cache already?
1402 */
1403 retry_find:
1404 page = find_get_page(mapping, pgoff);
1405 if (!page) {
1406 unsigned long ra_pages;
1408 if (VM_SequentialReadHint(area)) {
1409 handle_ra_miss(mapping, ra, pgoff);
1410 goto no_cached_page;
1412 ra->mmap_miss++;
1414 /*
1415 * Do we miss much more than hit in this file? If so,
1416 * stop bothering with read-ahead. It will only hurt.
1417 */
1418 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1419 goto no_cached_page;
1421 /*
1422 * To keep the pgmajfault counter straight, we need to
1423 * check did_readaround, as this is an inner loop.
1424 */
1425 if (!did_readaround) {
1426 majmin = VM_FAULT_MAJOR;
1427 count_vm_event(PGMAJFAULT);
1429 did_readaround = 1;
1430 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1431 if (ra_pages) {
1432 pgoff_t start = 0;
1434 if (pgoff > ra_pages / 2)
1435 start = pgoff - ra_pages / 2;
1436 do_page_cache_readahead(mapping, file, start, ra_pages);
1438 page = find_get_page(mapping, pgoff);
1439 if (!page)
1440 goto no_cached_page;
1443 if (!did_readaround)
1444 ra->mmap_hit++;
1446 /*
1447 * Ok, found a page in the page cache, now we need to check
1448 * that it's up-to-date.
1449 */
1450 if (!PageUptodate(page))
1451 goto page_not_uptodate;
1453 success:
1454 /*
1455 * Found the page and have a reference on it.
1456 */
1457 mark_page_accessed(page);
1458 if (type)
1459 *type = majmin;
1460 return page;
1462 outside_data_content:
1463 /*
1464 * An external ptracer can access pages that normally aren't
1465 * accessible..
1466 */
1467 if (area->vm_mm == current->mm)
1468 return NULL;
1469 /* Fall through to the non-read-ahead case */
1470 no_cached_page:
1471 /*
1472 * We're only likely to ever get here if MADV_RANDOM is in
1473 * effect.
1474 */
1475 error = page_cache_read(file, pgoff);
1476 grab_swap_token();
1478 /*
1479 * The page we want has now been added to the page cache.
1480 * In the unlikely event that someone removed it in the
1481 * meantime, we'll just come back here and read it again.
1482 */
1483 if (error >= 0)
1484 goto retry_find;
1486 /*
1487 * An error return from page_cache_read can result if the
1488 * system is low on memory, or a problem occurs while trying
1489 * to schedule I/O.
1490 */
1491 if (error == -ENOMEM)
1492 return NOPAGE_OOM;
1493 return NULL;
1495 page_not_uptodate:
1496 if (!did_readaround) {
1497 majmin = VM_FAULT_MAJOR;
1498 count_vm_event(PGMAJFAULT);
1500 lock_page(page);
1502 /* Did it get unhashed while we waited for it? */
1503 if (!page->mapping) {
1504 unlock_page(page);
1505 page_cache_release(page);
1506 goto retry_all;
1509 /* Did somebody else get it up-to-date? */
1510 if (PageUptodate(page)) {
1511 unlock_page(page);
1512 goto success;
1515 error = mapping->a_ops->readpage(file, page);
1516 if (!error) {
1517 wait_on_page_locked(page);
1518 if (PageUptodate(page))
1519 goto success;
1520 } else if (error == AOP_TRUNCATED_PAGE) {
1521 page_cache_release(page);
1522 goto retry_find;
1525 /*
1526 * Umm, take care of errors if the page isn't up-to-date.
1527 * Try to re-read it _once_. We do this synchronously,
1528 * because there really aren't any performance issues here
1529 * and we need to check for errors.
1530 */
1531 lock_page(page);
1533 /* Somebody truncated the page on us? */
1534 if (!page->mapping) {
1535 unlock_page(page);
1536 page_cache_release(page);
1537 goto retry_all;
1540 /* Somebody else successfully read it in? */
1541 if (PageUptodate(page)) {
1542 unlock_page(page);
1543 goto success;
1545 ClearPageError(page);
1546 error = mapping->a_ops->readpage(file, page);
1547 if (!error) {
1548 wait_on_page_locked(page);
1549 if (PageUptodate(page))
1550 goto success;
1551 } else if (error == AOP_TRUNCATED_PAGE) {
1552 page_cache_release(page);
1553 goto retry_find;
1556 /*
1557 * Things didn't work out. Return zero to tell the
1558 * mm layer so, possibly freeing the page cache page first.
1559 */
1560 shrink_readahead_size_eio(file, ra);
1561 page_cache_release(page);
1562 return NULL;
1564 EXPORT_SYMBOL(filemap_nopage);
1566 static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1567 int nonblock)
1569 struct address_space *mapping = file->f_mapping;
1570 struct page *page;
1571 int error;
1573 /*
1574 * Do we have something in the page cache already?
1575 */
1576 retry_find:
1577 page = find_get_page(mapping, pgoff);
1578 if (!page) {
1579 if (nonblock)
1580 return NULL;
1581 goto no_cached_page;
1584 /*
1585 * Ok, found a page in the page cache, now we need to check
1586 * that it's up-to-date.
1587 */
1588 if (!PageUptodate(page)) {
1589 if (nonblock) {
1590 page_cache_release(page);
1591 return NULL;
1593 goto page_not_uptodate;
1596 success:
1597 /*
1598 * Found the page and have a reference on it.
1599 */
1600 mark_page_accessed(page);
1601 return page;
1603 no_cached_page:
1604 error = page_cache_read(file, pgoff);
1606 /*
1607 * The page we want has now been added to the page cache.
1608 * In the unlikely event that someone removed it in the
1609 * meantime, we'll just come back here and read it again.
1610 */
1611 if (error >= 0)
1612 goto retry_find;
1614 /*
1615 * An error return from page_cache_read can result if the
1616 * system is low on memory, or a problem occurs while trying
1617 * to schedule I/O.
1618 */
1619 return NULL;
1621 page_not_uptodate:
1622 lock_page(page);
1624 /* Did it get unhashed while we waited for it? */
1625 if (!page->mapping) {
1626 unlock_page(page);
1627 goto err;
1630 /* Did somebody else get it up-to-date? */
1631 if (PageUptodate(page)) {
1632 unlock_page(page);
1633 goto success;
1636 error = mapping->a_ops->readpage(file, page);
1637 if (!error) {
1638 wait_on_page_locked(page);
1639 if (PageUptodate(page))
1640 goto success;
1641 } else if (error == AOP_TRUNCATED_PAGE) {
1642 page_cache_release(page);
1643 goto retry_find;
1646 /*
1647 * Umm, take care of errors if the page isn't up-to-date.
1648 * Try to re-read it _once_. We do this synchronously,
1649 * because there really aren't any performance issues here
1650 * and we need to check for errors.
1651 */
1652 lock_page(page);
1654 /* Somebody truncated the page on us? */
1655 if (!page->mapping) {
1656 unlock_page(page);
1657 goto err;
1659 /* Somebody else successfully read it in? */
1660 if (PageUptodate(page)) {
1661 unlock_page(page);
1662 goto success;
1665 ClearPageError(page);
1666 error = mapping->a_ops->readpage(file, page);
1667 if (!error) {
1668 wait_on_page_locked(page);
1669 if (PageUptodate(page))
1670 goto success;
1671 } else if (error == AOP_TRUNCATED_PAGE) {
1672 page_cache_release(page);
1673 goto retry_find;
1676 /*
1677 * Things didn't work out. Return zero to tell the
1678 * mm layer so, possibly freeing the page cache page first.
1679 */
1680 err:
1681 page_cache_release(page);
1683 return NULL;
1686 int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1687 unsigned long len, pgprot_t prot, unsigned long pgoff,
1688 int nonblock)
1690 struct file *file = vma->vm_file;
1691 struct address_space *mapping = file->f_mapping;
1692 struct inode *inode = mapping->host;
1693 unsigned long size;
1694 struct mm_struct *mm = vma->vm_mm;
1695 struct page *page;
1696 int err;
1698 if (!nonblock)
1699 force_page_cache_readahead(mapping, vma->vm_file,
1700 pgoff, len >> PAGE_CACHE_SHIFT);
1702 repeat:
1703 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1704 if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1705 return -EINVAL;
1707 page = filemap_getpage(file, pgoff, nonblock);
1709 /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
1710 * done in shmem_populate calling shmem_getpage */
1711 if (!page && !nonblock)
1712 return -ENOMEM;
1714 if (page) {
1715 err = install_page(mm, vma, addr, page, prot);
1716 if (err) {
1717 page_cache_release(page);
1718 return err;
1720 } else if (vma->vm_flags & VM_NONLINEAR) {
1721 /* No page was found just because we can't read it in now (being
1722 * here implies nonblock != 0), but the page may exist, so set
1723 * the PTE to fault it in later. */
1724 err = install_file_pte(mm, vma, addr, pgoff, prot);
1725 if (err)
1726 return err;
1729 len -= PAGE_SIZE;
1730 addr += PAGE_SIZE;
1731 pgoff++;
1732 if (len)
1733 goto repeat;
1735 return 0;
1737 EXPORT_SYMBOL(filemap_populate);
1739 struct vm_operations_struct generic_file_vm_ops = {
1740 .nopage = filemap_nopage,
1741 .populate = filemap_populate,
1742 };
1744 /* This is used for a general mmap of a disk file */
1746 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1748 struct address_space *mapping = file->f_mapping;
1750 if (!mapping->a_ops->readpage)
1751 return -ENOEXEC;
1752 file_accessed(file);
1753 vma->vm_ops = &generic_file_vm_ops;
1754 return 0;
1757 /*
1758 * This is for filesystems which do not implement ->writepage.
1759 */
1760 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1762 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1763 return -EINVAL;
1764 return generic_file_mmap(file, vma);
1766 #else
1767 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1769 return -ENOSYS;
1771 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1773 return -ENOSYS;
1775 #endif /* CONFIG_MMU */
1777 EXPORT_SYMBOL(generic_file_mmap);
1778 EXPORT_SYMBOL(generic_file_readonly_mmap);
1780 static inline struct page *__read_cache_page(struct address_space *mapping,
1781 unsigned long index,
1782 int (*filler)(void *,struct page*),
1783 void *data)
1785 struct page *page, *cached_page = NULL;
1786 int err;
1787 repeat:
1788 page = find_get_page(mapping, index);
1789 if (!page) {
1790 if (!cached_page) {
1791 cached_page = page_cache_alloc_cold(mapping);
1792 if (!cached_page)
1793 return ERR_PTR(-ENOMEM);
1795 err = add_to_page_cache_lru(cached_page, mapping,
1796 index, GFP_KERNEL);
1797 if (err == -EEXIST)
1798 goto repeat;
1799 if (err < 0) {
1800 /* Presumably ENOMEM for radix tree node */
1801 page_cache_release(cached_page);
1802 return ERR_PTR(err);
1804 page = cached_page;
1805 cached_page = NULL;
1806 err = filler(data, page);
1807 if (err < 0) {
1808 page_cache_release(page);
1809 page = ERR_PTR(err);
1812 if (cached_page)
1813 page_cache_release(cached_page);
1814 return page;
1817 /**
1818 * read_cache_page - read into page cache, fill it if needed
1819 * @mapping: the page's address_space
1820 * @index: the page index
1821 * @filler: function to perform the read
1822 * @data: destination for read data
1824 * Read into the page cache. If a page already exists,
1825 * and PageUptodate() is not set, try to fill the page.
1826 */
1827 struct page *read_cache_page(struct address_space *mapping,
1828 unsigned long index,
1829 int (*filler)(void *,struct page*),
1830 void *data)
1832 struct page *page;
1833 int err;
1835 retry:
1836 page = __read_cache_page(mapping, index, filler, data);
1837 if (IS_ERR(page))
1838 goto out;
1839 mark_page_accessed(page);
1840 if (PageUptodate(page))
1841 goto out;
1843 lock_page(page);
1844 if (!page->mapping) {
1845 unlock_page(page);
1846 page_cache_release(page);
1847 goto retry;
1849 if (PageUptodate(page)) {
1850 unlock_page(page);
1851 goto out;
1853 err = filler(data, page);
1854 if (err < 0) {
1855 page_cache_release(page);
1856 page = ERR_PTR(err);
1858 out:
1859 return page;
1861 EXPORT_SYMBOL(read_cache_page);
1863 /*
1864 * If the page was newly created, increment its refcount and add it to the
1865 * caller's lru-buffering pagevec. This function is specifically for
1866 * generic_file_write().
1867 */
1868 static inline struct page *
1869 __grab_cache_page(struct address_space *mapping, unsigned long index,
1870 struct page **cached_page, struct pagevec *lru_pvec)
1872 int err;
1873 struct page *page;
1874 repeat:
1875 page = find_lock_page(mapping, index);
1876 if (!page) {
1877 if (!*cached_page) {
1878 *cached_page = page_cache_alloc(mapping);
1879 if (!*cached_page)
1880 return NULL;
1882 err = add_to_page_cache(*cached_page, mapping,
1883 index, GFP_KERNEL);
1884 if (err == -EEXIST)
1885 goto repeat;
1886 if (err == 0) {
1887 page = *cached_page;
1888 page_cache_get(page);
1889 if (!pagevec_add(lru_pvec, page))
1890 __pagevec_lru_add(lru_pvec);
1891 *cached_page = NULL;
1894 return page;
1897 /*
1898 * The logic we want is
1900 * if suid or (sgid and xgrp)
1901 * remove privs
1902 */
1903 int remove_suid(struct dentry *dentry)
1905 mode_t mode = dentry->d_inode->i_mode;
1906 int kill = 0;
1907 int result = 0;
1909 /* suid always must be killed */
1910 if (unlikely(mode & S_ISUID))
1911 kill = ATTR_KILL_SUID;
1913 /*
1914 * sgid without any exec bits is just a mandatory locking mark; leave
1915 * it alone. If some exec bits are set, it's a real sgid; kill it.
1916 */
1917 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1918 kill |= ATTR_KILL_SGID;
1920 if (unlikely(kill && !capable(CAP_FSETID))) {
1921 struct iattr newattrs;
1923 newattrs.ia_valid = ATTR_FORCE | kill;
1924 result = notify_change(dentry, &newattrs);
1926 return result;
1928 EXPORT_SYMBOL(remove_suid);
1930 size_t
1931 __filemap_copy_from_user_iovec_inatomic(char *vaddr,
1932 const struct iovec *iov, size_t base, size_t bytes)
1934 size_t copied = 0, left = 0;
1936 while (bytes) {
1937 char __user *buf = iov->iov_base + base;
1938 int copy = min(bytes, iov->iov_len - base);
1940 base = 0;
1941 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1942 copied += copy;
1943 bytes -= copy;
1944 vaddr += copy;
1945 iov++;
1947 if (unlikely(left))
1948 break;
1950 return copied - left;
1953 /*
1954 * Performs necessary checks before doing a write
1956 * Can adjust writing position or amount of bytes to write.
1957 * Returns appropriate error code that caller should return or
1958 * zero in case that write should be allowed.
1959 */
1960 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1961 {
1962 struct inode *inode = file->f_mapping->host;
1963 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1965 if (unlikely(*pos < 0))
1966 return -EINVAL;
1968 if (!isblk) {
1969 /* FIXME: this is for backwards compatibility with 2.4 */
1970 if (file->f_flags & O_APPEND)
1971 *pos = i_size_read(inode);
1973 if (limit != RLIM_INFINITY) {
1974 if (*pos >= limit) {
1975 send_sig(SIGXFSZ, current, 0);
1976 return -EFBIG;
1977 }
1978 if (*count > limit - (typeof(limit))*pos) {
1979 *count = limit - (typeof(limit))*pos;
1980 }
1981 }
1982 }
1984 /*
1985 * LFS rule
1986 */
1987 if (unlikely(*pos + *count > MAX_NON_LFS &&
1988 !(file->f_flags & O_LARGEFILE))) {
1989 if (*pos >= MAX_NON_LFS) {
1990 send_sig(SIGXFSZ, current, 0);
1991 return -EFBIG;
1992 }
1993 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1994 *count = MAX_NON_LFS - (unsigned long)*pos;
1995 }
1996 }
1998 /*
1999 * Are we about to exceed the fs block limit ?
2001 * If we have written data it becomes a short write. If we have
2002 * exceeded without writing data we send a signal and return EFBIG.
2003 * Linus frestrict idea will clean these up nicely..
2004 */
2005 if (likely(!isblk)) {
2006 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2007 if (*count || *pos > inode->i_sb->s_maxbytes) {
2008 send_sig(SIGXFSZ, current, 0);
2009 return -EFBIG;
2010 }
2011 /* zero-length writes at ->s_maxbytes are OK */
2012 }
2014 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2015 *count = inode->i_sb->s_maxbytes - *pos;
2016 } else {
2017 loff_t isize;
2018 if (bdev_read_only(I_BDEV(inode)))
2019 return -EPERM;
2020 isize = i_size_read(inode);
2021 if (*pos >= isize) {
2022 if (*count || *pos > isize)
2023 return -ENOSPC;
2024 }
2026 if (*pos + *count > isize)
2027 *count = isize - *pos;
2028 }
2029 return 0;
2030 }
2031 EXPORT_SYMBOL(generic_write_checks);
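/*
 * Worked example (illustrative numbers): on a regular file with
 * RLIMIT_FSIZE == 1000, a 200-byte write at *pos == 900 is clamped to
 * *count == 100, while the same write at *pos == 1000 raises SIGXFSZ and
 * fails with -EFBIG.  For non-block-device O_APPEND writers, *pos is
 * first moved to i_size_read(inode).
 */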
2033 ssize_t
2034 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2035 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2036 size_t count, size_t ocount)
2037 {
2038 struct file *file = iocb->ki_filp;
2039 struct address_space *mapping = file->f_mapping;
2040 struct inode *inode = mapping->host;
2041 ssize_t written;
2043 if (count != ocount)
2044 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2046 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2047 if (written > 0) {
2048 loff_t end = pos + written;
2049 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2050 i_size_write(inode, end);
2051 mark_inode_dirty(inode);
2052 }
2053 *ppos = end;
2054 }
2056 /*
2057 * Sync the fs metadata but not the minor inode changes and
2058 * of course not the data as we did direct DMA for the IO.
2059 * i_mutex is held, which protects generic_osync_inode() from
2060 * livelocking.
2061 */
2062 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2063 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2064 if (err < 0)
2065 written = err;
2066 }
2067 if (written == count && !is_sync_kiocb(iocb))
2068 written = -EIOCBQUEUED;
2069 return written;
2070 }
2071 EXPORT_SYMBOL(generic_file_direct_write);
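/*
 * Return convention: an asynchronous request that wrote everything is
 * reported as -EIOCBQUEUED; a short direct write is returned as-is so
 * that the caller (see __generic_file_aio_write_nolock() below) can
 * finish the remainder through the buffered path.
 */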
2073 ssize_t
2074 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2075 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2076 size_t count, ssize_t written)
2077 {
2078 struct file *file = iocb->ki_filp;
2079 struct address_space * mapping = file->f_mapping;
2080 const struct address_space_operations *a_ops = mapping->a_ops;
2081 struct inode *inode = mapping->host;
2082 long status = 0;
2083 struct page *page;
2084 struct page *cached_page = NULL;
2085 size_t bytes;
2086 struct pagevec lru_pvec;
2087 const struct iovec *cur_iov = iov; /* current iovec */
2088 size_t iov_base = 0; /* offset in the current iovec */
2089 char __user *buf;
2091 pagevec_init(&lru_pvec, 0);
2093 /*
2094 * handle partial DIO write. Adjust cur_iov if needed.
2095 */
2096 if (likely(nr_segs == 1))
2097 buf = iov->iov_base + written;
2098 else {
2099 filemap_set_next_iovec(&cur_iov, &iov_base, written);
2100 buf = cur_iov->iov_base + iov_base;
2101 }
2103 do {
2104 unsigned long index;
2105 unsigned long offset;
2106 size_t copied;
2108 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2109 index = pos >> PAGE_CACHE_SHIFT;
2110 bytes = PAGE_CACHE_SIZE - offset;
2112 /* Limit the size of the copy to the caller's write size */
2113 bytes = min(bytes, count);
2115 /*
2116 * Limit the size of the copy to that of the current segment,
2117 * because fault_in_pages_readable() doesn't know how to walk
2118 * segments.
2119 */
2120 bytes = min(bytes, cur_iov->iov_len - iov_base);
2122 /*
2123 * Bring in the user page that we will copy from _first_.
2124 * Otherwise there's a nasty deadlock on copying from the
2125 * same page as we're writing to, without it being marked
2126 * up-to-date.
2127 */
2128 fault_in_pages_readable(buf, bytes);
2130 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2131 if (!page) {
2132 status = -ENOMEM;
2133 break;
2134 }
2136 if (unlikely(bytes == 0)) {
2137 status = 0;
2138 copied = 0;
2139 goto zero_length_segment;
2140 }
2142 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2143 if (unlikely(status)) {
2144 loff_t isize = i_size_read(inode);
2146 if (status != AOP_TRUNCATED_PAGE)
2147 unlock_page(page);
2148 page_cache_release(page);
2149 if (status == AOP_TRUNCATED_PAGE)
2150 continue;
2151 /*
2152 * prepare_write() may have instantiated a few blocks
2153 * outside i_size. Trim these off again.
2154 */
2155 if (pos + bytes > isize)
2156 vmtruncate(inode, isize);
2157 break;
2158 }
2159 if (likely(nr_segs == 1))
2160 copied = filemap_copy_from_user(page, offset,
2161 buf, bytes);
2162 else
2163 copied = filemap_copy_from_user_iovec(page, offset,
2164 cur_iov, iov_base, bytes);
2165 flush_dcache_page(page);
2166 status = a_ops->commit_write(file, page, offset, offset+bytes);
2167 if (status == AOP_TRUNCATED_PAGE) {
2168 page_cache_release(page);
2169 continue;
2170 }
2171 zero_length_segment:
2172 if (likely(copied >= 0)) {
2173 if (!status)
2174 status = copied;
2176 if (status >= 0) {
2177 written += status;
2178 count -= status;
2179 pos += status;
2180 buf += status;
2181 if (unlikely(nr_segs > 1)) {
2182 filemap_set_next_iovec(&cur_iov,
2183 &iov_base, status);
2184 if (count)
2185 buf = cur_iov->iov_base +
2186 iov_base;
2187 } else {
2188 iov_base += status;
2189 }
2190 }
2191 }
2192 if (unlikely(copied != bytes))
2193 if (status >= 0)
2194 status = -EFAULT;
2195 unlock_page(page);
2196 mark_page_accessed(page);
2197 page_cache_release(page);
2198 if (status < 0)
2199 break;
2200 balance_dirty_pages_ratelimited(mapping);
2201 cond_resched();
2202 } while (count);
2203 *ppos = pos;
2205 if (cached_page)
2206 page_cache_release(cached_page);
2208 /*
2209 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2210 */
2211 if (likely(status >= 0)) {
2212 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2213 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2214 status = generic_osync_inode(inode, mapping,
2215 OSYNC_METADATA|OSYNC_DATA);
2216 }
2217 }
2219 /*
2220 * If we get here for O_DIRECT writes then we must have fallen through
2221 * to buffered writes (block instantiation inside i_size). So we sync
2222 * the file data here, to try to honour O_DIRECT expectations.
2223 */
2224 if (unlikely(file->f_flags & O_DIRECT) && written)
2225 status = filemap_write_and_wait(mapping);
2227 pagevec_lru_add(&lru_pvec);
2228 return written ? written : status;
2229 }
2230 EXPORT_SYMBOL(generic_file_buffered_write);
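/*
 * Sketch of the ->prepare_write()/->commit_write() pair that the loop
 * above drives, modelled on the generic buffer_head helpers.  The names
 * example_prepare_write, example_commit_write and example_get_block are
 * hypothetical (a real filesystem supplies its own get_block routine);
 * this is an illustration, not code from this file.
 */
int example_get_block(struct inode *inode, sector_t iblock,
		      struct buffer_head *bh_result, int create);

static int example_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	/* map (and allocate, if needed) the blocks covering [from, to) */
	return block_prepare_write(page, from, to, example_get_block);
}

static int example_commit_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	/* dirty the copied range and extend i_size if the write grew it */
	return generic_commit_write(file, page, from, to);
}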
2232 static ssize_t
2233 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2234 unsigned long nr_segs, loff_t *ppos)
2235 {
2236 struct file *file = iocb->ki_filp;
2237 const struct address_space * mapping = file->f_mapping;
2238 size_t ocount; /* original count */
2239 size_t count; /* after file limit checks */
2240 struct inode *inode = mapping->host;
2241 unsigned long seg;
2242 loff_t pos;
2243 ssize_t written;
2244 ssize_t err;
2246 ocount = 0;
2247 for (seg = 0; seg < nr_segs; seg++) {
2248 const struct iovec *iv = &iov[seg];
2250 /*
2251 * If any segment has a negative length, or the cumulative
2252 * length ever wraps negative then return -EINVAL.
2253 */
2254 ocount += iv->iov_len;
2255 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2256 return -EINVAL;
2257 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2258 continue;
2259 if (seg == 0)
2260 return -EFAULT;
2261 nr_segs = seg;
2262 ocount -= iv->iov_len; /* This segment is no good */
2263 break;
2264 }
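/*
 * The (ssize_t)(ocount|iv->iov_len) < 0 test above catches both a single
 * over-long segment and a cumulative length that has wrapped past
 * SSIZE_MAX with one comparison.  A segment after the first that fails
 * access_ok() does not fail the write; nr_segs is simply truncated to the
 * verified prefix.
 */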
2266 count = ocount;
2267 pos = *ppos;
2269 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2271 /* We can write back this queue in page reclaim */
2272 current->backing_dev_info = mapping->backing_dev_info;
2273 written = 0;
2275 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2276 if (err)
2277 goto out;
2279 if (count == 0)
2280 goto out;
2282 err = remove_suid(file->f_dentry);
2283 if (err)
2284 goto out;
2286 file_update_time(file);
2288 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2289 if (unlikely(file->f_flags & O_DIRECT)) {
2290 written = generic_file_direct_write(iocb, iov,
2291 &nr_segs, pos, ppos, count, ocount);
2292 if (written < 0 || written == count)
2293 goto out;
2294 /*
2295 * direct-io write to a hole: fall through to buffered I/O
2296 * for completing the rest of the request.
2297 */
2298 pos += written;
2299 count -= written;
2300 }
2302 written = generic_file_buffered_write(iocb, iov, nr_segs,
2303 pos, ppos, count, written);
2304 out:
2305 current->backing_dev_info = NULL;
2306 return written ? written : err;
2307 }
2308 EXPORT_SYMBOL(generic_file_aio_write_nolock);
2310 ssize_t
2311 generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2312 unsigned long nr_segs, loff_t *ppos)
2313 {
2314 struct file *file = iocb->ki_filp;
2315 struct address_space *mapping = file->f_mapping;
2316 struct inode *inode = mapping->host;
2317 ssize_t ret;
2318 loff_t pos = *ppos;
2320 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2322 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2323 int err;
2325 err = sync_page_range_nolock(inode, mapping, pos, ret);
2326 if (err < 0)
2327 ret = err;
2328 }
2329 return ret;
2330 }
2332 static ssize_t
2333 __generic_file_write_nolock(struct file *file, const struct iovec *iov,
2334 unsigned long nr_segs, loff_t *ppos)
2335 {
2336 struct kiocb kiocb;
2337 ssize_t ret;
2339 init_sync_kiocb(&kiocb, file);
2340 ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2341 if (ret == -EIOCBQUEUED)
2342 ret = wait_on_sync_kiocb(&kiocb);
2343 return ret;
2344 }
2346 ssize_t
2347 generic_file_write_nolock(struct file *file, const struct iovec *iov,
2348 unsigned long nr_segs, loff_t *ppos)
2349 {
2350 struct kiocb kiocb;
2351 ssize_t ret;
2353 init_sync_kiocb(&kiocb, file);
2354 ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2355 if (-EIOCBQUEUED == ret)
2356 ret = wait_on_sync_kiocb(&kiocb);
2357 return ret;
2358 }
2359 EXPORT_SYMBOL(generic_file_write_nolock);
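/*
 * The synchronous wrappers in this file follow one pattern: build a
 * synchronous kiocb with init_sync_kiocb(), call the aio variant, and if
 * it returns -EIOCBQUEUED wait for completion with wait_on_sync_kiocb().
 */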
2361 ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2362 size_t count, loff_t pos)
2363 {
2364 struct file *file = iocb->ki_filp;
2365 struct address_space *mapping = file->f_mapping;
2366 struct inode *inode = mapping->host;
2367 ssize_t ret;
2368 struct iovec local_iov = { .iov_base = (void __user *)buf,
2369 .iov_len = count };
2371 BUG_ON(iocb->ki_pos != pos);
2373 mutex_lock(&inode->i_mutex);
2374 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2375 &iocb->ki_pos);
2376 mutex_unlock(&inode->i_mutex);
2378 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2379 ssize_t err;
2381 err = sync_page_range(inode, mapping, pos, ret);
2382 if (err < 0)
2383 ret = err;
2384 }
2385 return ret;
2386 }
2387 EXPORT_SYMBOL(generic_file_aio_write);
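/*
 * For O_SYNC (or IS_SYNC) files, the bytes just written are written back
 * and waited upon with sync_page_range() over [pos, pos + ret) before the
 * write is reported complete.
 */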
2389 ssize_t generic_file_write(struct file *file, const char __user *buf,
2390 size_t count, loff_t *ppos)
2391 {
2392 struct address_space *mapping = file->f_mapping;
2393 struct inode *inode = mapping->host;
2394 ssize_t ret;
2395 struct iovec local_iov = { .iov_base = (void __user *)buf,
2396 .iov_len = count };
2398 mutex_lock(&inode->i_mutex);
2399 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2400 mutex_unlock(&inode->i_mutex);
2402 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2403 ssize_t err;
2405 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2406 if (err < 0)
2407 ret = err;
2408 }
2409 return ret;
2410 }
2411 EXPORT_SYMBOL(generic_file_write);
2413 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2414 unsigned long nr_segs, loff_t *ppos)
2415 {
2416 struct kiocb kiocb;
2417 ssize_t ret;
2419 init_sync_kiocb(&kiocb, filp);
2420 ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2421 if (-EIOCBQUEUED == ret)
2422 ret = wait_on_sync_kiocb(&kiocb);
2423 return ret;
2424 }
2425 EXPORT_SYMBOL(generic_file_readv);
2427 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2428 unsigned long nr_segs, loff_t *ppos)
2429 {
2430 struct address_space *mapping = file->f_mapping;
2431 struct inode *inode = mapping->host;
2432 ssize_t ret;
2434 mutex_lock(&inode->i_mutex);
2435 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2436 mutex_unlock(&inode->i_mutex);
2438 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2439 int err;
2441 err = sync_page_range(inode, mapping, *ppos - ret, ret);
2442 if (err < 0)
2443 ret = err;
2444 }
2445 return ret;
2446 }
2447 EXPORT_SYMBOL(generic_file_writev);
2449 /*
2450 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2451 * went wrong during pagecache shootdown.
2452 */
2453 static ssize_t
2454 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2455 loff_t offset, unsigned long nr_segs)
2456 {
2457 struct file *file = iocb->ki_filp;
2458 struct address_space *mapping = file->f_mapping;
2459 ssize_t retval;
2460 size_t write_len = 0;
2462 /*
2463 * If it's a write, unmap all mmappings of the file up-front. This
2464 * will cause any pte dirty bits to be propagated into the pageframes
2465 * for the subsequent filemap_write_and_wait().
2466 */
2467 if (rw == WRITE) {
2468 write_len = iov_length(iov, nr_segs);
2469 if (mapping_mapped(mapping))
2470 unmap_mapping_range(mapping, offset, write_len, 0);
2471 }
2473 retval = filemap_write_and_wait(mapping);
2474 if (retval == 0) {
2475 retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2476 offset, nr_segs);
2477 if (rw == WRITE && mapping->nrpages) {
2478 pgoff_t end = (offset + write_len - 1)
2479 >> PAGE_CACHE_SHIFT;
2480 int err = invalidate_inode_pages2_range(mapping,
2481 offset >> PAGE_CACHE_SHIFT, end);
2482 if (err)
2483 retval = err;
2484 }
2485 }
2486 return retval;
2487 }
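/*
 * The invalidate_inode_pages2_range() pass above is needed because a
 * direct write bypasses the page cache: any pages still cached over the
 * written range would otherwise serve stale data to later buffered
 * readers, so they are dropped once the direct I/O completes.
 */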