ia64/xen-unstable: old/xenolinux-2.4.16-sparse/mm/memory.c @ 235:d7d0a23b2e07

bitkeeper revision 1.93 (3e5a4e6bkPheUp3x1uufN2MS3LAB7A)

Latest and Greatest version of XenoLinux based on the Linux-2.4.21-pre4
kernel.
author: iap10@labyrinth.cl.cam.ac.uk
date: Mon Feb 24 16:55:07 2003 +0000
line source
1 /*
2 * linux/mm/memory.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus
10 */
12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus.
15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see.
19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */
23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */
31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de)
37 */
39 #include <linux/mm.h>
40 #include <linux/mman.h>
41 #include <linux/swap.h>
42 #include <linux/smp_lock.h>
43 #include <linux/swapctl.h>
44 #include <linux/iobuf.h>
45 #include <linux/highmem.h>
46 #include <linux/pagemap.h>
48 #include <asm/pgalloc.h>
49 #include <asm/uaccess.h>
50 #include <asm/tlb.h>
52 unsigned long max_mapnr;
53 unsigned long num_physpages;
54 void * high_memory;
55 struct page *highmem_start_page;
57 /*
58 * We special-case the C-O-W ZERO_PAGE, because it's such
59 * a common occurrence (no need to read the page to know
60 * that it's zero - better for the cache and memory subsystem).
61 */
62 static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
63 {
64 if (from == ZERO_PAGE(address)) {
65 clear_user_highpage(to, address);
66 return;
67 }
68 copy_user_highpage(to, from, address);
69 }
71 mem_map_t * mem_map;
73 /*
74 * Called by TLB shootdown
75 */
76 void __free_pte(pte_t pte)
77 {
78 struct page *page = pte_page(pte);
79 if ((!VALID_PAGE(page)) || PageReserved(page))
80 return;
81 if (pte_dirty(pte))
82 set_page_dirty(page);
83 free_page_and_swap_cache(page);
84 }
87 /*
88 * Note: this doesn't free the actual pages themselves. That
89 * has been handled earlier when unmapping all the memory regions.
90 */
91 static inline void free_one_pmd(pmd_t * dir)
92 {
93 pte_t * pte;
95 if (pmd_none(*dir))
96 return;
97 if (pmd_bad(*dir)) {
98 pmd_ERROR(*dir);
99 pmd_clear(dir);
100 return;
101 }
102 pte = pte_offset(dir, 0);
103 pmd_clear(dir);
104 pte_free(pte);
105 }
107 static inline void free_one_pgd(pgd_t * dir)
108 {
109 int j;
110 pmd_t * pmd;
112 if (pgd_none(*dir))
113 return;
114 if (pgd_bad(*dir)) {
115 pgd_ERROR(*dir);
116 pgd_clear(dir);
117 return;
118 }
119 pmd = pmd_offset(dir, 0);
120 pgd_clear(dir);
121 for (j = 0; j < PTRS_PER_PMD ; j++) {
122 prefetchw(pmd+j+(PREFETCH_STRIDE/16));
123 free_one_pmd(pmd+j);
124 }
125 pmd_free(pmd);
126 }
128 /* Low and high watermarks for page table cache.
129 The system should try to have pgt_water[0] <= cache elements <= pgt_water[1]
130 */
131 int pgt_cache_water[2] = { 25, 50 };
133 /* Returns the number of pages freed */
134 int check_pgt_cache(void)
135 {
136 return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
137 }
140 /*
141 * This function clears all user-level page tables of a process - this
142 * is needed by execve(), so that old pages aren't in the way.
143 */
144 void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
145 {
146 pgd_t * page_dir = mm->pgd;
148 spin_lock(&mm->page_table_lock);
149 page_dir += first;
150 do {
151 free_one_pgd(page_dir);
152 page_dir++;
153 } while (--nr);
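/* XenoLinux appears to queue page-table updates instead of writing PTEs
 * directly; this flush presumably pushes the queued updates to the
 * hypervisor before the page_table_lock is released. */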
154 XENO_flush_page_update_queue();
155 spin_unlock(&mm->page_table_lock);
157 /* keep the page table cache within bounds */
158 check_pgt_cache();
159 }
161 #define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
162 #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
164 /*
165 * copy one vm_area from one task to the other. Assumes the page tables
166 * already present in the new task to be cleared in the whole range
167 * covered by this vma.
168 *
169 * 08Jan98 Merged into one routine from several inline routines to reduce
170 * variable count and make things faster. -jj
171 *
172 * dst->page_table_lock is held on entry and exit,
173 * but may be dropped within pmd_alloc() and pte_alloc().
174 */
175 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
176 struct vm_area_struct *vma)
177 {
178 pgd_t * src_pgd, * dst_pgd;
179 unsigned long address = vma->vm_start;
180 unsigned long end = vma->vm_end;
181 unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE;
183 src_pgd = pgd_offset(src, address)-1;
184 dst_pgd = pgd_offset(dst, address)-1;
186 for (;;) {
187 pmd_t * src_pmd, * dst_pmd;
189 src_pgd++; dst_pgd++;
191 /* copy_pmd_range */
193 if (pgd_none(*src_pgd))
194 goto skip_copy_pmd_range;
195 if (pgd_bad(*src_pgd)) {
196 pgd_ERROR(*src_pgd);
197 pgd_clear(src_pgd);
198 skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
199 if (!address || (address >= end))
200 goto out;
201 continue;
202 }
204 src_pmd = pmd_offset(src_pgd, address);
205 dst_pmd = pmd_alloc(dst, dst_pgd, address);
206 if (!dst_pmd)
207 goto nomem;
209 do {
210 pte_t * src_pte, * dst_pte;
212 /* copy_pte_range */
214 if (pmd_none(*src_pmd))
215 goto skip_copy_pte_range;
216 if (pmd_bad(*src_pmd)) {
217 pmd_ERROR(*src_pmd);
218 pmd_clear(src_pmd);
219 skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
220 if (address >= end)
221 goto out;
222 goto cont_copy_pmd_range;
223 }
225 src_pte = pte_offset(src_pmd, address);
226 dst_pte = pte_alloc(dst, dst_pmd, address);
227 if (!dst_pte)
228 goto nomem;
230 spin_lock(&src->page_table_lock);
231 do {
232 pte_t pte = *src_pte;
233 struct page *ptepage;
235 /* copy_one_pte */
237 if (pte_none(pte))
238 goto cont_copy_pte_range_noset;
239 if (!pte_present(pte)) {
240 swap_duplicate(pte_to_swp_entry(pte));
241 goto cont_copy_pte_range;
242 }
243 ptepage = pte_page(pte);
244 if ((!VALID_PAGE(ptepage)) ||
245 PageReserved(ptepage))
246 goto cont_copy_pte_range;
248 /* If it's a COW mapping, write protect it both in the parent and the child */
249 if (cow) {
250 /* XENO modification: modified ordering here to avoid RaW hazard. */
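/* With queued PTE updates, the write-protect below may not be applied
 * immediately, so the pte is read first and write-protected in the local
 * copy; re-reading *src_pte after ptep_set_wrprotect() could presumably
 * still return the old, writable value. */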
251 pte = *src_pte;
252 pte = pte_wrprotect(pte);
253 ptep_set_wrprotect(src_pte);
254 }
256 /* If it's a shared mapping, mark it clean in the child */
257 if (vma->vm_flags & VM_SHARED)
258 pte = pte_mkclean(pte);
259 pte = pte_mkold(pte);
260 get_page(ptepage);
261 dst->rss++;
263 cont_copy_pte_range: set_pte(dst_pte, pte);
264 cont_copy_pte_range_noset: address += PAGE_SIZE;
265 if (address >= end)
266 goto out_unlock;
267 src_pte++;
268 dst_pte++;
269 } while ((unsigned long)src_pte & PTE_TABLE_MASK);
270 spin_unlock(&src->page_table_lock);
272 cont_copy_pmd_range: src_pmd++;
273 dst_pmd++;
274 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
275 }
276 out_unlock:
277 spin_unlock(&src->page_table_lock);
278 out:
279 return 0;
280 nomem:
281 return -ENOMEM;
282 }
284 /*
285 * Return indicates whether a page was freed so caller can adjust rss
286 */
287 static inline void forget_pte(pte_t page)
288 {
289 if (!pte_none(page)) {
290 printk("forget_pte: old mapping existed!\n");
291 BUG();
292 }
293 }
295 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
296 {
297 unsigned long offset;
298 pte_t * ptep;
299 int freed = 0;
301 if (pmd_none(*pmd))
302 return 0;
303 if (pmd_bad(*pmd)) {
304 pmd_ERROR(*pmd);
305 pmd_clear(pmd);
306 return 0;
307 }
308 ptep = pte_offset(pmd, address);
309 offset = address & ~PMD_MASK;
310 if (offset + size > PMD_SIZE)
311 size = PMD_SIZE - offset;
312 size &= PAGE_MASK;
313 for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
314 pte_t pte = *ptep;
315 if (pte_none(pte))
316 continue;
317 if (pte_present(pte)) {
318 struct page *page = pte_page(pte);
319 if (VALID_PAGE(page) && !PageReserved(page))
320 freed ++;
321 /* This will eventually call __free_pte on the pte. */
322 tlb_remove_page(tlb, ptep, address + offset);
323 } else {
324 free_swap_and_cache(pte_to_swp_entry(pte));
325 pte_clear(ptep);
326 }
327 }
329 return freed;
330 }
332 static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size)
333 {
334 pmd_t * pmd;
335 unsigned long end;
336 int freed;
338 if (pgd_none(*dir))
339 return 0;
340 if (pgd_bad(*dir)) {
341 pgd_ERROR(*dir);
342 pgd_clear(dir);
343 return 0;
344 }
345 pmd = pmd_offset(dir, address);
346 end = address + size;
347 if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
348 end = ((address + PGDIR_SIZE) & PGDIR_MASK);
349 freed = 0;
350 do {
351 freed += zap_pte_range(tlb, pmd, address, end - address);
352 address = (address + PMD_SIZE) & PMD_MASK;
353 pmd++;
354 } while (address < end);
355 return freed;
356 }
358 /*
359 * remove user pages in a given range.
360 */
361 void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
362 {
363 mmu_gather_t *tlb;
364 pgd_t * dir;
365 unsigned long start = address, end = address + size;
366 int freed = 0;
368 dir = pgd_offset(mm, address);
370 /*
371 * This is a long-lived spinlock. That's fine.
372 * There's no contention, because the page table
373 * lock only protects against kswapd anyway, and
374 * even if kswapd happened to be looking at this
375 * process we _want_ it to get stuck.
376 */
377 if (address >= end)
378 BUG();
379 spin_lock(&mm->page_table_lock);
380 flush_cache_range(mm, address, end);
381 tlb = tlb_gather_mmu(mm);
383 do {
384 freed += zap_pmd_range(tlb, dir, address, end - address);
385 address = (address + PGDIR_SIZE) & PGDIR_MASK;
386 dir++;
387 } while (address && (address < end));
389 /* this will flush any remaining tlb entries */
390 tlb_finish_mmu(tlb, start, end);
392 /*
393 * Update rss for the mm_struct (not necessarily current->mm)
394 * Notice that rss is an unsigned long.
395 */
396 if (mm->rss > freed)
397 mm->rss -= freed;
398 else
399 mm->rss = 0;
400 spin_unlock(&mm->page_table_lock);
401 }
404 /*
405 * Do a quick page-table lookup for a single page.
406 */
407 static struct page * follow_page(unsigned long address, int write)
408 {
409 pgd_t *pgd;
410 pmd_t *pmd;
411 pte_t *ptep, pte;
413 pgd = pgd_offset(current->mm, address);
414 if (pgd_none(*pgd) || pgd_bad(*pgd))
415 goto out;
417 pmd = pmd_offset(pgd, address);
418 if (pmd_none(*pmd) || pmd_bad(*pmd))
419 goto out;
421 ptep = pte_offset(pmd, address);
422 if (!ptep)
423 goto out;
425 pte = *ptep;
426 if (pte_present(pte)) {
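/* A write lookup demands a pte that is both writable and dirty; otherwise
 * the caller in this file (map_user_kiobuf) falls back to handle_mm_fault()
 * so that COW breaking and dirtying happen before the page is pinned. */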
427 if (!write ||
428 (pte_write(pte) && pte_dirty(pte)))
429 return pte_page(pte);
430 }
432 out:
433 return 0;
434 }
436 /*
437 * Given a physical address, is there a useful struct page pointing to
438 * it? This may become more complex in the future if we start dealing
439 * with IO-aperture pages in kiobufs.
440 */
442 static inline struct page * get_page_map(struct page *page)
443 {
444 if (!VALID_PAGE(page))
445 return 0;
446 return page;
447 }
449 /*
450 * Force in an entire range of pages from the current process's user VA,
451 * and pin them in physical memory.
452 */
454 #define dprintk(x...)
455 int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
456 {
457 unsigned long ptr, end;
458 int err;
459 struct mm_struct * mm;
460 struct vm_area_struct * vma = 0;
461 struct page * map;
462 int i;
463 int datain = (rw == READ);
465 /* Make sure the iobuf is not already mapped somewhere. */
466 if (iobuf->nr_pages)
467 return -EINVAL;
469 mm = current->mm;
470 dprintk ("map_user_kiobuf: begin\n");
472 ptr = va & PAGE_MASK;
473 end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
474 err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
475 if (err)
476 return err;
478 down_read(&mm->mmap_sem);
480 err = -EFAULT;
481 iobuf->locked = 0;
482 iobuf->offset = va & ~PAGE_MASK;
483 iobuf->length = len;
485 i = 0;
487 /*
488 * First of all, try to fault in all of the necessary pages
489 */
490 while (ptr < end) {
491 if (!vma || ptr >= vma->vm_end) {
492 vma = find_vma(current->mm, ptr);
493 if (!vma)
494 goto out_unlock;
495 if (vma->vm_start > ptr) {
496 if (!(vma->vm_flags & VM_GROWSDOWN))
497 goto out_unlock;
498 if (expand_stack(vma, ptr))
499 goto out_unlock;
500 }
501 if (((datain) && (!(vma->vm_flags & VM_WRITE))) ||
502 (!(vma->vm_flags & VM_READ))) {
503 err = -EACCES;
504 goto out_unlock;
505 }
506 }
507 spin_lock(&mm->page_table_lock);
508 while (!(map = follow_page(ptr, datain))) {
509 int ret;
511 spin_unlock(&mm->page_table_lock);
512 ret = handle_mm_fault(current->mm, vma, ptr, datain);
513 if (ret <= 0) {
514 if (!ret)
515 goto out_unlock;
516 else {
517 err = -ENOMEM;
518 goto out_unlock;
519 }
520 }
521 spin_lock(&mm->page_table_lock);
522 }
523 map = get_page_map(map);
524 if (map) {
525 flush_dcache_page(map);
526 page_cache_get(map);
527 } else
528 printk (KERN_INFO "Mapped page missing [%d]\n", i);
529 spin_unlock(&mm->page_table_lock);
530 iobuf->maplist[i] = map;
531 iobuf->nr_pages = ++i;
533 ptr += PAGE_SIZE;
534 }
536 up_read(&mm->mmap_sem);
537 dprintk ("map_user_kiobuf: end OK\n");
538 return 0;
540 out_unlock:
541 up_read(&mm->mmap_sem);
542 unmap_kiobuf(iobuf);
543 dprintk ("map_user_kiobuf: end %d\n", err);
544 return err;
545 }
547 /*
548 * Mark all of the pages in a kiobuf as dirty
549 *
550 * We need to be able to deal with short reads from disk: if an IO error
551 * occurs, the number of bytes read into memory may be less than the
552 * size of the kiobuf, so we have to stop marking pages dirty once the
553 * requested byte count has been reached.
554 */
556 void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
557 {
558 int index, offset, remaining;
559 struct page *page;
561 index = iobuf->offset >> PAGE_SHIFT;
562 offset = iobuf->offset & ~PAGE_MASK;
563 remaining = bytes;
564 if (remaining > iobuf->length)
565 remaining = iobuf->length;
567 while (remaining > 0 && index < iobuf->nr_pages) {
568 page = iobuf->maplist[index];
570 if (!PageReserved(page))
571 SetPageDirty(page);
573 remaining -= (PAGE_SIZE - offset);
574 offset = 0;
575 index++;
576 }
577 }
579 /*
580 * Unmap all of the pages referenced by a kiobuf. We release the pages,
581 * and unlock them if they were locked.
582 */
584 void unmap_kiobuf (struct kiobuf *iobuf)
585 {
586 int i;
587 struct page *map;
589 for (i = 0; i < iobuf->nr_pages; i++) {
590 map = iobuf->maplist[i];
591 if (map) {
592 if (iobuf->locked)
593 UnlockPage(map);
594 page_cache_release(map);
595 }
596 }
598 iobuf->nr_pages = 0;
599 iobuf->locked = 0;
600 }
603 /*
604 * Lock down all of the pages of a kiovec for IO.
605 *
606 * If any page is mapped twice in the kiovec, we return the error -EINVAL.
607 *
608 * The optional wait parameter causes the lock call to block until all
609 * pages can be locked if set. If wait==0, the lock operation is
610 * aborted if any locked pages are found and -EAGAIN is returned.
611 */
613 int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
614 {
615 struct kiobuf *iobuf;
616 int i, j;
617 struct page *page, **ppage;
618 int doublepage = 0;
619 int repeat = 0;
621 repeat:
623 for (i = 0; i < nr; i++) {
624 iobuf = iovec[i];
626 if (iobuf->locked)
627 continue;
629 ppage = iobuf->maplist;
630 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
631 page = *ppage;
632 if (!page)
633 continue;
635 if (TryLockPage(page)) {
636 while (j--) {
637 struct page *tmp = *--ppage;
638 if (tmp)
639 UnlockPage(tmp);
640 }
641 goto retry;
642 }
643 }
644 iobuf->locked = 1;
645 }
647 return 0;
649 retry:
651 /*
652 * We couldn't lock one of the pages. Undo the locking so far,
653 * wait on the page we got to, and try again.
654 */
656 unlock_kiovec(nr, iovec);
657 if (!wait)
658 return -EAGAIN;
660 /*
661 * Did the release also unlock the page we got stuck on?
662 */
663 if (!PageLocked(page)) {
664 /*
665 * If so, we may well have the page mapped twice
666 * in the IO address range. Bad news. Of
667 * course, it _might_ just be a coincidence,
668 * but if it happens more than once, chances
669 * are we have a double-mapped page.
670 */
671 if (++doublepage >= 3)
672 return -EINVAL;
674 /* Try again... */
675 wait_on_page(page);
676 }
678 if (++repeat < 16)
679 goto repeat;
680 return -EAGAIN;
681 }
683 /*
684 * Unlock all of the pages of a kiovec after IO.
685 */
687 int unlock_kiovec(int nr, struct kiobuf *iovec[])
688 {
689 struct kiobuf *iobuf;
690 int i, j;
691 struct page *page, **ppage;
693 for (i = 0; i < nr; i++) {
694 iobuf = iovec[i];
696 if (!iobuf->locked)
697 continue;
698 iobuf->locked = 0;
700 ppage = iobuf->maplist;
701 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
702 page = *ppage;
703 if (!page)
704 continue;
705 UnlockPage(page);
706 }
707 }
708 return 0;
709 }
711 static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
712 unsigned long size, pgprot_t prot)
713 {
714 unsigned long end;
716 address &= ~PMD_MASK;
717 end = address + size;
718 if (end > PMD_SIZE)
719 end = PMD_SIZE;
720 do {
721 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
722 pte_t oldpage = ptep_get_and_clear(pte);
723 set_pte(pte, zero_pte);
724 forget_pte(oldpage);
725 address += PAGE_SIZE;
726 pte++;
727 } while (address && (address < end));
728 }
730 static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
731 unsigned long size, pgprot_t prot)
732 {
733 unsigned long end;
735 address &= ~PGDIR_MASK;
736 end = address + size;
737 if (end > PGDIR_SIZE)
738 end = PGDIR_SIZE;
739 do {
740 pte_t * pte = pte_alloc(mm, pmd, address);
741 if (!pte)
742 return -ENOMEM;
743 zeromap_pte_range(pte, address, end - address, prot);
744 address = (address + PMD_SIZE) & PMD_MASK;
745 pmd++;
746 } while (address && (address < end));
747 return 0;
748 }
750 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
751 {
752 int error = 0;
753 pgd_t * dir;
754 unsigned long beg = address;
755 unsigned long end = address + size;
756 struct mm_struct *mm = current->mm;
758 dir = pgd_offset(mm, address);
759 flush_cache_range(mm, beg, end);
760 if (address >= end)
761 BUG();
763 spin_lock(&mm->page_table_lock);
764 do {
765 pmd_t *pmd = pmd_alloc(mm, dir, address);
766 error = -ENOMEM;
767 if (!pmd)
768 break;
769 error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
770 if (error)
771 break;
772 address = (address + PGDIR_SIZE) & PGDIR_MASK;
773 dir++;
774 } while (address && (address < end));
775 spin_unlock(&mm->page_table_lock);
776 flush_tlb_range(mm, beg, end);
777 return error;
778 }
780 /*
781 * maps a range of physical memory into the requested pages. the old
782 * mappings are removed. any references to nonexistent pages results
783 * in null mappings (currently treated as "copy-on-access")
784 */
785 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
786 unsigned long phys_addr, pgprot_t prot)
787 {
788 unsigned long end;
790 address &= ~PMD_MASK;
791 end = address + size;
792 if (end > PMD_SIZE)
793 end = PMD_SIZE;
794 do {
795 struct page *page;
796 pte_t oldpage;
797 oldpage = ptep_get_and_clear(pte);
799 page = virt_to_page(__va(phys_addr));
800 if ((!VALID_PAGE(page)) || PageReserved(page))
801 set_pte(pte, mk_pte_phys(phys_addr, prot));
802 forget_pte(oldpage);
803 address += PAGE_SIZE;
804 phys_addr += PAGE_SIZE;
805 pte++;
806 } while (address && (address < end));
807 }
809 static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
810 unsigned long phys_addr, pgprot_t prot)
811 {
812 unsigned long end;
814 address &= ~PGDIR_MASK;
815 end = address + size;
816 if (end > PGDIR_SIZE)
817 end = PGDIR_SIZE;
818 phys_addr -= address;
819 do {
820 pte_t * pte = pte_alloc(mm, pmd, address);
821 if (!pte)
822 return -ENOMEM;
823 remap_pte_range(pte, address, end - address, address + phys_addr, prot);
824 address = (address + PMD_SIZE) & PMD_MASK;
825 pmd++;
826 } while (address && (address < end));
827 return 0;
828 }
830 /* Note: this is only safe if the mm semaphore is held when called. */
831 int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
832 {
833 int error = 0;
834 pgd_t * dir;
835 unsigned long beg = from;
836 unsigned long end = from + size;
837 struct mm_struct *mm = current->mm;
839 phys_addr -= from;
840 dir = pgd_offset(mm, from);
841 flush_cache_range(mm, beg, end);
842 if (from >= end)
843 BUG();
845 spin_lock(&mm->page_table_lock);
846 do {
847 pmd_t *pmd = pmd_alloc(mm, dir, from);
848 error = -ENOMEM;
849 if (!pmd)
850 break;
851 error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
852 if (error)
853 break;
854 from = (from + PGDIR_SIZE) & PGDIR_MASK;
855 dir++;
856 } while (from && (from < end));
857 spin_unlock(&mm->page_table_lock);
858 flush_tlb_range(mm, beg, end);
859 return error;
860 }
862 /*
863 * Establish a new mapping:
864 * - flush the old one
865 * - update the page tables
866 * - inform the TLB about the new one
867 *
868 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
869 */
870 static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
871 {
872 set_pte(page_table, entry);
873 flush_tlb_page(vma, address);
874 update_mmu_cache(vma, address, entry);
875 }
877 /*
878 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
879 */
880 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
881 pte_t *page_table)
882 {
883 flush_page_to_ram(new_page);
884 flush_cache_page(vma, address);
885 establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
886 }
888 /*
889 * This routine handles present pages, when users try to write
890 * to a shared page. It is done by copying the page to a new address
891 * and decrementing the shared-page counter for the old page.
892 *
893 * Goto-purists beware: the only reason for goto's here is that it results
894 * in better assembly code.. The "default" path will see no jumps at all.
895 *
896 * Note that this routine assumes that the protection checks have been
897 * done by the caller (the low-level page fault routine in most cases).
898 * Thus we can safely just mark it writable once we've done any necessary
899 * COW.
900 *
901 * We also mark the page dirty at this point even though the page will
902 * change only once the write actually happens. This avoids a few races,
903 * and potentially makes it more efficient.
904 *
905 * We hold the mm semaphore and the page_table_lock on entry and exit
906 * with the page_table_lock released.
907 */
908 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
909 unsigned long address, pte_t *page_table, pte_t pte)
910 {
911 struct page *old_page, *new_page;
913 old_page = pte_page(pte);
914 if (!VALID_PAGE(old_page))
915 goto bad_wp_page;
917 if (!TryLockPage(old_page)) {
918 int reuse = can_share_swap_page(old_page);
919 unlock_page(old_page);
920 if (reuse) {
921 flush_cache_page(vma, address);
922 establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
923 spin_unlock(&mm->page_table_lock);
924 return 1; /* Minor fault */
925 }
926 }
928 /*
929 * Ok, we need to copy. Oh, well..
930 */
931 page_cache_get(old_page);
932 spin_unlock(&mm->page_table_lock);
934 new_page = alloc_page(GFP_HIGHUSER);
935 if (!new_page)
936 goto no_mem;
937 copy_cow_page(old_page,new_page,address);
939 /*
940 * Re-check the pte - we dropped the lock
941 */
942 spin_lock(&mm->page_table_lock);
943 if (pte_same(*page_table, pte)) {
944 if (PageReserved(old_page))
945 ++mm->rss;
946 break_cow(vma, new_page, address, page_table);
947 lru_cache_add(new_page);
949 /* Free the old page.. */
950 new_page = old_page;
951 }
952 spin_unlock(&mm->page_table_lock);
953 page_cache_release(new_page);
954 page_cache_release(old_page);
955 return 1; /* Minor fault */
957 bad_wp_page:
958 spin_unlock(&mm->page_table_lock);
959 printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
960 return -1;
961 no_mem:
962 page_cache_release(old_page);
963 return -1;
964 }
966 static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff)
967 {
968 do {
969 struct mm_struct *mm = mpnt->vm_mm;
970 unsigned long start = mpnt->vm_start;
971 unsigned long end = mpnt->vm_end;
972 unsigned long len = end - start;
973 unsigned long diff;
975 /* mapping wholly truncated? */
976 if (mpnt->vm_pgoff >= pgoff) {
977 zap_page_range(mm, start, len);
978 continue;
979 }
981 /* mapping wholly unaffected? */
982 len = len >> PAGE_SHIFT;
983 diff = pgoff - mpnt->vm_pgoff;
984 if (diff >= len)
985 continue;
987 /* Ok, partially affected.. */
988 start += diff << PAGE_SHIFT;
989 len = (len - diff) << PAGE_SHIFT;
990 zap_page_range(mm, start, len);
991 } while ((mpnt = mpnt->vm_next_share) != NULL);
992 }
994 /*
995 * Handle all mappings that got truncated by a "truncate()"
996 * system call.
997 *
998 * NOTE! We have to be ready to update the memory sharing
999 * between the file and the memory map for a potential last
1000 * incomplete page. Ugly, but necessary.
1001 */
1002 int vmtruncate(struct inode * inode, loff_t offset)
1003 {
1004 unsigned long pgoff;
1005 struct address_space *mapping = inode->i_mapping;
1006 unsigned long limit;
1008 if (inode->i_size < offset)
1009 goto do_expand;
1010 inode->i_size = offset;
1011 spin_lock(&mapping->i_shared_lock);
1012 if (!mapping->i_mmap && !mapping->i_mmap_shared)
1013 goto out_unlock;
1015 pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1016 if (mapping->i_mmap != NULL)
1017 vmtruncate_list(mapping->i_mmap, pgoff);
1018 if (mapping->i_mmap_shared != NULL)
1019 vmtruncate_list(mapping->i_mmap_shared, pgoff);
1021 out_unlock:
1022 spin_unlock(&mapping->i_shared_lock);
1023 truncate_inode_pages(mapping, offset);
1024 goto out_truncate;
1026 do_expand:
1027 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1028 if (limit != RLIM_INFINITY) {
1029 if (inode->i_size >= limit) {
1030 send_sig(SIGXFSZ, current, 0);
1031 goto out;
1032 }
1033 if (offset > limit) {
1034 send_sig(SIGXFSZ, current, 0);
1035 offset = limit;
1036 }
1037 }
1038 inode->i_size = offset;
1040 out_truncate:
1041 if (inode->i_op && inode->i_op->truncate) {
1042 lock_kernel();
1043 inode->i_op->truncate(inode);
1044 unlock_kernel();
1045 }
1046 out:
1047 return 0;
1048 }
1050 /*
1051 * Primitive swap readahead code. We simply read an aligned block of
1052 * (1 << page_cluster) entries in the swap area. This method is chosen
1053 * because it doesn't cost us any seek time. We also make sure to queue
1054 * the 'original' request together with the readahead ones...
1055 */
1056 void swapin_readahead(swp_entry_t entry)
1057 {
1058 int i, num;
1059 struct page *new_page;
1060 unsigned long offset;
1062 /*
1063 * Get the number of handles we should do readahead io to.
1064 */
1065 num = valid_swaphandles(entry, &offset);
1066 for (i = 0; i < num; offset++, i++) {
1067 /* Ok, do the async read-ahead now */
1068 new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
1069 if (!new_page)
1070 break;
1071 page_cache_release(new_page);
1072 }
1073 return;
1074 }
1076 /*
1077 * We hold the mm semaphore and the page_table_lock on entry and
1078 * should release the pagetable lock on exit..
1079 */
1080 static int do_swap_page(struct mm_struct * mm,
1081 struct vm_area_struct * vma, unsigned long address,
1082 pte_t * page_table, pte_t orig_pte, int write_access)
1083 {
1084 struct page *page;
1085 swp_entry_t entry = pte_to_swp_entry(orig_pte);
1086 pte_t pte;
1087 int ret = 1;
1089 spin_unlock(&mm->page_table_lock);
1090 page = lookup_swap_cache(entry);
1091 if (!page) {
1092 swapin_readahead(entry);
1093 page = read_swap_cache_async(entry);
1094 if (!page) {
1095 /*
1096 * Back out if somebody else faulted in this pte while
1097 * we released the page table lock.
1098 */
1099 int retval;
1100 spin_lock(&mm->page_table_lock);
1101 retval = pte_same(*page_table, orig_pte) ? -1 : 1;
1102 spin_unlock(&mm->page_table_lock);
1103 return retval;
1104 }
1106 /* Had to read the page from swap area: Major fault */
1107 ret = 2;
1108 }
1110 lock_page(page);
1112 /*
1113 * Back out if somebody else faulted in this pte while we
1114 * released the page table lock.
1115 */
1116 spin_lock(&mm->page_table_lock);
1117 if (!pte_same(*page_table, orig_pte)) {
1118 spin_unlock(&mm->page_table_lock);
1119 unlock_page(page);
1120 page_cache_release(page);
1121 return 1;
1122 }
1124 /* The page isn't present yet, go ahead with the fault. */
1126 swap_free(entry);
1127 if (vm_swap_full())
1128 remove_exclusive_swap_page(page);
1130 mm->rss++;
1131 pte = mk_pte(page, vma->vm_page_prot);
1132 if (write_access && can_share_swap_page(page))
1133 pte = pte_mkdirty(pte_mkwrite(pte));
1134 unlock_page(page);
1136 flush_page_to_ram(page);
1137 flush_icache_page(vma, page);
1138 set_pte(page_table, pte);
1140 /* No need to invalidate - it was non-present before */
1141 update_mmu_cache(vma, address, pte);
1142 XENO_flush_page_update_queue();
1143 spin_unlock(&mm->page_table_lock);
1144 return ret;
1145 }
1147 /*
1148 * We are called with the MM semaphore and page_table_lock
1149 * spinlock held to protect against concurrent faults in
1150 * multithreaded programs.
1151 */
1152 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
1153 {
1154 pte_t entry;
1156 /* Read-only mapping of ZERO_PAGE. */
1157 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1159 /* ..except if it's a write access */
1160 if (write_access) {
1161 struct page *page;
1163 /* Allocate our own private page. */
1164 spin_unlock(&mm->page_table_lock);
1166 page = alloc_page(GFP_HIGHUSER);
1167 if (!page)
1168 goto no_mem;
1169 clear_user_highpage(page, addr);
1171 spin_lock(&mm->page_table_lock);
1172 if (!pte_none(*page_table)) {
1173 page_cache_release(page);
1174 spin_unlock(&mm->page_table_lock);
1175 return 1;
1176 }
1177 mm->rss++;
1178 flush_page_to_ram(page);
1179 entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
1180 lru_cache_add(page);
1181 }
1183 set_pte(page_table, entry);
1185 /* No need to invalidate - it was non-present before */
1186 update_mmu_cache(vma, addr, entry);
1187 XENO_flush_page_update_queue();
1188 spin_unlock(&mm->page_table_lock);
1189 return 1; /* Minor fault */
1191 no_mem:
1192 return -1;
1193 }
1195 /*
1196 * do_no_page() tries to create a new page mapping. It aggressively
1197 * tries to share with existing pages, but makes a separate copy if
1198 * the "write_access" parameter is true in order to avoid the next
1199 * page fault.
1201 * As this is called only for pages that do not currently exist, we
1202 * do not need to flush old virtual caches or the TLB.
1204 * This is called with the MM semaphore held and the page table
1205 * spinlock held. Exit with the spinlock released.
1206 */
1207 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
1208 unsigned long address, int write_access, pte_t *page_table)
1209 {
1210 struct page * new_page;
1211 pte_t entry;
1213 if (!vma->vm_ops || !vma->vm_ops->nopage)
1214 return do_anonymous_page(mm, vma, page_table, write_access, address);
1215 spin_unlock(&mm->page_table_lock);
1217 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
1219 if (new_page == NULL) /* no page was available -- SIGBUS */
1220 return 0;
1221 if (new_page == NOPAGE_OOM)
1222 return -1;
1224 /*
1225 * Should we do an early C-O-W break?
1226 */
1227 if (write_access && !(vma->vm_flags & VM_SHARED)) {
1228 struct page * page = alloc_page(GFP_HIGHUSER);
1229 if (!page)
1230 return -1;
1231 copy_highpage(page, new_page);
1232 page_cache_release(new_page);
1233 lru_cache_add(page);
1234 new_page = page;
1235 }
1237 spin_lock(&mm->page_table_lock);
1238 /*
1239 * This silly early PAGE_DIRTY setting removes a race
1240 * due to the bad i386 page protection. But it's valid
1241 * for other architectures too.
1243 * Note that if write_access is true, we either now have
1244 * an exclusive copy of the page, or this is a shared mapping,
1245 * so we can make it writable and dirty to avoid having to
1246 * handle that later.
1247 */
1248 /* Only go through if we didn't race with anybody else... */
1249 if (pte_none(*page_table)) {
1250 ++mm->rss;
1251 flush_page_to_ram(new_page);
1252 flush_icache_page(vma, new_page);
1253 entry = mk_pte(new_page, vma->vm_page_prot);
1254 if (write_access)
1255 entry = pte_mkwrite(pte_mkdirty(entry));
1256 set_pte(page_table, entry);
1257 } else {
1258 /* One of our sibling threads was faster, back out. */
1259 page_cache_release(new_page);
1260 spin_unlock(&mm->page_table_lock);
1261 return 1;
1262 }
1264 /* no need to invalidate: a not-present page shouldn't be cached */
1265 update_mmu_cache(vma, address, entry);
1266 XENO_flush_page_update_queue();
1267 spin_unlock(&mm->page_table_lock);
1268 return 2; /* Major fault */
1269 }
1271 /*
1272 * These routines also need to handle stuff like marking pages dirty
1273 * and/or accessed for architectures that don't do it in hardware (most
1274 * RISC architectures). The early dirtying is also good on the i386.
1276 * There is also a hook called "update_mmu_cache()" that architectures
1277 * with external mmu caches can use to update those (ie the Sparc or
1278 * PowerPC hashed page tables that act as extended TLBs).
1280 * Note the "page_table_lock". It is to protect against kswapd removing
1281 * pages from under us. Note that kswapd only ever _removes_ pages, never
1282 * adds them. As such, once we have noticed that the page is not present,
1283 * we can drop the lock early.
1285 * The adding of pages is protected by the MM semaphore (which we hold),
1286 * so we don't need to worry about a page being suddenly been added into
1287 * our VM.
1289 * We enter with the pagetable spinlock held, we are supposed to
1290 * release it when done.
1291 */
1292 static inline int handle_pte_fault(struct mm_struct *mm,
1293 struct vm_area_struct * vma, unsigned long address,
1294 int write_access, pte_t * pte)
1295 {
1296 pte_t entry;
1298 entry = *pte;
1299 if (!pte_present(entry)) {
1300 /*
1301 * If it truly wasn't present, we know that kswapd
1302 * and the PTE updates will not touch it later. So
1303 * drop the lock.
1304 */
1305 if (pte_none(entry))
1306 return do_no_page(mm, vma, address, write_access, pte);
1307 return do_swap_page(mm, vma, address, pte, entry, write_access);
1308 }
1310 if (write_access) {
1311 if (!pte_write(entry))
1312 return do_wp_page(mm, vma, address, pte, entry);
1314 entry = pte_mkdirty(entry);
1315 }
1316 entry = pte_mkyoung(entry);
1317 establish_pte(vma, address, pte, entry);
1318 XENO_flush_page_update_queue();
1319 spin_unlock(&mm->page_table_lock);
1320 return 1;
1321 }
1323 /*
1324 * By the time we get here, we already hold the mm semaphore
1325 */
1326 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
1327 unsigned long address, int write_access)
1328 {
1329 pgd_t *pgd;
1330 pmd_t *pmd;
1332 current->state = TASK_RUNNING;
1333 pgd = pgd_offset(mm, address);
1335 /*
1336 * We need the page table lock to synchronize with kswapd
1337 * and the SMP-safe atomic PTE updates.
1338 */
1339 spin_lock(&mm->page_table_lock);
1340 pmd = pmd_alloc(mm, pgd, address);
1342 if (pmd) {
1343 pte_t * pte = pte_alloc(mm, pmd, address);
1344 if (pte)
1345 return handle_pte_fault(mm, vma, address, write_access, pte);
1346 }
1347 spin_unlock(&mm->page_table_lock);
1348 return -1;
1349 }
1351 /*
1352 * Allocate page middle directory.
1354 * We've already handled the fast-path in-line, and we own the
1355 * page table lock.
1357 * On a two-level page table, this ends up actually being entirely
1358 * optimized away.
1359 */
1360 pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
1361 {
1362 pmd_t *new;
1364 /* "fast" allocation can happen without dropping the lock.. */
1365 new = pmd_alloc_one_fast(mm, address);
1366 if (!new) {
1367 spin_unlock(&mm->page_table_lock);
1368 new = pmd_alloc_one(mm, address);
1369 spin_lock(&mm->page_table_lock);
1370 if (!new)
1371 return NULL;
1373 /*
1374 * Because we dropped the lock, we should re-check the
1375 * entry, as somebody else could have populated it..
1376 */
1377 if (!pgd_none(*pgd)) {
1378 pmd_free(new);
1379 goto out;
1380 }
1381 }
1382 pgd_populate(mm, pgd, new);
1383 out:
1384 return pmd_offset(pgd, address);
1385 }
1387 /*
1388 * Allocate the page table directory.
1390 * We've already handled the fast-path in-line, and we own the
1391 * page table lock.
1392 */
1393 pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
1394 {
1395 if (pmd_none(*pmd)) {
1396 pte_t *new;
1398 /* "fast" allocation can happen without dropping the lock.. */
1399 new = pte_alloc_one_fast(mm, address);
1400 if (!new) {
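/* The lock is about to be dropped for the slow allocation; presumably any
 * queued page-table updates must reach the hypervisor first so that other
 * holders of page_table_lock see fully applied page tables. */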
1401 XENO_flush_page_update_queue();
1402 spin_unlock(&mm->page_table_lock);
1403 new = pte_alloc_one(mm, address);
1404 spin_lock(&mm->page_table_lock);
1405 if (!new)
1406 return NULL;
1408 /*
1409 * Because we dropped the lock, we should re-check the
1410 * entry, as somebody else could have populated it..
1411 */
1412 if (!pmd_none(*pmd)) {
1413 pte_free(new);
1414 goto out;
1415 }
1416 }
1417 pmd_populate(mm, pmd, new);
1418 }
1419 out:
1420 return pte_offset(pmd, address);
1421 }
1423 /*
1424 * Simplistic page force-in..
1425 */
1426 int make_pages_present(unsigned long addr, unsigned long end)
1427 {
1428 int write;
1429 struct mm_struct *mm = current->mm;
1430 struct vm_area_struct * vma;
1432 vma = find_vma(mm, addr);
1433 write = (vma->vm_flags & VM_WRITE) != 0;
1434 if (addr >= end)
1435 BUG();
1436 do {
1437 if (handle_mm_fault(mm, vma, addr, write) < 0)
1438 return -1;
1439 addr += PAGE_SIZE;
1440 } while (addr < end);
1441 return 0;
1442 }