xenbits.xensource.com Git - people/julieng/freebsd.git/commitdiff
As a step towards the elimination of PG_CACHED pages, rework the handling
author: markj <markj@FreeBSD.org>
Wed, 30 Sep 2015 23:06:29 +0000 (23:06 +0000)
committer: markj <markj@FreeBSD.org>
Wed, 30 Sep 2015 23:06:29 +0000 (23:06 +0000)
of POSIX_FADV_DONTNEED so that it causes the backing pages to be moved to
the head of the inactive queue instead of being cached.

This affects the implementation of POSIX_FADV_NOREUSE as well, since it
works by applying POSIX_FADV_DONTNEED to file ranges after they have been
read or written.  At that point the corresponding buffers may still be
dirty, so the previous implementation would coalesce successive ranges and
apply POSIX_FADV_DONTNEED to the result, ensuring that pages backing the
dirty buffers would eventually be cached.  To preserve this behaviour in an
efficient manner, this change adds a new buf flag, B_NOREUSE, which causes
the pages backing a VMIO buf to be placed at the head of the inactive queue
when the buf is released.  POSIX_FADV_NOREUSE then works by setting this
flag in bufs that underlie the specified range.

Reviewed by: alc, kib
Sponsored by: EMC / Isilon Storage Division
Differential Revision: https://reviews.freebsd.org/D3726

sys/kern/vfs_bio.c
sys/kern/vfs_default.c
sys/kern/vfs_syscalls.c
sys/kern/vfs_vnops.c
sys/sys/buf.h
sys/sys/file.h
sys/vm/vm_object.c
sys/vm/vm_object.h
sys/vm/vm_page.c
sys/vm/vm_page.h

index 80d06073a6abb2ccf7159cb4f8e6622971cc2f1d..0fab0009bc581b6aa8c1ad9a76e4880a85cf8922 100644 (file)
@@ -1785,6 +1785,8 @@ brelse(struct buf *bp)
            bp, bp->b_vp, bp->b_flags);
        KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
            ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+       KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
+           ("brelse: non-VMIO buffer marked NOREUSE"));
 
        if (BUF_LOCKRECURSED(bp)) {
                /*
@@ -1873,8 +1875,10 @@ brelse(struct buf *bp)
                allocbuf(bp, 0);
        }
 
-       if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
+       if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
+           (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
                allocbuf(bp, 0);
+               bp->b_flags &= ~B_NOREUSE;
                if (bp->b_vp != NULL)
                        brelvp(bp);
        }
@@ -1969,6 +1973,10 @@ bqrelse(struct buf *bp)
                if ((bp->b_flags & B_DELWRI) == 0 &&
                    (bp->b_xflags & BX_VNDIRTY))
                        panic("bqrelse: not dirty");
+               if ((bp->b_flags & B_NOREUSE) != 0) {
+                       brelse(bp);
+                       return;
+               }
                qindex = QUEUE_CLEAN;
        }
        binsfree(bp, qindex);
@@ -2079,10 +2087,15 @@ vfs_vmio_unwire(struct buf *bp, vm_page_t m)
                        freed = false;
                if (!freed) {
                        /*
-                        * In order to maintain LRU page ordering, put
-                        * the page at the tail of the inactive queue.
+                        * If the page is unlikely to be reused, let the
+                        * VM know.  Otherwise, maintain LRU page
+                        * ordering and put the page at the tail of the
+                        * inactive queue.
                         */
-                       vm_page_deactivate(m);
+                       if ((bp->b_flags & B_NOREUSE) != 0)
+                               vm_page_deactivate_noreuse(m);
+                       else
+                               vm_page_deactivate(m);
                }
        }
        vm_page_unlock(m);
@@ -2456,8 +2469,9 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
         * Note: we no longer distinguish between VMIO and non-VMIO
         * buffers.
         */
-       KASSERT((bp->b_flags & B_DELWRI) == 0,
-           ("delwri buffer %p found in queue %d", bp, qindex));
+       KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+           ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags,
+           qindex));
 
        /*
         * When recycling a clean buffer we have to truncate it and
index d70b68567dad037f1a02debb5923d2489f1b4a08..d0074d3e34ed73a7709a1c6f73973aba275a1018 100644 (file)
@@ -1034,9 +1034,12 @@ vop_stdallocate(struct vop_allocate_args *ap)
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
+       struct buf *bp;
+       struct buflists *bl;
        struct vnode *vp;
+       daddr_t bn, startn, endn;
        off_t start, end;
-       int error;
+       int bsize, error;
 
        vp = ap->a_vp;
        switch (ap->a_advice) {
@@ -1049,28 +1052,59 @@ vop_stdadvise(struct vop_advise_args *ap)
                error = 0;
                break;
        case POSIX_FADV_DONTNEED:
-               /*
-                * Flush any open FS buffers and then remove pages
-                * from the backing VM object.  Using vinvalbuf() here
-                * is a bit heavy-handed as it flushes all buffers for
-                * the given vnode, not just the buffers covering the
-                * requested range.
-                */
                error = 0;
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                if (vp->v_iflag & VI_DOOMED) {
                        VOP_UNLOCK(vp, 0);
                        break;
                }
-               vinvalbuf(vp, V_CLEANONLY, 0, 0);
+
+               /*
+                * Deactivate pages in the specified range from the backing VM
+                * object.  Pages that are resident in the buffer cache will
+                * remain wired until their corresponding buffers are released
+                * below.
+                */
                if (vp->v_object != NULL) {
                        start = trunc_page(ap->a_start);
                        end = round_page(ap->a_end);
                        VM_OBJECT_WLOCK(vp->v_object);
-                       vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+                       vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
                            OFF_TO_IDX(end));
                        VM_OBJECT_WUNLOCK(vp->v_object);
                }
+
+               BO_RLOCK(&vp->v_bufobj);
+               bsize = vp->v_bufobj.bo_bsize;
+               startn = ap->a_start / bsize;
+               if (ap->a_end == OFF_MAX) {
+                       endn = -1;
+                       bl = &vp->v_bufobj.bo_clean.bv_hd;
+                       if (!TAILQ_EMPTY(bl))
+                               endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+                       bl = &vp->v_bufobj.bo_dirty.bv_hd;
+                       if (!TAILQ_EMPTY(bl) &&
+                           endn < TAILQ_LAST(bl, buflists)->b_lblkno)
+                               endn = TAILQ_LAST(bl, buflists)->b_lblkno;
+               } else
+                       endn = ap->a_end / bsize;
+               BO_RUNLOCK(&vp->v_bufobj);
+               /*
+                * In the VMIO case, use the B_NOREUSE flag to hint that the
+                * pages backing each buffer in the range are unlikely to be
+                * reused.  Dirty buffers will have the hint applied once
+                * they've been written.
+                */
+               for (bn = startn; bn <= endn; bn++) {
+                       bp = getblk(vp, bn, bsize, 0, 0, GB_NOCREAT |
+                           GB_UNMAPPED);
+                       if (bp == NULL)
+                               continue;
+                       bp->b_flags |= B_RELBUF;
+                       if (vp->v_object != NULL)
+                               bp->b_flags |= B_NOREUSE;
+                       brelse(bp);
+               }
                VOP_UNLOCK(vp, 0);
                break;
        default:
index 70a302ba6e51f4427a82666caec9cda453fe4fc9..ce4436ad731ca5fef320ff794670cdadcc449e10 100644 (file)
@@ -4610,8 +4610,6 @@ kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
                        new->fa_advice = advice;
                        new->fa_start = offset;
                        new->fa_end = end;
-                       new->fa_prevstart = 0;
-                       new->fa_prevend = 0;
                        fp->f_advice = new;
                        new = fa;
                }
index 18a9ac3110206617e0cef4ce15c07f6582c41b31..f07df31bbb2f22f3082820ce6c738ca3f1962d15 100644 (file)
@@ -770,10 +770,9 @@ vn_read(fp, uio, active_cred, flags, td)
        struct thread *td;
 {
        struct vnode *vp;
-       struct mtx *mtxp;
+       off_t orig_offset;
        int error, ioflag;
        int advice;
-       off_t offset, start, end;
 
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
@@ -797,7 +796,7 @@ vn_read(fp, uio, active_cred, flags, td)
                /* Disable read-ahead for random I/O. */
                break;
        }
-       offset = uio->uio_offset;
+       orig_offset = uio->uio_offset;
 
 #ifdef MAC
        error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -807,39 +806,14 @@ vn_read(fp, uio, active_cred, flags, td)
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0);
        if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-           offset != uio->uio_offset) {
+           orig_offset != uio->uio_offset)
                /*
-                * Use POSIX_FADV_DONTNEED to flush clean pages and
-                * buffers for the backing file after a
-                * POSIX_FADV_NOREUSE read(2).  To optimize the common
-                * case of using POSIX_FADV_NOREUSE with sequential
-                * access, track the previous implicit DONTNEED
-                * request and grow this request to include the
-                * current read(2) in addition to the previous
-                * DONTNEED.  With purely sequential access this will
-                * cause the DONTNEED requests to continously grow to
-                * cover all of the previously read regions of the
-                * file.  This allows filesystem blocks that are
-                * accessed by multiple calls to read(2) to be flushed
-                * once the last read(2) finishes.
+                * Use POSIX_FADV_DONTNEED to flush pages and buffers
+                * for the backing file after a POSIX_FADV_NOREUSE
+                * read(2).
                 */
-               start = offset;
-               end = uio->uio_offset - 1;
-               mtxp = mtx_pool_find(mtxpool_sleep, fp);
-               mtx_lock(mtxp);
-               if (fp->f_advice != NULL &&
-                   fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-                       if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-                               start = fp->f_advice->fa_prevstart;
-                       else if (fp->f_advice->fa_prevstart != 0 &&
-                           fp->f_advice->fa_prevstart == end + 1)
-                               end = fp->f_advice->fa_prevend;
-                       fp->f_advice->fa_prevstart = start;
-                       fp->f_advice->fa_prevend = end;
-               }
-               mtx_unlock(mtxp);
-               error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-       }
+               error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+                   POSIX_FADV_DONTNEED);
        return (error);
 }
 
@@ -856,10 +830,9 @@ vn_write(fp, uio, active_cred, flags, td)
 {
        struct vnode *vp;
        struct mount *mp;
-       struct mtx *mtxp;
+       off_t orig_offset;
        int error, ioflag, lock_flags;
        int advice;
-       off_t offset, start, end;
 
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
@@ -902,7 +875,7 @@ vn_write(fp, uio, active_cred, flags, td)
                /* XXX: Is this correct? */
                break;
        }
-       offset = uio->uio_offset;
+       orig_offset = uio->uio_offset;
 
 #ifdef MAC
        error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -914,55 +887,14 @@ vn_write(fp, uio, active_cred, flags, td)
        if (vp->v_type != VCHR)
                vn_finished_write(mp);
        if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-           offset != uio->uio_offset) {
+           orig_offset != uio->uio_offset)
                /*
-                * Use POSIX_FADV_DONTNEED to flush clean pages and
-                * buffers for the backing file after a
-                * POSIX_FADV_NOREUSE write(2).  To optimize the
-                * common case of using POSIX_FADV_NOREUSE with
-                * sequential access, track the previous implicit
-                * DONTNEED request and grow this request to include
-                * the current write(2) in addition to the previous
-                * DONTNEED.  With purely sequential access this will
-                * cause the DONTNEED requests to continously grow to
-                * cover all of the previously written regions of the
-                * file.
-                *
-                * Note that the blocks just written are almost
-                * certainly still dirty, so this only works when
-                * VOP_ADVISE() calls from subsequent writes push out
-                * the data written by this write(2) once the backing
-                * buffers are clean.  However, as compared to forcing
-                * IO_DIRECT, this gives much saner behavior.  Write
-                * clustering is still allowed, and clean pages are
-                * merely moved to the cache page queue rather than
-                * outright thrown away.  This means a subsequent
-                * read(2) can still avoid hitting the disk if the
-                * pages have not been reclaimed.
-                *
-                * This does make POSIX_FADV_NOREUSE largely useless
-                * with non-sequential access.  However, sequential
-                * access is the more common use case and the flag is
-                * merely advisory.
+                * Use POSIX_FADV_DONTNEED to flush pages and buffers
+                * for the backing file after a POSIX_FADV_NOREUSE
+                * write(2).
                 */
-               start = offset;
-               end = uio->uio_offset - 1;
-               mtxp = mtx_pool_find(mtxpool_sleep, fp);
-               mtx_lock(mtxp);
-               if (fp->f_advice != NULL &&
-                   fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
-                       if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
-                               start = fp->f_advice->fa_prevstart;
-                       else if (fp->f_advice->fa_prevstart != 0 &&
-                           fp->f_advice->fa_prevstart == end + 1)
-                               end = fp->f_advice->fa_prevend;
-                       fp->f_advice->fa_prevstart = start;
-                       fp->f_advice->fa_prevend = end;
-               }
-               mtx_unlock(mtxp);
-               error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
-       }
-       
+               error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+                   POSIX_FADV_DONTNEED);
 unlock:
        return (error);
 }
index d5ce0e51d87ae89612aa4ba3e3f783dfb21440ce..bdc457eef35ff6fa6d92502dfd0953ced3be4b70 100644 (file)
@@ -204,7 +204,7 @@ struct buf {
 #define        B_PERSISTENT    0x00000100      /* Perm. ref'ed while EXT2FS mounted. */
 #define        B_DONE          0x00000200      /* I/O completed. */
 #define        B_EINTR         0x00000400      /* I/O was interrupted */
-#define        B_00000800      0x00000800      /* Available flag. */
+#define        B_NOREUSE       0x00000800      /* Contents not reused once released. */
 #define        B_00001000      0x00001000      /* Available flag. */
 #define        B_INVAL         0x00002000      /* Does not contain valid info. */
 #define        B_BARRIER       0x00004000      /* Write this and all preceeding first. */
@@ -229,7 +229,7 @@ struct buf {
 #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \
        "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \
        "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \
-       "\15b12\14b11\13eintr\12done\11persist\10delwri" \
+       "\15b12\14noreuse\13eintr\12done\11persist\10delwri" \
        "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age"
 
 /*
index cb51c27f76b189f76c9fa25efd57bf4d1c422e21..68d33e0dfd257b51899b53045766472ece962ae7 100644 (file)
@@ -160,8 +160,6 @@ struct fadvise_info {
        int             fa_advice;      /* (f) FADV_* type. */
        off_t           fa_start;       /* (f) Region start. */
        off_t           fa_end;         /* (f) Region end. */
-       off_t           fa_prevstart;   /* (f) Previous NOREUSE start. */
-       off_t           fa_prevend;     /* (f) Previous NOREUSE end. */
 };
 
 struct file {
index a4aac95c7adb204e261ef62d1fb0ccb519c376ca..0a3c2efdf8cfeabbd76c4f50693621694af27a61 100644 (file)
@@ -1963,15 +1963,15 @@ skipmemq:
 }
 
 /*
- *     vm_object_page_cache:
+ *     vm_object_page_noreuse:
  *
- *     For the given object, attempt to move the specified clean
- *     pages to the cache queue.  If a page is wired for any reason,
- *     then it will not be changed.  Pages are specified by the given
- *     range ["start", "end").  As a special case, if "end" is zero,
- *     then the range extends from "start" to the end of the object.
- *     Any mappings to the specified pages are removed before the
- *     pages are moved to the cache queue.
+ *     For the given object, attempt to move the specified pages to
+ *     the head of the inactive queue.  This bypasses regular LRU
+ *     operation and allows the pages to be reused quickly under memory
+ *     pressure.  If a page is wired for any reason, then it will not
+ *     be queued.  Pages are specified by the range ["start", "end").
+ *     As a special case, if "end" is zero, then the range extends from
+ *     "start" to the end of the object.
  *
  *     This operation should only be performed on objects that
  *     contain non-fictitious, managed pages.
@@ -1979,14 +1979,14 @@ skipmemq:
  *     The object must be locked.
  */
 void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
        struct mtx *mtx, *new_mtx;
        vm_page_t p, next;
 
        VM_OBJECT_ASSERT_WLOCKED(object);
        KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
-           ("vm_object_page_cache: illegal object %p", object));
+           ("vm_object_page_noreuse: illegal object %p", object));
        if (object->resident_page_count == 0)
                return;
        p = vm_page_find_least(object, start);
@@ -2009,7 +2009,7 @@ vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
                        mtx = new_mtx;
                        mtx_lock(mtx);
                }
-               vm_page_try_to_cache(p);
+               vm_page_deactivate_noreuse(p);
        }
        if (mtx != NULL)
                mtx_unlock(mtx);
index 7e433aee022889afe6888b01863393c069f7bec0..894a8d5616babc2df23d1cc3ad2d88234fcd6db3 100644 (file)
@@ -304,10 +304,10 @@ void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
-    vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
+void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+    vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
index 2aaddfb01826f89474a6903ddc39e9f59531669d..a3a9a10417a49cecb058d843f358a38623a68bd9 100644 (file)
@@ -2588,6 +2588,19 @@ vm_page_deactivate(vm_page_t m)
        _vm_page_deactivate(m, 0);
 }
 
+/*
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
+ *
+ * The page must be locked.
+ */
+void
+vm_page_deactivate_noreuse(vm_page_t m)
+{
+
+       _vm_page_deactivate(m, 1);
+}
+
 /*
  * vm_page_try_to_cache:
  *
@@ -2740,8 +2753,7 @@ vm_page_cache(vm_page_t m)
 /*
  * vm_page_advise
  *
- *     Deactivate or do nothing, as appropriate.  This routine is used
- *     by madvise() and vop_stdadvise().
+ *     Deactivate or do nothing, as appropriate.
  *
  *     The object and page must be locked.
  */
index fd7d3f443c175b61cf69c6541681b3c5177546ba..dedd6ac9eb0c2d54bc1a4a3ad390651d5ea39e0d 100644 (file)
@@ -451,6 +451,7 @@ void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
 int vm_page_try_to_cache (vm_page_t);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);