]> xenbits.xensource.com Git - people/royger/freebsd.git/commitdiff
Fix NFS exports of FUSE file systems for big directories
authorAlan Somers <asomers@FreeBSD.org>
Sun, 2 Jan 2022 17:18:47 +0000 (10:18 -0700)
committerAlan Somers <asomers@FreeBSD.org>
Wed, 2 Mar 2022 23:35:33 +0000 (16:35 -0700)
The FUSE protocol does not require that a directory entry's d_off field
outlive the lifetime of its directory's file handle.  Since the NFS
server must reopen the directory on every VOP_READDIR call, that means
it can't pass uio->uio_offset down to the FUSE server.  Instead, it must
read the directory from 0 each time.  It may need to issue multiple
FUSE_READDIR operations until it finds the d_off field that it's looking
for.  That was the intention behind SVN r348209 and r297887, but a logic
bug prevented subsequent FUSE_READDIR operations from ever being issued,
rendering large directories incompletely browseable.

Reviewed by: rmacklem

(cherry picked from commit d088dc76e1a62ecb6c05bd2b14ee48a9f9a7e2bd)

fusefs: optimize NFS readdir for FUSE_NO_OPENDIR_SUPPORT

In its lowest common denominator, FUSE does not require that a directory
entry's d_off field is valid outside of the lifetime of the directory's
FUSE file handle.  But since NFS is stateless, it must reopen the
directory on every call to VOP_READDIR.  That means reading the
directory all the way from the first entry.  Not only does this create
an O(n^2) condition for large directories, but it can also result in
incorrect behavior if either:

* The file system _does_ change the d_off field for the last directory
  entry previously seen by NFS, or
* The file system deletes the last directory entry previously seen by
  NFS.

Handily, for file systems that set FUSE_NO_OPENDIR_SUPPORT d_off is
guaranteed to be valid for the lifetime of the directory entry, there is
no need to read the directory from the start.

Reviewed by: rmacklem

(cherry picked from commit 4a6526d84a56f398732bff491e63aa42f796a27d)

fusefs: require FUSE_NO_OPENDIR_SUPPORT for NFS exporting

FUSE file systems that do not set FUSE_NO_OPENDIR_SUPPORT do not
guarantee that d_off will be valid after closing and reopening a
directory.  That conflicts with NFS's statelessness, that results in
unresolvable bugs when NFS reads large directories, if:

* The file system _does_ change the d_off field for the last directory
  entry previously returned by VOP_READDIR, or
* The file system deletes the last directory entry previously seen by
  NFS.

Rather than doing a poor job of exporting such file systems, it's better
just to refuse.

Even though this is technically a breaking change, 13.0-RELEASE's
NFS-FUSE support was bad enough that an MFC should be allowed.

Reviewed by: rmacklem
Differential Revision: https://reviews.freebsd.org/D33726

(cherry picked from commit 00134a07898fa807b8a1fcb2596f0e3644143f69)

fusefs: fix the build without INVARIANTS after 00134a07898

MFC with: 00134a07898fa807b8a1fcb2596f0e3644143f69
Reported by: se

(cherry picked from commit 18ed2ce77a254f365a4687d6afe8eeba2aad2e13)

sys/fs/fuse/fuse_internal.c
sys/fs/fuse/fuse_internal.h
sys/fs/fuse/fuse_vnops.c

index 9dac4f7a17be5abd315fc76cbb3a4d7fc138e5b0..8862a09d2e6cb5881f1c59b0107ad587608e12bd 100644 (file)
@@ -554,7 +554,6 @@ fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
 int
 fuse_internal_readdir(struct vnode *vp,
     struct uio *uio,
-    off_t startoff,
     struct fuse_filehandle *fufh,
     struct fuse_iov *cookediov,
     int *ncookies,
@@ -563,7 +562,6 @@ fuse_internal_readdir(struct vnode *vp,
        int err = 0;
        struct fuse_dispatcher fdi;
        struct fuse_read_in *fri = NULL;
-       int fnd_start;
 
        if (uio_resid(uio) == 0)
                return 0;
@@ -573,25 +571,9 @@ fuse_internal_readdir(struct vnode *vp,
         * Note that we DO NOT have a UIO_SYSSPACE here (so no need for p2p
         * I/O).
         */
-
-       /*
-        * fnd_start is set non-zero once the offset in the directory gets
-        * to the startoff.  This is done because directories must be read
-        * from the beginning (offset == 0) when fuse_vnop_readdir() needs
-        * to do an open of the directory.
-        * If it is not set non-zero here, it will be set non-zero in
-        * fuse_internal_readdir_processdata() when uio_offset == startoff.
-        */
-       fnd_start = 0;
-       if (uio->uio_offset == startoff)
-               fnd_start = 1;
        while (uio_resid(uio) > 0) {
                fdi.iosize = sizeof(*fri);
-               if (fri == NULL)
-                       fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
-               else
-                       fdisp_refresh_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
-
+               fdisp_make_vp(&fdi, FUSE_READDIR, vp, NULL, NULL);
                fri = fdi.indata;
                fri->fh = fufh->fh_id;
                fri->offset = uio_offset(uio);
@@ -600,9 +582,8 @@ fuse_internal_readdir(struct vnode *vp,
 
                if ((err = fdisp_wait_answ(&fdi)))
                        break;
-               if ((err = fuse_internal_readdir_processdata(uio, startoff,
-                   &fnd_start, fri->size, fdi.answ, fdi.iosize, cookediov,
-                   ncookies, &cookies)))
+               if ((err = fuse_internal_readdir_processdata(uio, fri->size,
+                       fdi.answ, fdi.iosize, cookediov, ncookies, &cookies)))
                        break;
        }
 
@@ -617,8 +598,6 @@ fuse_internal_readdir(struct vnode *vp,
  */
 int
 fuse_internal_readdir_processdata(struct uio *uio,
-    off_t startoff,
-    int *fnd_start,
     size_t reqsize,
     void *buf,
     size_t bufsize,
@@ -672,39 +651,32 @@ fuse_internal_readdir_processdata(struct uio *uio,
                        err = -1;
                        break;
                }
-               /*
-                * Don't start to copy the directory entries out until
-                * the requested offset in the directory is found.
-                */
-               if (*fnd_start != 0) {
-                       fiov_adjust(cookediov, oreclen);
-                       bzero(cookediov->base, oreclen);
-
-                       de = (struct dirent *)cookediov->base;
-                       de->d_fileno = fudge->ino;
-                       de->d_off = fudge->off;
-                       de->d_reclen = oreclen;
-                       de->d_type = fudge->type;
-                       de->d_namlen = fudge->namelen;
-                       memcpy((char *)cookediov->base + sizeof(struct dirent) -
-                              MAXNAMLEN - 1,
-                              (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
-                       dirent_terminate(de);
-
-                       err = uiomove(cookediov->base, cookediov->len, uio);
-                       if (err)
+               fiov_adjust(cookediov, oreclen);
+               bzero(cookediov->base, oreclen);
+
+               de = (struct dirent *)cookediov->base;
+               de->d_fileno = fudge->ino;
+               de->d_off = fudge->off;
+               de->d_reclen = oreclen;
+               de->d_type = fudge->type;
+               de->d_namlen = fudge->namelen;
+               memcpy((char *)cookediov->base + sizeof(struct dirent) -
+                      MAXNAMLEN - 1,
+                      (char *)buf + FUSE_NAME_OFFSET, fudge->namelen);
+               dirent_terminate(de);
+
+               err = uiomove(cookediov->base, cookediov->len, uio);
+               if (err)
+                       break;
+               if (cookies != NULL) {
+                       if (*ncookies == 0) {
+                               err = -1;
                                break;
-                       if (cookies != NULL) {
-                               if (*ncookies == 0) {
-                                       err = -1;
-                                       break;
-                               }
-                               *cookies = fudge->off;
-                               cookies++;
-                               (*ncookies)--;
                        }
-               } else if (startoff == fudge->off)
-                       *fnd_start = 1;
+                       *cookies = fudge->off;
+                       cookies++;
+                       (*ncookies)--;
+               }
                buf = (char *)buf + freclen;
                bufsize -= freclen;
                uio_setoffset(uio, fudge->off);
index e9fa3857227a35b4032f7dbf55652c0015ac4325..7d773b02fd8b0eba35b5a2bfbed0fc1639f8b53f 100644 (file)
@@ -250,12 +250,12 @@ int fuse_internal_mknod(struct vnode *dvp, struct vnode **vpp,
 struct pseudo_dirent {
        uint32_t d_namlen;
 };
-int fuse_internal_readdir(struct vnode *vp, struct uio *uio, off_t startoff,
+int fuse_internal_readdir(struct vnode *vp, struct uio *uio,
     struct fuse_filehandle *fufh, struct fuse_iov *cookediov, int *ncookies,
-    u_long *cookies);
-int fuse_internal_readdir_processdata(struct uio *uio, off_t startoff,
-    int *fnd_start, size_t reqsize, void *buf, size_t bufsize,
-    struct fuse_iov *cookediov, int *ncookies, u_long **cookiesp);
+    uint64_t *cookies);
+int fuse_internal_readdir_processdata(struct uio *uio, size_t reqsize,
+    void *buf, size_t bufsize, struct fuse_iov *cookediov, int *ncookies,
+    u_long **cookiesp);
 
 /* remove */
 
index da1c5ec3cbcef2ba068608aa62894905bbc27c6b..7a3417749b7285e0596d921e50601d0c2f0de25a 100644 (file)
@@ -1847,10 +1847,10 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
        struct uio *uio = ap->a_uio;
        struct ucred *cred = ap->a_cred;
        struct fuse_filehandle *fufh = NULL;
+       struct mount *mp = vnode_mount(vp);
        struct fuse_iov cookediov;
        int err = 0;
        u_long *cookies;
-       off_t startoff;
        ssize_t tresid;
        int ncookies;
        bool closefufh = false;
@@ -1867,24 +1867,17 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
        }
 
        tresid = uio->uio_resid;
-       startoff = uio->uio_offset;
        err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
-       if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
+       if (err == EBADF && mp->mnt_flag & MNT_EXPORTED) {
+               KASSERT(fuse_get_mpdata(mp)->dataflags
+                               & FSESS_NO_OPENDIR_SUPPORT,
+                       ("FUSE file systems that don't set "
+                        "FUSE_NO_OPENDIR_SUPPORT should not be exported"));
                /* 
                 * nfsd will do VOP_READDIR without first doing VOP_OPEN.  We
-                * must implicitly open the directory here
+                * must implicitly open the directory here.
                 */
                err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
-               if (err == 0) {
-                       /*
-                        * When a directory is opened, it must be read from
-                        * the beginning.  Hopefully, the "startoff" still
-                        * exists as an offset cookie for the directory.
-                        * If not, it will read the entire directory without
-                        * returning any entries and just return eof.
-                        */
-                       uio->uio_offset = 0;
-               }
                closefufh = true;
        }
        if (err)
@@ -1902,7 +1895,7 @@ fuse_vnop_readdir(struct vop_readdir_args *ap)
 #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
        fiov_init(&cookediov, DIRCOOKEDSIZE);
 
-       err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
+       err = fuse_internal_readdir(vp, uio, fufh, &cookediov,
                &ncookies, cookies);
 
        fiov_teardown(&cookediov);
@@ -2999,8 +2992,30 @@ fuse_vnop_vptofh(struct vop_vptofh_args *ap)
        struct vattr va;
        int err;
 
-       if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
+       if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) {
+               /* NFS requires lookups for "." and ".." */
+               SDT_PROBE2(fusefs, , vnops, trace, 1,
+                       "VOP_VPTOFH without FUSE_EXPORT_SUPPORT");
                return EOPNOTSUPP;
+       }
+       if ((mp->mnt_flag & MNT_EXPORTED) &&
+               !(data->dataflags & FSESS_NO_OPENDIR_SUPPORT))
+       {
+               /*
+                * NFS is stateless, so nfsd must reopen a directory on every
+                * call to VOP_READDIR, passing in the d_off field from the
+                * final dirent of the previous invocation.  But without
+                * FUSE_NO_OPENDIR_SUPPORT, the FUSE protocol does not
+                * guarantee that d_off will be valid after a directory is
+                * closed and reopened.  So prohibit exporting FUSE file
+                * systems that don't set that flag.
+                *
+                * But userspace NFS servers don't have this problem.
+                 */
+               SDT_PROBE2(fusefs, , vnops, trace, 1,
+                       "VOP_VPTOFH without FUSE_NO_OPENDIR_SUPPORT");
+               return EOPNOTSUPP;
+       }
 
        err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
        if (err)