]> xenbits.xensource.com Git - people/julieng/freebsd.git/commitdiff
MFV r274273:
author delphij <delphij@FreeBSD.org>
Mon, 10 Nov 2014 08:20:21 +0000 (08:20 +0000)
committer delphij <delphij@FreeBSD.org>
Mon, 10 Nov 2014 08:20:21 +0000 (08:20 +0000)
ZFS large block support.

Please note that booting from datasets that have recordsize greater
than 128KB is not supported (but it is okay to enable the feature on
the pool).  This *may* remain unchanged because of memory constraints.

A limited safety belt is provided for the mounted root filesystem, but
caution is advised.

Illumos issue:
    5027 zfs large block support

MFC after: 1 month

56 files changed:
cddl/contrib/opensolaris/cmd/zdb/zdb.c
cddl/contrib/opensolaris/cmd/zfs/zfs.8
cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
cddl/contrib/opensolaris/cmd/ztest/ztest.c
cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
sys/boot/zfs/zfsimpl.c
sys/cddl/boot/zfs/zfsimpl.h
sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h

index c2c47d511c03b2bfa50560bb56810c6378518b8a..47ab7bf9a1dbdaa5622fb81c3a69bf0f5e0d6164 100644 (file)
@@ -2147,6 +2147,8 @@ dump_label(const char *dev)
        (void) close(fd);
 }
 
+static uint64_t num_large_blocks;
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -2159,6 +2161,8 @@ dump_one_dir(const char *dsname, void *arg)
                (void) printf("Could not open %s, error %d\n", dsname, error);
                return (0);
        }
+       if (dmu_objset_ds(os)->ds_large_blocks)
+               num_large_blocks++;
        dump_dir(os);
        dmu_objset_disown(os, FTAG);
        fuid_table_destroy();
@@ -2169,7 +2173,7 @@ dump_one_dir(const char *dsname, void *arg)
 /*
  * Block statistics.
  */
-#define        PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define        PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
        uint64_t zb_asize;
        uint64_t zb_lsize;
@@ -2234,7 +2238,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
                zb->zb_lsize += BP_GET_LSIZE(bp);
                zb->zb_psize += BP_GET_PSIZE(bp);
                zb->zb_count++;
-               zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+               /*
+                * The histogram is only big enough to record blocks up to
+                * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+                * "other", bucket.
+                */
+               int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+               idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+               zb->zb_psize_histogram[idx]++;
 
                zb->zb_gangs += BP_COUNT_GANG(bp);
 
@@ -2946,6 +2958,7 @@ dump_zpool(spa_t *spa)
                dump_metaslab_groups(spa);
 
        if (dump_opt['d'] || dump_opt['i']) {
+               uint64_t refcount;
                dump_dir(dp->dp_meta_objset);
                if (dump_opt['d'] >= 3) {
                        dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2965,8 +2978,21 @@ dump_zpool(spa_t *spa)
                }
                (void) dmu_objset_find(spa_name(spa), dump_one_dir,
                    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+               (void) feature_get_refcount(spa,
+                   &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
+               if (num_large_blocks != refcount) {
+                       (void) printf("large_blocks feature refcount mismatch: "
+                           "expected %lld != actual %lld\n",
+                           (longlong_t)num_large_blocks,
+                           (longlong_t)refcount);
+                       rc = 2;
+               } else {
+                       (void) printf("Verified large_blocks feature refcount "
+                           "is correct (%llu)\n", (longlong_t)refcount);
+               }
        }
-       if (dump_opt['b'] || dump_opt['c'])
+       if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
                rc = dump_block_stats(spa);
 
        if (rc == 0)
index 2315e056aaf43c50d9c8bc13b9bb00152db4c23e..065497f2a041ffc37f2421cff0f0ce041d69590a 100644 (file)
@@ -30,7 +30,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 30, 2014
+.Dd November 10, 2014
 .Dt ZFS 8
 .Os
 .Sh NAME
 .Ar bookmark
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Nm
@@ -1187,6 +1187,12 @@ systems is strongly discouraged, and may adversely affect performance.
 .Pp
 The size specified must be a power of two greater than or equal to 512 and less
 than or equal to 128 Kbytes.
+If the
+.Sy large_blocks
+feature is enabled on the pool, the size may be up to 1 Mbyte.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags.
 .Pp
 Changing the file system's
 .Sy recordsize
@@ -2477,7 +2483,7 @@ feature.
 .It Xo
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Xc
@@ -2549,6 +2555,22 @@ be used regardless of the dataset's
 property, but performance will be much better if the filesystem uses a
 dedup-capable checksum (eg.
 .Sy sha256 ) .
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the
@@ -2596,7 +2618,7 @@ on future versions of
 .It Xo
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns Ar bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Xc
@@ -2622,6 +2644,22 @@ specified as the last component of the name
 If the incremental target is a clone, the incremental source can
 be the origin snapshot, or an earlier snapshot in the origin's filesystem,
 or the origin's origin, etc.
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the
index a3b461e1159c90544485879dd1b2ceb970d4f25c..baac993a0df73807bbc6ee69fbbcaa4df895ca9c 100644 (file)
@@ -274,9 +274,9 @@ get_usage(zfs_help_t idx)
        case HELP_ROLLBACK:
                return (gettext("\trollback [-rRf] <snapshot>\n"));
        case HELP_SEND:
-               return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
+               return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
                    "<snapshot>\n"
-                   "\tsend [-e] [-i snapshot|bookmark] "
+                   "\tsend [-Le] [-i snapshot|bookmark] "
                    "<filesystem|volume|snapshot>\n"));
        case HELP_SET:
                return (gettext("\tset <property=value> "
@@ -3709,7 +3709,7 @@ zfs_do_send(int argc, char **argv)
        boolean_t extraverbose = B_FALSE;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
+       while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
                switch (c) {
                case 'i':
                        if (fromname)
@@ -3744,6 +3744,9 @@ zfs_do_send(int argc, char **argv)
                case 'n':
                        flags.dryrun = B_TRUE;
                        break;
+               case 'L':
+                       flags.largeblock = B_TRUE;
+                       break;
                case 'e':
                        flags.embed_data = B_TRUE;
                        break;
@@ -3800,6 +3803,8 @@ zfs_do_send(int argc, char **argv)
                if (zhp == NULL)
                        return (1);
 
+               if (flags.largeblock)
+                       lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
                if (flags.embed_data)
                        lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 
index 27e63a9438f448fe397fd132fb4950d81f0f99ce..d855f168cc60376b35533cf5cdbc21b0915fde8a 100644 (file)
@@ -23,7 +23,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 1, 2014
+.Dd November 10, 2014
 .Dt ZPOOL-FEATURES 7
 .Os
 .Sh NAME
@@ -427,6 +427,33 @@ This feature becomes
 as soon as it is enabled and will
 never return to being
 .Sy enabled .
+.It Sy large_blocks
+.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_blocks"
+.It GUID Ta org.open-zfs:large_blocks
+.It READ\-ONLY COMPATIBLE Ta no
+.It DEPENDENCIES Ta extensible_dataset
+.El
+.Pp
+The
+.Sy large_blocks
+feature allows the record size on a dataset to be
+set larger than 128KB.
+.Pp
+This feature becomes
+.Sy active
+once a
+.Sy recordsize
+property has been set larger than 128KB, and will return to being
+.Sy enabled
+once all filesystems that have ever had their recordsize larger than 128KB
+are destroyed.
+.Pp
+Please note that booting from datasets that have recordsize greater than
+128KB is
+.Em NOT
+supported by the
+.Fx
+boot loader.
 .El
 .Sh SEE ALSO
 .Xr zpool 8
index dce1cb3d765b3656ab9128af94830fe5afa31d66..d99d8014f049def80fafae01a7408b199d6c0794 100644 (file)
@@ -54,7 +54,6 @@ uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
 boolean_t do_byteswap = B_FALSE;
 boolean_t do_cksum = B_TRUE;
-#define        INITIAL_BUFLEN (1<<20)
 
 static void
 usage(void)
@@ -67,6 +66,18 @@ usage(void)
        exit(1);
 }
 
+static void *
+safe_malloc(size_t size)
+{
+       void *rv = malloc(size);
+       if (rv == NULL) {
+               (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+                   size);
+               abort();
+       }
+       return (rv);
+}
+
 /*
  * ssread - send stream read.
  *
@@ -158,7 +169,7 @@ print_block(char *buf, int length)
 int
 main(int argc, char *argv[])
 {
-       char *buf = malloc(INITIAL_BUFLEN);
+       char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
        uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
        uint64_t total_records = 0;
        dmu_replay_record_t thedrr;
@@ -307,9 +318,9 @@ main(int argc, char *argv[])
                                nvlist_t *nv;
                                int sz = drr->drr_payloadlen;
 
-                               if (sz > INITIAL_BUFLEN) {
+                               if (sz > SPA_MAXBLOCKSIZE) {
                                        free(buf);
-                                       buf = malloc(sz);
+                                       buf = safe_malloc(sz);
                                }
                                (void) ssread(buf, sz, &zc);
                                if (ferror(send_stream))
index 5ed87ce1371772cdbe403e32513b7b9e14fee5eb..ab69154b8d198ab36cac1f8c2095458229a3beb6 100644 (file)
@@ -987,9 +987,15 @@ ztest_spa_get_ashift() {
 static int
 ztest_random_blocksize(void)
 {
-       // Choose a block size >= the ashift.
-       uint64_t block_shift =
-           ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+       uint64_t block_shift;
+       /*
+        * Choose a block size >= the ashift.
+        * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+        */
+       int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+       if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+               maxbs = 20;
+       block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
        return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
 
@@ -4789,7 +4795,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
        char path0[MAXPATHLEN];
        char pathrand[MAXPATHLEN];
        size_t fsize;
-       int bshift = SPA_MAXBLOCKSHIFT + 2;     /* don't scrog all labels */
+       int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
        int iters = 1000;
        int maxfaults;
        int mirror_save;
index ef18b457e056ddc226827c650ef53554ee3f7476..8a707d1f795cc2f8b60ebfd29711247aaf45dd70 100644 (file)
@@ -609,6 +609,9 @@ typedef struct sendflags {
        /* show progress (ie. -v) */
        boolean_t progress;
 
+       /* large blocks (>128K) are permitted */
+       boolean_t largeblock;
+
        /* WRITE_EMBEDDED records of type DATA are permitted */
        boolean_t embed_data;
 } sendflags_t;
index 265038ab1bf911be41a76cf7a6405669e5399be1..063df4a3c23464f348d8abcf082f64079b13b196 100644 (file)
@@ -1080,21 +1080,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
                        break;
                }
 
-               case ZFS_PROP_RECORDSIZE:
                case ZFS_PROP_VOLBLOCKSIZE:
-                       /* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+               case ZFS_PROP_RECORDSIZE:
+               {
+                       int maxbs = SPA_MAXBLOCKSIZE;
+                       if (zhp != NULL) {
+                               maxbs = zpool_get_prop_int(zhp->zpool_hdl,
+                                   ZPOOL_PROP_MAXBLOCKSIZE, NULL);
+                       }
+                       /*
+                        * Volumes are limited to a volblocksize of 128KB,
+                        * because they typically service workloads with
+                        * small random writes, which incur a large performance
+                        * penalty with large blocks.
+                        */
+                       if (prop == ZFS_PROP_VOLBLOCKSIZE)
+                               maxbs = SPA_OLD_MAXBLOCKSIZE;
+                       /*
+                        * The value must be a power of two between
+                        * SPA_MINBLOCKSIZE and maxbs.
+                        */
                        if (intval < SPA_MINBLOCKSIZE ||
-                           intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
+                           intval > maxbs || !ISP2(intval)) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "'%s' must be power of 2 from %u "
-                                   "to %uk"), propname,
-                                   (uint_t)SPA_MINBLOCKSIZE,
-                                   (uint_t)SPA_MAXBLOCKSIZE >> 10);
+                                   "'%s' must be power of 2 from 512B "
+                                   "to %uKB"), propname, maxbs >> 10);
                                (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
                                goto error;
                        }
                        break;
-
+               }
                case ZFS_PROP_MLSLABEL:
                {
 #ifdef sun
@@ -1471,7 +1486,9 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
                break;
 
        case ERANGE:
-               if (prop == ZFS_PROP_COMPRESSION) {
+       case EDOM:
+               if (prop == ZFS_PROP_COMPRESSION ||
+                   prop == ZFS_PROP_RECORDSIZE) {
                        (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "property setting is not allowed on "
                            "bootable datasets"));
@@ -3197,9 +3214,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
                case EDOM:
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "volume block size must be power of 2 from "
-                           "%u to %uk"),
-                           (uint_t)SPA_MINBLOCKSIZE,
-                           (uint_t)SPA_MAXBLOCKSIZE >> 10);
+                           "512B to 128KB"));
 
                        return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 
index 97f18d7bb22398a95368b22fc44e5dd58f57aec8..91857b65a8d6b0dcbee3cdba4e4bcc36c1f34553 100644 (file)
@@ -215,7 +215,7 @@ static void *
 cksummer(void *arg)
 {
        dedup_arg_t *dda = arg;
-       char *buf = malloc(1<<20);
+       char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
        dmu_replay_record_t thedrr;
        dmu_replay_record_t *drr = &thedrr;
        struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -280,9 +280,9 @@ cksummer(void *arg)
                            DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
                                int sz = drr->drr_payloadlen;
 
-                               if (sz > 1<<20) {
-                                       free(buf);
-                                       buf = malloc(sz);
+                               if (sz > SPA_MAXBLOCKSIZE) {
+                                       buf = zfs_realloc(dda->dedup_hdl, buf,
+                                           SPA_MAXBLOCKSIZE, sz);
                                }
                                (void) ssread(buf, sz, ofp);
                                if (ferror(stdin))
@@ -815,7 +815,7 @@ typedef struct send_dump_data {
        char prevsnap[ZFS_MAXNAMELEN];
        uint64_t prevsnap_obj;
        boolean_t seenfrom, seento, replicate, doall, fromorigin;
-       boolean_t verbose, dryrun, parsable, progress, embed_data;
+       boolean_t verbose, dryrun, parsable, progress, embed_data, large_block;
        int outfd;
        boolean_t err;
        nvlist_t *fss;
@@ -1163,6 +1163,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
                }
 
                enum lzc_send_flags flags = 0;
+               if (sdd->large_block)
+                       flags |= LZC_SEND_FLAG_LARGE_BLOCK;
                if (sdd->embed_data)
                        flags |= LZC_SEND_FLAG_EMBED_DATA;
 
@@ -1511,6 +1513,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        sdd.parsable = flags->parsable;
        sdd.progress = flags->progress;
        sdd.dryrun = flags->dryrun;
+       sdd.large_block = flags->largeblock;
        sdd.embed_data = flags->embed_data;
        sdd.filter_cb = filter_func;
        sdd.filter_cb_arg = cb_arg;
@@ -2545,7 +2548,7 @@ static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
        dmu_replay_record_t *drr;
-       void *buf = malloc(1<<20);
+       void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
        char errbuf[1024];
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
index cb38dc2d12820d89b12885ff726359cb82439ac6..52bd580c47dfdd85f4b9ab9896fca17c8b569f59 100644 (file)
@@ -502,6 +502,10 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
  *
  * "fd" is the file descriptor to write the send stream to.
  *
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
  * which the receiving system must support (as indicated by support
@@ -518,6 +522,8 @@ lzc_send(const char *snapname, const char *from, int fd,
        fnvlist_add_int32(args, "fd", fd);
        if (from != NULL)
                fnvlist_add_string(args, "fromsnap", from);
+       if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+               fnvlist_add_boolean(args, "largeblockok");
        if (flags & LZC_SEND_FLAG_EMBED_DATA)
                fnvlist_add_boolean(args, "embedok");
        err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
index 99883fecc13382e85ded28d7c11917b74ee9c2c7..b6a4c12f25009ca719bcded1868b8ed74e42a273 100644 (file)
@@ -54,7 +54,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
 int lzc_get_holds(const char *, nvlist_t **);
 
 enum lzc_send_flags {
-       LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+       LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+       LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
 };
 
 int lzc_send(const char *, const char *, int, enum lzc_send_flags);
index 16b57c7a61509d89c103d6e89724af8398355a68..8d21c5714302fa322ec294499b6ad709d0bda081 100644 (file)
@@ -57,6 +57,7 @@ static const char *features_for_read[] = {
        "com.delphix:hole_birth",
        "com.delphix:extensible_dataset",
        "com.delphix:embedded_data",
+       "org.open-zfs:large_blocks",
        NULL
 };
 
@@ -1222,6 +1223,11 @@ dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf,
        int nlevels = dnode->dn_nlevels;
        int i, rc;
 
+       if (bsize > SPA_MAXBLOCKSIZE) {
+               printf("ZFS: I/O error - blocks larger than 128K are not supported\n");
+               return (EIO);
+       }
+
        /*
         * Note: bsize may not be a power of two here so we need to do an
         * actual divide rather than a bitshift.
index 5f2e255e770724e016f4beb3af910bdded6741a0..730cdf49a0dd18b235d2bf6bf5ebc98e155503e4 100644 (file)
 #define        BSWAP_64(x)     ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * Note: the boot loader can't actually read blocks larger than 128KB,
+ * due to lack of memory.  Therefore its SPA_MAXBLOCKSIZE is still 128KB.
  */
 #define        SPA_MINBLOCKSHIFT       9
 #define        SPA_MAXBLOCKSHIFT       17
 #define        SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)
 #define        SPA_MAXBLOCKSIZE        (1ULL << SPA_MAXBLOCKSHIFT)
 
-#define        SPA_BLOCKSIZES          (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
index 65d285893e942953f2a242588b26d20b1e01ac8e..52a355d0a5450c4653b04ca2e201870e7dc14193 100644 (file)
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon)
 {
        return ((c >= 'a' && c <= 'z') ||
            (c >= '0' && c <= '9') ||
-           c == (after_colon ? '_' : '.'));
+           (after_colon && c == '_') ||
+           (!after_colon && (c == '.' || c == '-')));
 }
 
 /*
@@ -220,4 +221,13 @@ zpool_feature_init(void)
            "com.delphix:embedded_data", "embedded_data",
            "Blocks which compress very well use even less space.",
            B_FALSE, B_TRUE, B_TRUE, NULL);
+
+       static const spa_feature_t large_blocks_deps[] = {
+               SPA_FEATURE_EXTENSIBLE_DATASET,
+               SPA_FEATURE_NONE
+       };
+       zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+           "org.open-zfs:large_blocks", "large_blocks",
+           "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+           large_blocks_deps);
 }
index 65016f1b5e2db7c49c8b16c6d732009ead296ca0..4ffe435ab68c0c6dd32bcfd4e0fadbec078f778a 100644 (file)
@@ -50,6 +50,7 @@ typedef enum spa_feature {
        SPA_FEATURE_EMBEDDED_DATA,
        SPA_FEATURE_BOOKMARKS,
        SPA_FEATURE_FS_SS_LIMIT,
+       SPA_FEATURE_LARGE_BLOCKS,
        SPA_FEATURES
 } spa_feature_t;
 
index bd023c70ef47017507b3870c1bae11485a694d99..dda72de8737e40f2f2d3261f5c8fd23818b6eef5 100644 (file)
@@ -409,8 +409,8 @@ zfs_prop_init(void)
 
        /* inherit number properties */
        zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
-           SPA_MAXBLOCKSIZE, PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+           SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+           ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
        /* hidden properties */
        zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
index a400f821e2e29b454e977e6d16bc91e23aff711d..4d906b02bc02e80a7b0aae7af66898b6e5d1ae79 100644 (file)
@@ -127,6 +127,8 @@ zpool_prop_init(void)
        /* hidden properties */
        zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
            PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+       zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
 }
 
 /*
index b19df1af39673cdae25132cadbe4853237f05500..05c40e38a7446613def38f1048fdd07ef31f054a 100644 (file)
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
-                           bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+                           bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -398,7 +398,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        if (bpo->bpo_phys->bpo_subobjs == 0) {
                bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-                   DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+                   DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+                   DMU_OT_NONE, 0, tx);
        }
 
        dmu_object_info_t doi;
index c724ed074103208569d5a9b32e2aac35b76565c5..5f7d76f0e2ade937abbfa557413935db247f1ac7 100644 (file)
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
        bptree_phys_t *bt;
 
        obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-           SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
            sizeof (bptree_phys_t), tx);
 
        /*
index c9ea6c02a13ec314d0ca3baece205935bf4d83e1..0b9a0b92434a46878c4fbf1e5c8e9cfdd8b3b05b 100644 (file)
@@ -2022,10 +2022,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
                return (SET_ERROR(ENOTSUP));
        if (blksz == 0)
                blksz = SPA_MINBLOCKSIZE;
-       if (blksz > SPA_MAXBLOCKSIZE)
-               blksz = SPA_MAXBLOCKSIZE;
-       else
-               blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+       ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+       blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
index 73b8e056cc8c588552da0c2e991a3bcb91bb1c9a..e7aeed17fb9d18eff85f1cdee1795d42fd500e41 100644 (file)
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
                zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+
+       os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                                    ZFS_PROP_REDUNDANT_METADATA),
                                    redundant_metadata_changed_cb, os);
                        }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   recordsize_changed_cb, os);
+                       }
                }
                if (err != 0) {
                        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
                        VERIFY0(dsl_prop_unregister(ds,
                            zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
                            redundant_metadata_changed_cb, os));
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                           recordsize_changed_cb, os));
                }
                VERIFY0(dsl_prop_unregister(ds,
                    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
index 1a0cab5d1cd89b892b483b6cfb61b3548a668b21..00d4f3e73e2a7cfa2423b09abccd16aff6f20a37 100644 (file)
@@ -227,11 +227,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
-       if (BP_IS_EMBEDDED(bp)) {
+       if (bp == NULL || BP_IS_EMBEDDED(bp)) {
                /*
-                * There's no pre-computed checksum of embedded BP's, so
-                * (like fletcher4-checkummed blocks) userland will have
-                * to compute a dedup-capable checksum itself.
+                * There's no pre-computed checksum for partial-block
+                * writes or embedded BP's, so (like
+                * fletcher4-checksummed blocks) userland will have to
+                * compute a dedup-capable checksum itself.
                 */
                drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
        } else {
@@ -393,6 +394,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
        drro->drr_compress = dnp->dn_compress;
        drro->drr_toguid = dsp->dsa_toguid;
 
+       if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+               drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));
 
@@ -512,6 +517,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
+               uint64_t offset;
 
                ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT0(zb->zb_level);
@@ -532,8 +538,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        }
                }
 
-               err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-                   blksz, bp, abuf->b_data);
+               offset = zb->zb_blkid * blksz;
+
+               if (!(dsp->dsa_featureflags &
+                   DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+                   blksz > SPA_OLD_MAXBLOCKSIZE) {
+                       char *buf = abuf->b_data;
+                       while (blksz > 0 && err == 0) {
+                               int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+                               err = dump_write(dsp, type, zb->zb_object,
+                                   offset, n, NULL, buf);
+                               offset += n;
+                               buf += n;
+                               blksz -= n;
+                       }
+               } else {
+                       err = dump_write(dsp, type, zb->zb_object,
+                           offset, blksz, bp, abuf->b_data);
+               }
                (void) arc_buf_remove_ref(abuf, &abuf);
        }
 
@@ -548,9 +570,9 @@ static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
 #ifdef illumos
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 #else
-    int outfd, struct file *fp, offset_t *off)
+    boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
 #endif
 {
        objset_t *os;
@@ -586,6 +608,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        }
 #endif
 
+       if (large_block_ok && ds->ds_large_blocks)
+               featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
        if (embedok &&
            spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
                featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -682,10 +706,11 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    int outfd, vnode_t *vp, offset_t *off)
 #else
-    boolean_t embedok, int outfd, struct file *fp, offset_t *off)
+    int outfd, struct file *fp, offset_t *off)
 #endif
 {
        dsl_pool_t *dp;
@@ -720,18 +745,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
                zb.zbm_guid = fromds->ds_phys->ds_guid;
                is_clone = (fromds->ds_dir != ds->ds_dir);
                dsl_dataset_rele(fromds, FTAG);
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, fp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, fp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, fp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, fp, off);
        }
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
@@ -802,11 +828,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, fp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, fp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, fp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, fp, off);
        }
        if (owned)
                dsl_dataset_disown(ds, FTAG);
@@ -1006,6 +1032,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));
 
+       /*
+        * The receiving code doesn't know how to translate large blocks
+        * to smaller ones, so the pool must have the LARGE_BLOCKS
+        * feature enabled if the stream has LARGE_BLOCKS.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+
        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
                /* target fs already exists; recv into temp clone */
@@ -1131,6 +1166,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
        }
        VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+       if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !newds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+               newds->ds_large_blocks = B_TRUE;
+       }
+
        dmu_buf_will_dirty(newds->ds_dbuf, tx);
        newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -1283,6 +1325,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
 
        /* some things will require 8-byte alignment, so everything must */
        ASSERT0(len % 8);
+       ASSERT3U(len, <=, ra->bufsize);
 
        while (done < len) {
                ssize_t resid;
@@ -1420,7 +1463,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
-           drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
            drro->drr_bonuslen > DN_MAX_BONUSLEN) {
                return (SET_ERROR(EINVAL));
        }
@@ -1693,7 +1736,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
        int err;
 
        if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-           drrs->drr_length > SPA_MAXBLOCKSIZE)
+           drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
                return (SET_ERROR(EINVAL));
 
        data = restore_read(ra, drrs->drr_length, NULL);
@@ -1781,7 +1824,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
        ra.td = curthread;
        ra.fp = fp;
        ra.voff = *voffp;
-       ra.bufsize = 1<<20;
+       ra.bufsize = SPA_MAXBLOCKSIZE;
        ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
index ed9757ecd4572c1a8c4ae88b887cb539bc6b887e..b97040ae870a2766573718c5be78e39aa4543bd9 100644 (file)
@@ -224,7 +224,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                return;
 
        min_bs = SPA_MINBLOCKSHIFT;
-       max_bs = SPA_MAXBLOCKSHIFT;
+       max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
        min_ibs = DN_MIN_INDBLKSHIFT;
        max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -293,6 +293,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                         */
                        ASSERT(dn->dn_datablkshift != 0);
                        min_bs = max_bs = dn->dn_datablkshift;
+               } else {
+                       /*
+                        * The blocksize can increase up to the recordsize,
+                        * or if it is already more than the recordsize,
+                        * up to the next power of 2.
+                        */
+                       min_bs = highbit64(dn->dn_datablksz - 1);
+                       max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
                }
 
                /*
@@ -751,11 +759,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                bp = &dn->dn_phys->dn_blkptr[0];
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += MZAP_MAX_BLKSZ;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += MZAP_MAX_BLKSZ;
                return;
        }
 
@@ -1549,18 +1557,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 
        /* If blkptr doesn't exist then add space to towrite */
        if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-               txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+               txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
        } else {
                blkptr_t *bp;
 
                bp = &dn->dn_phys->dn_spill;
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
        }
 }
 
index 0b19e76343f0d98d20026d97bdcc186e7657a1b6..b39f6b11d26f853e11102db9a033ca7ae01289ad 100644 (file)
@@ -513,10 +513,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 {
        int i;
 
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -597,7 +597,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
@@ -1352,10 +1353,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
        dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
index a3efe9293cb31255f4de4c9f99464432b860d497..8ac29d61216112f77cc6effca351665c23c57d9e 100644 (file)
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
 #define        SWITCH64(x, y) \
        { \
                uint64_t __tmp = (x); \
@@ -60,8 +71,6 @@
 
 #define        DS_REF_MAX      (1ULL << 62)
 
-#define        DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
-
 /*
  * Figure out how much of this delta should be propogated to the dsl_dir
  * layer.  If there's a refreservation, that space has already been
@@ -111,6 +120,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
        ds->ds_phys->ds_compressed_bytes += compressed;
        ds->ds_phys->ds_uncompressed_bytes += uncompressed;
        ds->ds_phys->ds_unique_bytes += used;
+       if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+               ds->ds_need_large_blocks = B_TRUE;
        mutex_exit(&ds->ds_lock);
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
            compressed, uncompressed, tx);
@@ -392,6 +403,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
                    offsetof(dmu_sendarg_t, dsa_link));
 
+               if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+                       /* ENOENT == no large blocks; must not leak into err */
+                       err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+                       if (err == 0)
+                               ds->ds_large_blocks = B_TRUE;
+                       err = (err == ENOENT) ? 0 : err;
+               }
+
                if (err == 0) {
                        err = dsl_dir_hold_obj(dp,
                            ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
@@ -707,6 +726,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                dsphys->ds_flags |= origin->ds_phys->ds_flags &
                    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
+               if (origin->ds_large_blocks)
+                       dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                origin->ds_phys->ds_num_children++;
 
@@ -1262,6 +1284,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
        dsphys->ds_bp = ds->ds_phys->ds_bp;
        dmu_buf_rele(dbuf, FTAG);
 
+       if (ds->ds_large_blocks)
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
        ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
        if (ds->ds_prev) {
                uint64_t next_clones_obj =
@@ -1546,6 +1571,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
        ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
 
        dmu_objset_sync(ds->ds_objset, zio, tx);
+
+       if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+               ds->ds_large_blocks = B_TRUE;
+       }
 }
 
 static void
@@ -3231,6 +3261,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
        return (err);
 }
 
+/*
+ * Check callback for dsl_dataset_activate_large_blocks(): ENOTSUP if the
+ * pool feature is not enabled, EALREADY if the dataset is already flagged.
+ */
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+       dsl_pool_t *pool = dmu_tx_pool(tx);
+       spa_t *spa = pool->dp_spa;
+       dsl_dataset_t *ds;
+       int rc;
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+       ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_EXTENSIBLE_DATASET));
+
+       rc = dsl_dataset_hold(pool, arg, FTAG, &ds);
+       if (rc != 0)
+               return (rc);
+
+       rc = ds->ds_large_blocks ? EALREADY : 0;
+       dsl_dataset_rele(ds, FTAG);
+       return (rc);
+}
+
+/* Bump the feature refcount and tag dataset object 'dsobj' in the MOS. */
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
+       uint64_t value = 0;
+
+       spa_feature_incr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+       VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+           sizeof (value), 1, &value, tx));
+}
+
+/* Sync callback: activate large blocks on the dataset named by 'arg'. */
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold(dp, arg, FTAG, &ds));
+       dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+       ASSERT(!ds->ds_large_blocks);
+       ds->ds_large_blocks = B_TRUE;
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Run a sync task that marks dataset 'dsname' as able to hold blocks
+ * larger than 128KB.  Returns 0 on success or an errno from the task.
+ */
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+       int rc = dsl_sync_task(dsname,
+           dsl_dataset_activate_large_blocks_check,
+           dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+           1, ZFS_SPACE_CHECK_RESERVED);
+
+       /* Already activated is not a failure from the caller's view. */
+       if (rc == EALREADY)
+               return (0);
+       return (rc);
+}
+
 /*
  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
  * For example, they could both be snapshots of the same filesystem, and
index 4f39c397a0643a7f154c8587a7258b1e3dff6ada..8c8e3746eecb59fb2a2af5aea221e84b998d6085 100644 (file)
@@ -143,7 +143,7 @@ uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
-               return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+               return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
        return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
            sizeof (dsl_deadlist_phys_t), tx));
 }
@@ -180,7 +180,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 {
        if (dle->dle_bpobj.bpo_object ==
            dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
-               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                bpobj_close(&dle->dle_bpobj);
                bpobj_decr_empty(dl->dl_os, tx);
                VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -254,7 +254,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
        dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
        dle->dle_mintxg = mintxg;
-       obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
        VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);
 
@@ -338,7 +338,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
                if (dle->dle_mintxg >= maxtxg)
                        break;
 
-               obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
index f8a4546535e4c2de892f2a0519fab52b5e5fc795..1237641583a1bf32bd76b8b71cbdd497c9963de9 100644 (file)
@@ -264,6 +264,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
        obj = ds->ds_object;
 
+       if (ds->ds_large_blocks) {
+               ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       }
        if (ds->ds_phys->ds_prev_snap_obj != 0) {
                ASSERT3P(ds->ds_prev, ==, NULL);
                VERIFY0(dsl_dataset_hold_obj(dp,
@@ -720,6 +724,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
                ASSERT0(ds->ds_reserved);
        }
 
+       if (ds->ds_large_blocks)
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
        dsl_scan_ds_destroyed(ds, tx);
 
        obj = ds->ds_object;
index 53371e642294b1ffff11a2930268a5d3eda7f7ec..08e79ca5b8af5f5d877774d2c3e203a24425d2a4 100644 (file)
@@ -467,7 +467,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
                    FREE_DIR_NAME, &dp->dp_free_dir));
 
                /* create and open the free_bplist */
-               obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
                VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -892,7 +892,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
         * subobj support.  So call dmu_object_alloc() directly.
         */
        obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
-           SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
        VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
        VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
index c2bcbfd0ccc7df7c46119958baf38f3524a47232..9e53f08a6806eb09722fe92c48dbc4045dbe9cc4 100644 (file)
@@ -153,7 +153,7 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
     &metaslab_df_alloc_threshold, 0,
     "Minimum size which forces the dynamic allocator to change it's allocation strategy");
index 5d8c731c886c5d7a77bc52a94e7bbd548b634802..84b39dd02929d57261c1ced10e440a98506042ae 100644 (file)
@@ -500,7 +500,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 
        if (size == 0) {
                blocksize = SPA_MINBLOCKSIZE;
-       } else if (size > SPA_MAXBLOCKSIZE) {
+       } else if (size > SPA_OLD_MAXBLOCKSIZE) {
                ASSERT(0);
                return (SET_ERROR(EFBIG));
        } else {
@@ -675,7 +675,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
        hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
            SA_BONUS, &i, &used, &spilling);
 
-       if (used > SPA_MAXBLOCKSIZE)
+       if (used > SPA_OLD_MAXBLOCKSIZE)
                return (SET_ERROR(EFBIG));
 
        VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -699,7 +699,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
                    &spill_used, &dummy);
 
-               if (spill_used > SPA_MAXBLOCKSIZE)
+               if (spill_used > SPA_OLD_MAXBLOCKSIZE)
                        return (SET_ERROR(EFBIG));
 
                buf_space = hdl->sa_spill->db_size - spillhdrsize;
index 2cdfeb0f24429593361ac50520515a7894584e8a..9c62669e9b0abbbbbdeea2448278acd380dcba02 100644 (file)
@@ -287,6 +287,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);
 
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+       } else {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+       }
+
        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -501,7 +509,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                        if (!error) {
                                objset_t *os;
-                               uint64_t compress;
+                               uint64_t propval;
 
                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
@@ -512,15 +520,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                if (error = dmu_objset_hold(strval, FTAG, &os))
                                        break;
 
-                               /* Must be ZPL and not gzip compressed. */
+                               /*
+                                * Must be ZPL, and its property settings
+                                * must be supported by GRUB (compression
+                                * is not gzip, and large blocks are not used).
+                                */
 
                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = SET_ERROR(ENOTSUP);
                                } else if ((error =
                                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-                                   &compress)) == 0 &&
-                                   !BOOTFS_COMPRESS_VALID(compress)) {
+                                   &propval)) == 0 &&
+                                   !BOOTFS_COMPRESS_VALID(propval)) {
+                                       error = SET_ERROR(ENOTSUP);
+                               } else if ((error =
+                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   &propval)) == 0 &&
+                                   propval > SPA_OLD_MAXBLOCKSIZE) {
                                        error = SET_ERROR(ENOTSUP);
                                } else {
                                        objnum = dmu_objset_id(os);
index 83e521765ac08da36cf09405c5e2d9938f8b6b0e..9fc7a463c6536751448f62828c40d2a23ad762f7 100644 (file)
@@ -90,7 +90,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
 
        ASSERT(spa->spa_history == 0);
        spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-           SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
            sizeof (spa_history_phys_t), tx);
 
        VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
index c3a42d026b608f1edaa847f85701999ef5ef58f3..659dddd0178d7c41c8d9a60adf5f65aea02422dc 100644 (file)
@@ -2048,3 +2048,12 @@ spa_debug_enabled(spa_t *spa)
 {
        return (spa->spa_debug);
 }
+
+/* Maximum block size usable on this pool (depends on large_blocks). */
+int
+spa_maxblocksize(spa_t *spa)
+{
+       boolean_t big = spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS);
+
+       return (big ? SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
+}
index ad1926646871f69f11df364b3f4cbd90787b7403..d85d872f89da86f2fd3b5b7c2962d16e22b3ff7a 100644 (file)
@@ -249,7 +249,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
-#define        DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define        DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define        DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define        DMU_USERUSED_OBJECT     (-1ULL)
@@ -646,6 +646,7 @@ void xuio_stat_wbuf_copied();
 void xuio_stat_wbuf_nocopy();
 
 extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
index 23d88fd048bd96a1e35ce10e23761fe24208a1a9..804f0c182b6f193590a822513f532e9cc5d400c0 100644 (file)
@@ -95,6 +95,7 @@ struct objset {
        zfs_cache_type_t os_secondary_cache;
        zfs_sync_type_t os_sync;
        zfs_redundant_metadata_type_t os_redundant_metadata;
+       int os_recordsize;
 
        /* no lock needed: */
        struct dmu_tx *os_synctx; /* XXX sketchy */
index b5d617025bd021d1950fd9504eb44efd2685e825..b03cb0976e6d01278300310e5a8a0f7cb0f5d165 100644 (file)
@@ -36,7 +36,8 @@ struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
 
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, struct vnode *vp, offset_t *off);
 #else
@@ -45,10 +46,11 @@ int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+    int outfd, struct vnode *vp, offset_t *off);
 #else
-    boolean_t embedok, int outfd, struct file *fp, offset_t *off);
+    int outfd, struct file *fp, offset_t *off);
 #endif
 
 typedef struct dmu_recv_cookie {
index d9552b2260aff1345b637856202ad71d64ffb742..ff90f8b439ccf7a137adcf10ba2c018bd9ae30aa 100644 (file)
@@ -82,6 +82,13 @@ struct dsl_pool;
  */
 #define        DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
 
+/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB).  If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define        DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
@@ -135,6 +142,8 @@ typedef struct dsl_dataset {
        /* only used in syncing context, only valid for non-snapshots: */
        struct dsl_dataset *ds_prev;
        uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
+       boolean_t ds_large_blocks;
+       boolean_t ds_need_large_blocks;
 
        /* has internal locking: */
        dsl_deadlist_t ds_deadlist;
@@ -244,6 +253,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
index f62e3151f87101d6d6492699e14f2b7eae00a868..a5f32e837be0558b912d92d39215034b2ca09a77 100644 (file)
@@ -94,17 +94,26 @@ _NOTE(CONSTCOND) } while (0)
 _NOTE(CONSTCOND) } while (0)
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define        SPA_MINBLOCKSHIFT       9
-#define        SPA_MAXBLOCKSHIFT       17
+#define        SPA_OLD_MAXBLOCKSHIFT   17
+#define        SPA_MAXBLOCKSHIFT       24
 #define        SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)
+#define        SPA_OLD_MAXBLOCKSIZE    (1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define        SPA_MAXBLOCKSIZE        (1ULL << SPA_MAXBLOCKSHIFT)
 
-#define        SPA_BLOCKSIZES          (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * Default maximum supported logical ashift.
  *
@@ -801,6 +810,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
 
 extern int spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
index 1dc322e02f6f25419a79d16294a1c4ddd5a755be..29ebae53849dd0c11ec15597cc16a841ab4a5d17 100644 (file)
@@ -41,8 +41,7 @@ extern int fzap_default_block_shift;
 
 #define        MZAP_ENT_LEN            64
 #define        MZAP_NAME_LEN           (MZAP_ENT_LEN - 8 - 4 - 2)
-#define        MZAP_MAX_BLKSHIFT       SPA_MAXBLOCKSHIFT
-#define        MZAP_MAX_BLKSZ          (1 << MZAP_MAX_BLKSHIFT)
+#define        MZAP_MAX_BLKSZ          SPA_OLD_MAXBLOCKSIZE
 
 #define        ZAP_NEED_CD             (-1U)
 
index 73fbf3c9c0eaf07d0562113c15f9c80ec443568e..71ca044013411b37416682c4f074014dd2ed59e5 100644 (file)
@@ -85,13 +85,16 @@ typedef enum drr_headertype {
 /* flags #3 - #15 are reserved for incompatible closed-source implementations */
 #define        DMU_BACKUP_FEATURE_EMBED_DATA           (1<<16)
 #define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1<<19)
 
 /*
  * Mask of all supported backup features
  */
 #define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
     DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
-    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+    DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
index dae099b7915bbf5e97460f36fb9b39a7aaf0a0da..3fb66c864ee93da95e28e5c4e0221591420dcbe9 100644 (file)
@@ -133,8 +133,6 @@ extern "C" {
 #define        ZFS_SHARES_DIR          "SHARES"
 #define        ZFS_SA_ATTRS            "SA_ATTRS"
 
-#define        ZFS_MAX_BLOCKSIZE       (SPA_MAXBLOCKSIZE)
-
 /*
  * Path component length
  *
index 15ef2aa8bf985c12fed39ce5135702a8094d51c8..895d632a2620d13b7394242c0d39537edb563de7 100644 (file)
@@ -90,7 +90,6 @@ typedef struct zil_chain {
 } zil_chain_t;
 
 #define        ZIL_MIN_BLKSZ   4096ULL
-#define        ZIL_MAX_BLKSZ   SPA_MAXBLOCKSIZE
 
 /*
  * The words of a log block checksum.
index 58566203b697cefed7e96c6ab76756413c3926b6..b5c666c02b7385cfe11634e3ecf6a28ae78308d6 100644 (file)
@@ -139,7 +139,7 @@ typedef struct zil_bp_node {
        avl_node_t      zn_node;
 } zil_bp_node_t;
 
-#define        ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define        ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef __cplusplus
index dea1a8fa7bf1cf91e8731be3d2bc8520688869e7..356b5d5f6ca03f872bc9815900955fed6ea7dfa7 100644 (file)
@@ -926,9 +926,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        /*
         * Compute the raidz-deflation ratio.  Note, we hard-code
-        * in 128k (1 << 17) because it is the current "typical" blocksize.
-        * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
-        * or we will inconsistently account for existing bp's.
+        * in 128k (1 << 17) because it is the "typical" blocksize.
+        * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+        * otherwise it would inconsistently account for existing bp's.
         */
        vd->vdev_deflate_ratio = (1 << 17) /
            (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
index 395ea29ff187e9320976b74d73c7e36451055a74..962e01d6ee5d7eef754558147d7cae4ecd743472 100644 (file)
@@ -170,7 +170,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
index 00045e1d528a3fa2f369f71e7165efd6405be301..6b538cf56d61095fb9df331f55d9ecff694423b9 100644 (file)
@@ -1618,7 +1618,7 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
        /*
         * Don't write past the end of the block
         */
-       VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+       VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
 
        start = offset;
        end = start + size;
@@ -1633,8 +1633,8 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
         * KB size.
         */
        rm = vdev_raidz_map_alloc(data - (offset - origoffset),
-           SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
-           vd->vdev_nparity);
+           SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+           vd->vdev_children, vd->vdev_nparity);
 
        coloffset = origoffset;
 
index 4e88a53fe2f942e33432633bae1f8dace875e4a9..4ed8aac29851de0cefbbeb28f4ac8593d1ed50ad 100644 (file)
@@ -33,6 +33,7 @@
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
+#include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
@@ -664,9 +665,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
        uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
        ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-           leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+           leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
            indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-           indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+           indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
        VERIFY(dmu_object_set_blocksize(os, obj,
            1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1396,7 +1397,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
        zap_t *zap;
        int err = 0;
 
-
        /*
         * Since, we don't have a name, we cannot figure out which blocks will
         * be affected in this operation. So, account for the worst case :
@@ -1409,7 +1409,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
         * large microzap results in a promotion to fatzap.
         */
        if (name == NULL) {
-               *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+               *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                return (err);
        }
 
@@ -1433,7 +1433,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                        /*
                         * We treat this case as similar to (name == NULL)
                         */
-                       *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+                       *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                }
        } else {
                /*
@@ -1452,12 +1452,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                 *                      ptrtbl blocks
                 */
                if (dmu_buf_freeable(zap->zap_dbuf))
-                       *tooverwrite += SPA_MAXBLOCKSIZE;
+                       *tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       *towrite += SPA_MAXBLOCKSIZE;
+                       *towrite += MZAP_MAX_BLKSZ;
 
                if (add) {
-                       *towrite += 4 * SPA_MAXBLOCKSIZE;
+                       *towrite += 4 * MZAP_MAX_BLKSZ;
                }
        }
 
index 5f42c79223f5bb89bbefbf3f7a89576fdf43f2fd..5595348e9e104f67afb99e7135bbb8d361b48be7 100644 (file)
@@ -2433,7 +2433,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
        const char *propname = nvpair_name(pair);
        zfs_prop_t prop = zfs_name_to_prop(propname);
        uint64_t intval;
-       int err;
+       int err = -1;
 
        if (prop == ZPROP_INVAL) {
                if (zfs_prop_userquota(propname))
@@ -3864,8 +3864,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                 * the SPA supports it. We ignore any errors here since
                 * we'll catch them later.
                 */
-               if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
-                   nvpair_value_uint64(pair, &intval) == 0) {
+               if (nvpair_value_uint64(pair, &intval) == 0) {
                        if (intval >= ZIO_COMPRESS_GZIP_1 &&
                            intval <= ZIO_COMPRESS_GZIP_9 &&
                            zfs_earlier_version(dsname,
@@ -3916,6 +3915,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                        return (SET_ERROR(ENOTSUP));
                break;
 
+       case ZFS_PROP_RECORDSIZE:
+               /* Record sizes above 128k need the feature to be enabled */
+               if (nvpair_value_uint64(pair, &intval) == 0 &&
+                   intval > SPA_OLD_MAXBLOCKSIZE) {
+                       spa_t *spa;
+
+                       /*
+                        * If this is a bootable dataset then
+                        * we don't allow large (>128K) blocks,
+                        * because GRUB doesn't support them.
+                        */
+                       if (zfs_is_bootfs(dsname) &&
+                           intval > SPA_OLD_MAXBLOCKSIZE) {
+                               return (SET_ERROR(EDOM));
+                       }
+
+                       /*
+                        * We don't allow setting the property above 1MB,
+                        * unless the tunable has been changed.
+                        */
+                       if (intval > zfs_max_recordsize ||
+                           intval > SPA_MAXBLOCKSIZE)
+                               return (SET_ERROR(EDOM));
+
+                       if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+                               return (err);
+
+                       if (!spa_feature_is_enabled(spa,
+                           SPA_FEATURE_LARGE_BLOCKS)) {
+                               spa_close(spa, FTAG);
+                               return (SET_ERROR(ENOTSUP));
+                       }
+                       spa_close(spa, FTAG);
+               }
+               break;
+
        case ZFS_PROP_SHARESMB:
                if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
                        return (SET_ERROR(ENOTSUP));
@@ -4344,7 +4379,7 @@ out:
  * zc_fromobj  objsetid of incremental fromsnap (may be zero)
  * zc_guid     if set, estimate size of stream only.  zc_cookie is ignored.
  *             output size in zc_objset_type.
- * zc_flags    if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags    lzc_send_flags
  *
  * outputs:
  * zc_objset_type      estimated size, if zc_guid is set
@@ -4356,6 +4391,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
        offset_t off;
        boolean_t estimate = (zc->zc_guid != 0);
        boolean_t embedok = (zc->zc_flags & 0x1);
+       boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
        if (zc->zc_obj != 0) {
                dsl_pool_t *dp;
@@ -4420,10 +4456,11 @@ zfs_ioc_send(zfs_cmd_t *zc)
 
                off = fp->f_offset;
                error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+                   zc->zc_fromobj, embedok, large_block_ok,
 #ifdef illumos
-                   zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+                   zc->zc_cookie, fp->f_vnode, &off);
 #else
-                   zc->zc_fromobj, embedok, zc->zc_cookie, fp, &off);
+                   zc->zc_cookie, fp, &off);
 #endif
 
                if (off >= 0 && off <= MAXOFFSET_T)
@@ -5361,6 +5398,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc)
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "largeblockok" -> (value ignored)
+ *         presence indicates that blocks > 128KB are permitted
  *     (optional) "embedok" -> (value ignored)
  *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  * }
@@ -5376,6 +5415,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
        offset_t off;
        char *fromname = NULL;
        int fd;
+       boolean_t largeblockok;
        boolean_t embedok;
 
        error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5384,6 +5424,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 
        (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
+       largeblockok = nvlist_exists(innvl, "largeblockok");
        embedok = nvlist_exists(innvl, "embedok");
 
        file_t *fp = getf(fd, cap_rights_init(&rights, CAP_READ));
@@ -5391,10 +5432,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
                return (SET_ERROR(EBADF));
 
        off = fp->f_offset;
+       error = dmu_send(snapname, fromname, embedok, largeblockok,
 #ifdef illumos
-       error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+           fd, fp->f_vnode, &off);
 #else
-       error = dmu_send(snapname, fromname, embedok, fd, fp, &off);
+           fd, fp, &off);
 #endif
 
 #ifdef illumos
index 3029f7dc4c74dd9e584f8b66183664cee209c8d1..7432290d218c4c56dffa4d4e78acfd9349790990 100644 (file)
@@ -490,7 +490,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                 * If the write would overflow the largest block then split it.
                 */
                if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-                       len = SPA_MAXBLOCKSIZE >> 1;
+                       len = SPA_OLD_MAXBLOCKSIZE >> 1;
                else
                        len = resid;
 
index ea17daf81f9393bf79188bf546572a2eeaffb6be..15d34df9a0bdb9db3487d2c328c0b08d63172a28 100644 (file)
@@ -270,10 +270,9 @@ static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
        zfsvfs_t *zfsvfs = arg;
-
-       if (newval < SPA_MINBLOCKSIZE ||
-           newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
-               newval = SPA_MAXBLOCKSIZE;
+       ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+       ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+       ASSERT(ISP2(newval));
 
        zfsvfs->z_max_blksz = newval;
        zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
@@ -900,7 +899,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
         */
        zfsvfs->z_vfs = NULL;
        zfsvfs->z_parent = zfsvfs;
-       zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+       zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
        zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
        zfsvfs->z_os = os;
 
index 5fec709c9bb3f2cd68390df207529fa58793062a..46d0b6de9fb16e7a9f1294c942a663f59424be95 100644 (file)
@@ -1023,8 +1023,14 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
                        uint64_t new_blksz;
 
                        if (zp->z_blksz > max_blksz) {
+                               /*
+                                * File's blocksize is already larger than the
+                                * "recordsize" property.  Only let it grow to
+                                * the next power of 2.
+                                */
                                ASSERT(!ISP2(zp->z_blksz));
-                               new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+                               new_blksz = MIN(end_size,
+                                   1 << highbit64(zp->z_blksz));
                        } else {
                                new_blksz = MIN(end_size, max_blksz);
                        }
index d92597e4b0e867a2601a14b6f8db0a44431b55d7..f92ddd446b89b4596cb85574739ed3d5000fb61d 100644 (file)
@@ -54,6 +54,7 @@
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
@@ -1543,8 +1544,13 @@ zfs_extend(znode_t *zp, uint64_t end)
                 * We are growing the file past the current block size.
                 */
                if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+                       /*
+                        * File's blocksize is already larger than the
+                        * "recordsize" property.  Only let it grow to
+                        * the next power of 2.
+                        */
                        ASSERT(!ISP2(zp->z_blksz));
-                       newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+                       newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
                } else {
                        newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
                }
index c26e53cea63571c19d1996c1fe6cc3dad18701d1..2084d88ff68590511714233a54e5d73c7f1f79b5 100644 (file)
@@ -229,6 +229,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, len);
                                *end = (char *)dst + len;
                                *nbp = zilc->zc_next_blk;
@@ -243,6 +244,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            (zilc->zc_nused > (size - sizeof (*zilc)))) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(zilc->zc_nused, <=,
+                                   SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, zilc->zc_nused);
                                *end = (char *)dst + zilc->zc_nused;
                                *nbp = zilc->zc_next_blk;
@@ -326,7 +329,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
         * If the log has been claimed, stop if we encounter a sequence
         * number greater than the highest claimed sequence number.
         */
-       lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+       lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
        zil_bp_tree_init(zilog);
 
        for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -373,7 +376,7 @@ done:
            (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
        zil_bp_tree_fini(zilog);
-       zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+       zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
        return (error);
 }
@@ -905,7 +908,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,              /* non TX_WRITE */
@@ -987,7 +990,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
                continue;
        zil_blksz = zil_block_buckets[i];
        if (zil_blksz == UINT64_MAX)
-               zil_blksz = SPA_MAXBLOCKSIZE;
+               zil_blksz = SPA_OLD_MAXBLOCKSIZE;
        zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
        for (i = 0; i < ZIL_PREV_BLKS; i++)
                zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
index 45efb22f1a3f07b04e586ddb545c49fb6e874cbb..c06ad57bde3caa7c211704dd8f2c34aa90b4d126 100644 (file)
@@ -142,9 +142,8 @@ zio_init(void)
 
        /*
         * For small buffers, we want a cache for each multiple of
-        * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
-        * for each quarter-power of 2.  For large buffers, we want
-        * a cache for each multiple of PAGESIZE.
+        * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+        * for each quarter-power of 2.
         */
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -169,10 +168,8 @@ zio_init(void)
 #endif /* illumos */
                if (size <= 4 * SPA_MINBLOCKSIZE) {
                        align = SPA_MINBLOCKSIZE;
-               } else if (IS_P2ALIGNED(size, PAGESIZE)) {
-                       align = PAGESIZE;
                } else if (IS_P2ALIGNED(size, p2 >> 2)) {
-                       align = p2 >> 2;
+                       align = MIN(p2 >> 2, PAGESIZE);
                }
 
                if (align != 0) {
index 23480cc57160d4c7ece1a9df0841019155172c25..1776ed31114055edd0d656c2b5b45f6980ccaf97 100644 (file)
@@ -258,7 +258,7 @@ int
 zvol_check_volblocksize(uint64_t volblocksize)
 {
        if (volblocksize < SPA_MINBLOCKSIZE ||
-           volblocksize > SPA_MAXBLOCKSIZE ||
+           volblocksize > SPA_OLD_MAXBLOCKSIZE ||
            !ISP2(volblocksize))
                return (SET_ERROR(EDOM));
 
@@ -828,7 +828,7 @@ zvol_prealloc(zvol_state_t *zv)
 
        while (resid != 0) {
                int error;
-               uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+               uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
 
                tx = dmu_tx_create(os);
                dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -1866,7 +1866,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
                (void) strcpy(dki.dki_dname, "zvol");
                dki.dki_ctype = DKC_UNKNOWN;
                dki.dki_unit = getminor(dev);
-               dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+               dki.dki_maxtransfer =
+                   1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
                mutex_exit(&spa_namespace_lock);
                if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
                        error = SET_ERROR(EFAULT);
@@ -2185,14 +2186,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
                    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
                    &vbs, tx);
                error = error ? error : dmu_object_set_blocksize(
-                   os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+                   os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
                if (version >= SPA_VERSION_DEDUP) {
                        error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
                            zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
                            &dedup, tx);
                }
                if (error == 0)
-                       zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+                       zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
        }
        dmu_tx_commit(tx);
 
index 87d0650badd5689a45c6a843b8f917419c0352e4..16d528e025d5ffb58651b55361fe8bf0bf6c08bc 100644 (file)
@@ -196,6 +196,7 @@ typedef enum {
        ZPOOL_PROP_FREEING,
        ZPOOL_PROP_FRAGMENTATION,
        ZPOOL_PROP_LEAKED,
+       ZPOOL_PROP_MAXBLOCKSIZE,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;