From bf8c61f489e07ddcfed246768059b37808b7f6e5 Mon Sep 17 00:00:00 2001 From: Seth Hoffert Date: Tue, 3 Sep 2024 19:52:33 -0500 Subject: [PATCH 01/11] Remove unused sysctl node PR #14953 removed vdev-level read cache but accidentally left this sysctl node behind. Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Seth Hoffert Closes #16493 --- module/os/freebsd/zfs/sysctl_os.c | 1 - 1 file changed, 1 deletion(-) diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 30983b13f7d1..c84cb7407a9c 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); -SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); From 4a4f7b019fa57e2a196e95492aecbed1f312be3a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jun 2024 14:11:11 +1000 Subject: [PATCH 02/11] zdb: rework dedup accounting for log, quota and prune The simplest thing first: add the FDT and log objects to the list of objects to be considered when checking for leaks. The rest is based on a conceptual change in all of this patch stack: a block on disk with a 'D' bit is not necessarily in the DDT at all (pruned), or in the DDT ZAPs (still on the log). As such, walking the DDT up front is difficult (for all the reasons that walking an unflushed log is difficult) and not really useful, since it's not a reflection of what's on disk anyway. Instead, we rework things here to be more like the BRT checks. When we see a dedup'd block, we look it up in the DDT, consume a refcount, and for the second-or-later instances, count them as duplicates. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Co-authored-by: Don Brady Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #16277 --- module/zfs/ddt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index bd1941f43adf..11fd10fb769d 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -789,6 +789,9 @@ ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { + if (dde == NULL) + return (DDT_PHYS_NONE); + const ddt_univ_phys_t *ddp = dde->dde_phys; if (ddt->ddt_flags & DDT_FLAG_FLAT) { From d4d79451cb87aa0d93f9068ce5844098a5ebe3b5 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 17 Jun 2024 22:35:18 +0000 Subject: [PATCH 03/11] Add DDT prune command Requires the new 'flat' physical data which has the start time for a class entry. The amount to prune can be based on a target percentage of the unique entries or based on the age (i.e., every entry older than N days). Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #16277 --- cmd/zdb/zdb.c | 55 ++- cmd/zpool/zpool_main.c | 89 +++++ cmd/ztest.c | 28 ++ contrib/debian/openzfs-zfsutils.install | 1 + include/libzfs.h | 3 + include/libzfs_core.h | 3 + include/sys/ddt.h | 3 + include/sys/ddt_impl.h | 52 ++- include/sys/fs/zfs.h | 15 +- include/sys/spa_impl.h | 1 + lib/libzfs/libzfs.abi | 67 +++- lib/libzfs/libzfs_pool.c | 28 ++ lib/libzfs_core/libzfs_core.abi | 15 + lib/libzfs_core/libzfs_core.c | 22 ++ man/Makefile.am | 1 + man/man8/zpool-ddtprune.8 | 48 +++ man/man8/zpool.8 | 1 + module/zfs/ddt.c | 474 +++++++++++++++++++++--- module/zfs/ddt_log.c | 24 +- module/zfs/zfs_ioctl.c | 50 +++ module/zfs/zio.c | 10 + 21 files changed, 905 insertions(+), 85 deletions(-) create mode 100644 man/man8/zpool-ddtprune.8 diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 41c2b6765585..8e3b6972ae04 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; @@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa) } dump_dedup_ratio(&dds_total); + + /* + * Dump a histogram of unique class entry age + */ + if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { + ddt_age_histo_t histogram; + + (void) printf("DDT walk unique, building age histogram...\n"); + ddt_prune_walk(spa, 0, &histogram); + + /* + * print out histogram for unique entry class birth + */ + if (histogram.dah_entries > 0) { + (void) printf("%5s %9s %4s\n", + "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", + "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9d %4d%%\n", 1 << i, + (int)histogram.dah_age_histo[i], + 
(int)((histogram.dah_age_histo[i] * 100) / + histogram.dah_entries)); + } + } + } } static void @@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, ddt_entry_t *dde = ddt_lookup(ddt, bp); /* - * ddt_lookup() can only return NULL if this block didn't exist + * ddt_lookup() can return NULL if this block didn't exist * in the DDT and creating it would take the DDT over its * quota. Since we got the block from disk, it must exist in - * the DDT, so this can't happen. + * the DDT, so this can't happen. However, when unique entries + * are pruned, the dedup bit can be set with no corresponding + * entry in the DDT. */ - VERIFY3P(dde, !=, NULL); + if (dde == NULL) { + ddt_exit(ddt); + goto skipped; + } /* Get the phys for this variant */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); @@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); - ddt_phys_decref(dde->dde_phys, v); + if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) + ddt_phys_decref(dde->dde_phys, v); /* * If this entry has a single flat phys, it may have been @@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, } } +skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? 
type : ZDB_OT_TOTAL; @@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa) for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; /* DDT store objects */ @@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa) } /* FDT container */ - mos_obj_refd(ddt->ddt_dir_object); + if (ddt->ddt_version == DDT_VERSION_FDT) + mos_obj_refd(ddt->ddt_dir_object); /* FDT log objects */ - mos_obj_refd(ddt->ddt_log[0].ddl_object); - mos_obj_refd(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); + } } if (spa->spa_brt != NULL) { diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9cd26a8650ad..ce859226c215 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -130,6 +130,8 @@ static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); +static int zpool_do_ddt_prune(int, char **); + static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( @@ -170,6 +172,7 @@ typedef enum { HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, + HELP_DDT_PRUNE, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, @@ -426,6 +429,8 @@ static zpool_command_t command_table[] = { { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, + { NULL }, + { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) @@ -545,6 +550,8 @@ get_usage(zpool_help_t idx) case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); + case HELP_DDT_PRUNE: + return (gettext("\tddtprune -d|-p \n")); default: __builtin_unreachable(); } @@ -13342,6 +13349,88 @@ found:; return (error); } +/* + * zpool ddtprune -d|-p + * + * -d Prune entries old and older + * -p Prune amount of entries + * + * Prune single reference entries from DDT to satisfy the amount specified. 
+ */ +int +zpool_do_ddt_prune(int argc, char **argv) +{ + zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE; + uint64_t amount = 0; + zpool_handle_t *zhp; + char *endptr; + int c; + + while ((c = getopt(argc, argv, "d:p:")) != -1) { + switch (c) { + case 'd': + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with -p option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || amount == 0) { + (void) fprintf(stderr, + gettext("invalid days value\n")); + usage(B_FALSE); + } + amount *= 86400; /* convert days to seconds */ + unit = ZPOOL_DDT_PRUNE_AGE; + break; + case 'p': + if (unit == ZPOOL_DDT_PRUNE_AGE) { + (void) fprintf(stderr, gettext("-p cannot be " + "combined with -d option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || + amount == 0 || amount > 100) { + (void) fprintf(stderr, + gettext("invalid percentage value\n")); + usage(B_FALSE); + } + unit = ZPOOL_DDT_PRUNE_PERCENTAGE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (unit == ZPOOL_DDT_PRUNE_NONE) { + (void) fprintf(stderr, + gettext("missing amount option (-d|-p )\n")); + usage(B_FALSE); + } else if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } else if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + zhp = zpool_open(g_zfs, argv[0]); + if (zhp == NULL) + return (-1); + + int error = zpool_ddt_prune(zhp, unit, amount); + + zpool_close(zhp); + + return (error); +} + static int find_command_idx(const char *command, int *idx) { diff --git a/cmd/ztest.c b/cmd/ztest.c index 7c9db84d4ea4..a7843d338834 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -276,6 +276,8 @@ extern unsigned long 
zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; +extern boolean_t ddt_prune_artificial_age; +extern boolean_t ddt_dump_prune_histogram; static ztest_shared_opts_t *ztest_shared_opts; @@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; +ztest_func_t ztest_ddt_prune; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), + ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_vdev_lock); } +void +ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + + spa_t *spa = ztest_spa; + uint64_t pct = ztest_random(15) + 1; + + (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); +} + /* * Verify pool integrity by running zdb. 
*/ @@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg) { spa_t *spa = arg; + /* + * Synthesize aged DDT entries for ddt prune testing + */ + ddt_prune_artificial_age = B_TRUE; + if (ztest_opts.zo_verbose >= 3) + ddt_dump_prune_histogram = B_TRUE; + while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); @@ -8587,6 +8609,12 @@ ztest_init(ztest_shared_t *zs) if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; + /* + * split 50/50 between legacy and fast dedup + */ + if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) + continue; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 10083351abb5..d51e4ef003e6 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 +usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 diff --git a/include/libzfs.h b/include/libzfs.h index 2412797541de..01d51999f4eb 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, + uint64_t); + _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 206e5e5c2bf6..b1d74fbbc8f5 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int 
lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, + uint64_t); + #ifdef __cplusplus } #endif diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 93abad85af44..4e5ccd46318e 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); +extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount); + #ifdef __cplusplus } #endif diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 6f11cd90c1d8..4d3c0cae072e 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -35,8 +35,11 @@ extern "C" { #endif /* DDT version numbers */ -#define DDT_VERSION_LEGACY (0) -#define DDT_VERSION_FDT (1) +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) /* Names of interesting objects in the DDT root dir */ #define DDT_DIR_VERSION "version" @@ -187,8 +190,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe); -extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, - const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk); extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); @@ -211,6 +217,44 @@ extern void ddt_log_fini(void); * them up. */ +/* + * We use a histogram to convert a percentage request into a + * cutoff value where entries older than the cutoff get pruned. + * + * The histogram bins represent hours in power-of-two increments. 
+ * 16 bins covers up to four years. + */ +#define HIST_BINS 16 + +typedef struct ddt_age_histo { + uint64_t dah_entries; + uint64_t dah_age_histo[HIST_BINS]; +} ddt_age_histo_t; + +void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram); + +#if defined(_KERNEL) || !defined(ZFS_DEBUG) +#define ddt_dump_age_histogram(histo, cutoff) ((void)0) +#else +static inline void +ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff) +{ + if (histogram->dah_entries == 0) + return; + + (void) printf("DDT prune unique class age, %llu hour cutoff\n", + (u_longlong_t)(gethrestime_sec() - cutoff)/3600); + (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9llu %4d%%\n", 1<<i, + (u_longlong_t)histogram->dah_age_histo[i], + (int)((histogram->dah_age_histo[i] * 100) / + histogram->dah_entries)); + } +} +#endif + /* * Enough room to expand DMU_POOL_DDT format for all possible DDT * checksum/class/type combinations. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 73d686a002ee..fc4f22cd5304 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1422,7 +1422,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 88/128 numbers reserved. + * Core features - 89/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1519,6 +1519,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ + ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. @@ -1655,6 +1656,12 @@ typedef enum { ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; +typedef enum { + ZPOOL_DDT_PRUNE_NONE, + ZPOOL_DDT_PRUNE_AGE, /* in seconds */ + ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ +} zpool_ddt_prune_unit_t; + /* * Bookmark name values. 
*/ @@ -1753,6 +1760,12 @@ typedef enum { */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" +/* + * The following are names used when invoking ZFS_IOC_DDT_PRUNE. + */ +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 4fc6f22fcb50..7811abbb9ce3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -412,6 +412,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + boolean_t spa_active_ddt_prune; /* ddt prune process active */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 87c5c4380be3..88dd8b3c679d 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -183,8 +183,8 @@ - + @@ -466,7 +466,9 @@ + + @@ -485,8 +487,8 @@ - + @@ -529,7 +531,6 @@ - @@ -5929,6 +5930,7 @@ + @@ -5963,6 +5965,13 @@ + + + + + + + @@ -6139,6 +6148,12 @@ + + + + + + @@ -6798,6 +6813,12 @@ + + + + + + @@ -7837,7 +7858,7 @@ - + @@ -7856,6 +7877,9 @@ + + + @@ -7865,6 +7889,15 @@ + + + + + + + + + @@ -7968,6 +8001,11 @@ + + + + + @@ -8075,6 +8113,11 @@ + + + + + @@ -8093,6 +8136,11 @@ + + + + + @@ -8292,12 +8340,12 @@ - - - + + + @@ -8802,11 +8850,6 @@ - - - - - diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index dfa7c4db6881..14410b153130 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5649,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, return (ret); } + +/* + * Prune older entries from the DDT to reclaim space under the quota + */ +int +zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + int error = lzc_ddt_prune(zhp->zpool_name, unit, 
amount); + if (error != 0) { + libzfs_handle_t *hdl = zhp->zpool_hdl; + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot prune dedup table on '%s'"), zhp->zpool_name); + + if (error == EALREADY) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a prune operation is already in progress")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + return (-1); + } + + return (0); +} diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 1062a6b52dff..5ee6b8e09d6d 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -162,6 +162,7 @@ + @@ -1444,6 +1445,7 @@ + @@ -1484,6 +1486,13 @@ + + + + + + + @@ -3015,6 +3024,12 @@ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index ec8b0ff4f61c..d07fca6cebad 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl) { return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); } + +/* + * Prune the specified amount from the pool's dedup table. 
+ */ +int +lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit); + fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount); + + error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} diff --git a/man/Makefile.am b/man/Makefile.am index 194bb4721619..fde704933764 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -72,6 +72,7 @@ dist_man_MANS = \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ + %D%/man8/zpool-ddtprune.8 \ %D%/man8/zpool-events.8 \ %D%/man8/zpool-export.8 \ %D%/man8/zpool-get.8 \ diff --git a/man/man8/zpool-ddtprune.8 b/man/man8/zpool-ddtprune.8 new file mode 100644 index 000000000000..1ab7d3982c3e --- /dev/null +++ b/man/man8/zpool-ddtprune.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara Inc. +.\" +.Dd June 17, 2024 +.Dt ZPOOL-DDTPRUNE 8 +.Os +. 
+.Sh NAME +.Nm zpool-ddtprune +.Nd Prunes the oldest entries from the single reference dedup table(s) +.Sh SYNOPSIS +.Nm zpool +.Cm ddtprune +.Fl d Ar days | Fl p Ar percentage +.Ar pool +.Sh DESCRIPTION +This command prunes older unique entries from the dedup table. +As a complement to the dedup quota feature, +.Sy ddtprune +allows removal of older non-duplicate entries to make room for +newer duplicate entries. +.Pp +The amount to prune can be based on a target percentage of the unique entries +or based on the age (i.e., every unique entry older than N days). +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index c55644d9ecea..02a258f66708 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -592,6 +592,7 @@ don't wait. .Xr zpool-checkpoint 8 , .Xr zpool-clear 8 , .Xr zpool-create 8 , +.Xr zpool-ddtprune 8 , .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-events 8 , diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 11fd10fb769d..0e12e7e49828 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,13 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup table pruning + * + * As a complement to the dedup quota feature, ddtprune allows removal of older + * non-duplicate entries to make room for newer duplicate entries. The amount + * to prune can be based on a target percentage of the unique entries or based + * on the age (i.e., prune every unique entry older than N days). + * * ## Dedup log * * Historically, all entries modified on a txg were written back to dedup @@ -228,6 +235,19 @@ int zfs_dedup_prefetch = 0; */ uint_t dedup_class_wait_txgs = 5; +/* + * How many DDT prune entries to add to the DDT sync AVL tree. + * Note these additional entries have a memory footprint of a + * ddt_entry_t (216 bytes). 
+ */ +static uint32_t zfs_ddt_prunes_per_txg = 50000; + +/* + * For testing, synthesize aged DDT entries + * (in global scope for ztest) + */ +boolean_t ddt_prune_artificial_age = B_FALSE; +boolean_t ddt_dump_prune_histogram = B_FALSE; /* * Don't do more than this many incremental flush passes per txg. @@ -268,10 +288,6 @@ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; -/* Dummy version to signal that configure is still necessary */ -#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) - -#ifdef _KERNEL /* per-DDT kstats */ typedef struct { /* total lookups and whether they returned new or existing entries */ @@ -324,6 +340,7 @@ static const ddt_kstats_t ddt_kstats_template = { { "log_flush_time_rate", KSTAT_DATA_UINT32 }, }; +#ifdef _KERNEL #define _DDT_KSTAT_STAT(ddt, stat) \ &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 #define DDT_KSTAT_BUMP(ddt, stat) \ @@ -343,6 +360,7 @@ static const ddt_kstats_t ddt_kstats_template = { #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) #endif /* _KERNEL */ + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -715,6 +733,30 @@ ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } +static uint64_t +ddt_class_start(void) +{ + uint64_t start = gethrestime_sec(); + + if (ddt_prune_artificial_age) { + /* + * debug aide -- simulate a wider distribution + * so we don't have to wait for an aged DDT + * to test prune. 
+ */ + int range = 1 << 21; + int percent = random_in_range(100); + if (percent < 50) { + range = range >> 4; + } else if (percent > 75) { + range /= 2; + } + start -= random_in_range(range); + } + + return (start); +} + void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { @@ -1022,6 +1064,47 @@ ddt_prefetch_all(spa_t *spa) static int ddt_configure(ddt_t *ddt, boolean_t new); +/* + * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them + * to the ones in the entry. If they're different, then the passed-in BP is + * from a previous generation of this entry (ie was previously pruned) and we + * have to act like the entry doesn't exist at all. + * + * This should only happen during a lookup to free the block (zio_ddt_free()). + * + * XXX this is similar in spirit to ddt_phys_select(), maybe can combine + * -- robn, 2024-02-09 + */ +static boolean_t +ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) +{ + /* If the BP has no DVAs, then this entry is good */ + uint_t ndvas = BP_GET_NDVAS(bp); + if (ndvas == 0) + return (B_TRUE); + + /* + * Only checking the phys for the copies. For flat, there's only one; + * for trad it'll be the one that has the matching set of DVAs. + */ + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + dde->dde_phys->ddp_flat.ddp_dva : + dde->dde_phys->ddp_trad[ndvas].ddp_dva; + + /* + * Compare entry DVAs with the BP. They should all be there, but + * there's not really anything we can do if its only partial anyway, + * that's an error somewhere else, maybe long ago. + */ + uint_t d; + for (d = 0; d < ndvas; d++) + if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) + return (B_FALSE); + ASSERT3U(d, ==, ndvas); + + return (B_TRUE); +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { @@ -1057,8 +1140,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* If it's already loaded, we can just return it. 
*/ DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); - if (dde->dde_flags & DDE_FLAG_LOADED) - return (dde); + if (dde->dde_flags & DDE_FLAG_LOADED) { + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); + } /* Someone else is loading it, wait for it. */ dde->dde_waiters++; @@ -1077,7 +1163,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) } DDT_KSTAT_BUMP(ddt, dds_lookup_existing); - return (dde); + + /* Make sure the loaded entry matches the BP */ + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); } else DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); @@ -1086,32 +1176,42 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Record the time this class was created (used by ddt prune) */ if (ddt->ddt_flags & DDT_FLAG_FLAT) - dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); avl_insert(&ddt->ddt_tree, dde, where); /* If its in the log tree, we can "load" it from there */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; - boolean_t found = B_FALSE; - - if (ddt_log_take_key(ddt, ddt->ddt_log_active, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); - found = B_TRUE; - } else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); - found = B_TRUE; - } - - if (found) { - dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + if (ddt_log_find_key(ddt, &search, &ddlwe)) { + /* + * See if we have the key first, and if so, set up + * the entry. 
+ */ dde->dde_type = ddlwe.ddlwe_type; dde->dde_class = ddlwe.ddlwe_class; memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); + /* Whatever we found isn't valid for this BP, eject */ + if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + + /* Remove it and count it */ + if (ddt_log_remove_key(ddt, + ddt->ddt_log_active, &search)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); + } else { + VERIFY(ddt_log_remove_key(ddt, + ddt->ddt_log_flushing, &search)); + DDT_KSTAT_BUMP(ddt, + dds_lookup_log_flushing_hit); + } + + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); @@ -1150,6 +1250,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + boolean_t valid = B_TRUE; + if (dde->dde_type == DDT_TYPES && dde->dde_class == DDT_CLASSES && ddt_over_quota(spa)) { @@ -1163,6 +1265,24 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { + /* + * If what we loaded is no good for this BP and there's no one + * waiting for it, we can just remove it and get out. If its no + * good but there are waiters, we have to leave it, because we + * don't know what they want. If its not needed we'll end up + * taking an entry log/sync, but it can only happen if more + * than one previous version of this block is being deleted at + * the same time. This is extremely unlikely to happen and not + * worth the effort to deal with without taking an entry + * update. 
+ */ + valid = ddt_entry_lookup_is_valid(ddt, bp, dde); + if (!valid && dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); @@ -1191,7 +1311,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); - return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde); + if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) + return (NULL); + + return (dde); } void @@ -1420,7 +1543,6 @@ ddt_configure(ddt_t *ddt, boolean_t new) static void ddt_table_alloc_kstats(ddt_t *ddt) { -#ifdef _KERNEL char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); char *name = kmem_asprintf("ddt_stats_%s", zio_checksum_table[ddt->ddt_checksum].ci_name); @@ -1436,9 +1558,6 @@ ddt_table_alloc_kstats(ddt_t *ddt) kmem_strfree(name); kmem_strfree(mod); -#else - (void) ddt; -#endif /* _KERNEL */ } static ddt_t * @@ -1468,13 +1587,11 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { -#ifdef _KERNEL if (ddt->ddt_ksp != NULL) { kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = NULL; kstat_delete(ddt->ddt_ksp); } -#endif /* _KERNEL */ ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); @@ -1814,7 +1931,7 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT3U(phys_refcnt, ==, 0); + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { @@ -2288,8 +2405,9 @@ ddt_walk_ready(spa_t *spa) return (B_TRUE); } -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +static int +ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, + uint64_t flags, boolean_t wait) { do { do { @@ -2298,7 +2416,11 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) if 
(ddt == NULL) continue; - if (ddt->ddt_flush_force_txg > 0) + if (flags != 0 && + (ddt->ddt_flags & flags) != flags) + continue; + + if (wait && ddt->ddt_flush_force_txg > 0) return (EAGAIN); int error = ENOENT; @@ -2322,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) return (SET_ERROR(ENOENT)); } +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +{ + return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); +} + /* * This function is used by Block Cloning (brt.c) to increase reference * counter for the DDT entry if the block is already in DDT. * * Return false if the block, despite having the D bit set, is not present - * in the DDT. Currently this is not possible but might be in the future. - * See the comment below. + * in the DDT. This is possible when the DDT has been pruned by an admin + * or by the DDT quota mechanism. */ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) @@ -2359,28 +2487,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - /* - * This entry already existed (dde_type is real), so it must - * have refcnt >0 at the start of this txg. We are called from - * brt_pending_apply(), before frees are issued, so the refcnt - * can't be lowered yet. Therefore, it must be >0. We assert - * this because if the order of BRT and DDT interactions were - * ever to change and the refcnt was ever zero here, then - * likely further action is required to fill out the DDT entry, - * and this is a place that is likely to be missed in testing. - */ - ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* - * At the time of implementating this if the block has the - * DEDUP flag set it must exist in the DEDUP table, but - * there are many advocates that want ability to remove - * entries from DDT with refcnt=1. 
If this will happen, - * we may have a block with the DEDUP set, but which doesn't - * have a corresponding entry in the DDT. Be ready. + * If the block has the DEDUP flag set it still might not + * exist in the DEDUP table due to DDT pruning of entries + * where refcnt=1. */ ddt_remove(ddt, dde); result = B_FALSE; @@ -2392,6 +2505,261 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (result); } +typedef struct ddt_prune_entry { + ddt_t *dpe_ddt; + ddt_key_t dpe_key; + list_node_t dpe_node; + ddt_univ_phys_t dpe_phys[]; +} ddt_prune_entry_t; + +typedef struct ddt_prune_info { + spa_t *dpi_spa; + uint64_t dpi_txg_syncs; + uint64_t dpi_pruned; + list_t dpi_candidates; +} ddt_prune_info_t; + +/* + * Add prune candidates for ddt_sync during spa_sync + */ +static void +prune_candidates_sync(void *arg, dmu_tx_t *tx) +{ + (void) tx; + ddt_prune_info_t *dpi = arg; + ddt_prune_entry_t *dpe; + + spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); + + /* Process the prune candidates collected so far */ + while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { + blkptr_t blk; + ddt_t *ddt = dpe->dpe_ddt; + + ddt_enter(ddt); + + /* + * If it's on the live list, then it was loaded for update + * this txg and is no longer stale; skip it. + */ + if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + continue; + } + + ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, + dpe->dpe_phys, DDT_PHYS_FLAT, &blk); + + ddt_entry_t *dde = ddt_lookup(ddt, &blk); + if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Zero the physical, so we don't try to free DVAs + * at flush nor try to reuse this entry. 
+ */ + ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); + + dpi->dpi_pruned++; + } + + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + } + + spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); + dpi->dpi_txg_syncs++; +} + +/* + * Prune candidates are collected in open context and processed + * in sync context as part of ddt_sync_table(). + */ +static void +ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp) +{ + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + + size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; + ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); + + dpe->dpe_ddt = ddt; + dpe->dpe_key = *ddk; + memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); + list_insert_head(list, dpe); +} + +/* + * Iterate over all the entries in the DDT unique class. + * The walk will perform one of the following operations: + * (a) build a histogram that can be used when pruning + * (b) prune entries older than the cutoff + * + * Also called by zdb(8) to dump the age histogram + */ +void +ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) +{ + ddt_bookmark_t ddb = { + .ddb_class = DDT_CLASS_UNIQUE, + .ddb_type = 0, + .ddb_checksum = 0, + .ddb_cursor = 0 + }; + ddt_lightweight_entry_t ddlwe = {0}; + int error; + int total = 0, valid = 0; + int candidates = 0; + uint64_t now = gethrestime_sec(); + ddt_prune_info_t dpi; + boolean_t pruning = (cutoff != 0); + + if (pruning) { + dpi.dpi_txg_syncs = 0; + dpi.dpi_pruned = 0; + dpi.dpi_spa = spa; + list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), + offsetof(ddt_prune_entry_t, dpe_node)); + } + + if (histogram != NULL) + memset(histogram, 0, sizeof (ddt_age_histo_t)); + + while ((error = + ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); + + if (spa_shutting_down(spa) || issig()) + break; + total++; + + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); +
ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); + + uint64_t class_start = + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; + + /* + * If this entry is on the log, then the stored entry is stale + * and we should skip it. + */ + if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) + continue; + + /* prune older entries */ + if (pruning && class_start < cutoff) { + if (candidates++ >= zfs_ddt_prunes_per_txg) { + /* sync prune candidates in batches */ + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, prune_candidates_sync, + &dpi, 0, ZFS_SPACE_CHECK_NONE)); + candidates = 1; + } + ddt_prune_entry(&dpi.dpi_candidates, ddt, + &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); + } + + /* build a histogram */ + if (histogram != NULL) { + uint64_t age = MAX(1, (now - class_start) / 3600); + int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); + histogram->dah_entries++; + histogram->dah_age_histo[bin]++; + } + + valid++; + } + + if (pruning && valid > 0) { + if (!list_is_empty(&dpi.dpi_candidates)) { + /* sync out final batch of prune candidates */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + prune_candidates_sync, &dpi, 0, + ZFS_SPACE_CHECK_NONE)); + } + list_destroy(&dpi.dpi_candidates); + + zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", + (u_longlong_t)dpi.dpi_pruned, + (int)((dpi.dpi_pruned * 100) / valid), + (u_longlong_t)dpi.dpi_txg_syncs); + } +} + +static uint64_t +ddt_total_entries(spa_t *spa) +{ + ddt_object_t ddo; + ddt_get_dedup_object_stats(spa, &ddo); + + return (ddo.ddo_count); +} + +int +ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + uint64_t cutoff; + uint64_t start_time = gethrtime(); + + if (spa->spa_active_ddt_prune) + return (SET_ERROR(EALREADY)); + if (ddt_total_entries(spa) == 0) + return (0); + + spa->spa_active_ddt_prune = B_TRUE; + + zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, + unit == ZPOOL_DDT_PRUNE_PERCENTAGE ?
"%" : "seconds old or older"); + + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + ddt_age_histo_t histogram; + uint64_t oldest = 0; + + /* Make a pass over DDT to build a histogram */ + ddt_prune_walk(spa, 0, &histogram); + + int target = (histogram.dah_entries * amount) / 100; + + /* + * Figure out our cutoff date + * (i.e., which bins to prune from) + */ + for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { + if (histogram.dah_age_histo[i] != 0) { + /* less than this bucket remaining */ + if (target < histogram.dah_age_histo[i]) { + oldest = MAX(1, (1<<i) * 3600); + target = 0; + } else { + target -= histogram.dah_age_histo[i]; + } + } + } + cutoff = gethrestime_sec() - oldest; + } else { + cutoff = gethrestime_sec() - amount; + } + + if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) { + /* Traverse DDT to prune entries older than our cutoff */ + ddt_prune_walk(spa, cutoff, NULL); + } + + zfs_dbgmsg("%s: prune completed in %llu ms", + spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + + spa->spa_active_ddt_prune = B_FALSE; + return (0); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index a367d0cd02f8..3aa07dc25b91 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -353,16 +353,15 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) } boolean_t -ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, - ddt_lightweight_entry_t *ddlwe) +ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) { ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); if (ddle == NULL) return (B_FALSE); - DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); - - ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
@@ -371,6 +370,21 @@ ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, return (B_TRUE); } +boolean_t +ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = + avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); + if (!ddle) + ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); + if (!ddle) + return (B_FALSE); + if (ddlwe) + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + return (B_TRUE); +} + void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 7ce2d919610f..55bf9b683f1a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4342,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + +/* + * innvl: { + * "ddt_prune_unit" -> int32_t + * "ddt_prune_amount" -> uint64_t + * } + * + * outnvl is unused + */ +static const zfs_ioc_key_t zfs_keys_ddt_prune[] = { + {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0}, + {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t unit; + uint64_t amount; + + if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 || + nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) { + return (EINVAL); + } + + spa_t *spa; + int error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit, + amount); + + spa_close(spa, FTAG); + + return (error); +} + /* * This ioctl waits for activity of a particular type to complete.
If there is * no activity of that type in progress, it returns immediately, and the @@ -7430,6 +7475,11 @@ zfs_ioctl_init(void) POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE, + zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a841e0a79107..e4ccd144f091 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3859,6 +3859,16 @@ zio_ddt_free(zio_t *zio) } ddt_exit(ddt); + /* + * When no entry was found, it must have been pruned, + * so we can free it now instead of decrementing the + * refcount in the DDT. + */ + if (!dde) { + BP_SET_DEDUP(bp, 0); + zio->io_pipeline |= ZIO_STAGE_DVA_FREE; + } + return (zio); } From 82ff9aafd687d4eebb6041c99fa822e0478a2024 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 29 Feb 2024 11:25:24 +1100 Subject: [PATCH 04/11] value strings: pretty printers for flags and enums This adds zfs_valstr, a collection of pretty printers for bitfields and enums. These are useful in debugging, logging and other display contexts where raw values are difficult for the untrained (or even trained!) eye to decipher. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- include/Makefile.am | 1 + include/sys/zio.h | 3 + include/sys/zio_impl.h | 3 + include/sys/zio_priority.h | 4 + include/zfs_valstr.h | 84 +++++++++++ lib/libzfs/Makefile.am | 1 + lib/libzfs/libzfs.abi | 51 +++++++ lib/libzpool/Makefile.am | 1 + module/Kbuild.in | 1 + module/Makefile.bsd | 1 + module/zcommon/zfs_valstr.c | 277 ++++++++++++++++++++++++++++++++++++ 11 files changed, 427 insertions(+) create mode 100644 include/zfs_valstr.h create mode 100644 module/zcommon/zfs_valstr.c diff --git a/include/Makefile.am b/include/Makefile.am index fa725c2e7a5f..f173064efc99 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -14,6 +14,7 @@ COMMON_H = \ zfs_fletcher.h \ zfs_namecheck.h \ zfs_prop.h \ + zfs_valstr.h \ \ sys/abd.h \ sys/abd_impl.h \ diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8ab..3a756949a422 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -167,6 +167,9 @@ typedef enum zio_suspend_reason { * This was originally an enum type. However, those are 32-bit and there is no * way to make a 64-bit enum type. Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ typedef uint64_t zio_flag_t; /* diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2b026d48675a..2c846a5d41f6 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -120,6 +120,9 @@ extern "C" { /* * zio pipeline stage definitions + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ enum zio_stage { ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 2d8e7fc36bae..bdf5f9b8ff35 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -22,6 +22,10 @@ extern "C" { #endif +/* + * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * VALUE. 
+ */ typedef enum zio_priority { ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ diff --git a/include/zfs_valstr.h b/include/zfs_valstr.h new file mode 100644 index 000000000000..77c26ce1ae7d --- /dev/null +++ b/include/zfs_valstr.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. + */ + +#ifndef _ZFS_VALSTR_H +#define _ZFS_VALSTR_H extern __attribute__((visibility("default"))) + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * These macros create function prototypes for pretty-printing or stringifying + * certain kinds of numeric types. 
+ * + * _ZFS_VALSTR_DECLARE_BITFIELD(name) creates: + * + * size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen); + * expands single char for each set bit, and space for each clear bit + * + * size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen); + * expands two-char mnemonic for each bit set in `bits`, separated by `|` + * + * size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen); + * expands full name of each bit set in `bits`, separated by spaces + * + * _ZFS_VALSTR_DECLARE_ENUM(name) creates: + * + * size_t zfs_valstr_<name>(int v, char *out, size_t outlen); + * expands full name of enum value + * + * Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string + * table in zfs_valstr.c. + */ + +#define _ZFS_VALSTR_DECLARE_BITFIELD(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + uint64_t bits, char *out, size_t outlen); \ + +#define _ZFS_VALSTR_DECLARE_ENUM(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + int v, char *out, size_t outlen); \ + +_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag) +_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage) + +_ZFS_VALSTR_DECLARE_ENUM(zio_priority) + +#undef _ZFS_VALSTR_DECLARE_BITFIELD +#undef _ZFS_VALSTR_DECLARE_ENUM + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_VALSTR_H */ diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 5e74d908de3d..a976faaf9913 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 88dd8b3c679d..51b29643ee0c 100644 ---
a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -454,6 +454,13 @@ + + + + + + + @@ -9831,6 +9838,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 81949bf9e5b8..ff30af7d2b9f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -64,6 +64,7 @@ nodist_libzpool_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c \ \ diff --git a/module/Kbuild.in b/module/Kbuild.in index a119198dbfc0..0472a9348c13 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -240,6 +240,7 @@ ZCOMMON_OBJS := \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ + zfs_valstr.o \ zpool_prop.o \ zprop_common.o diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 534f3257132a..9161204c99d3 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -233,6 +233,7 @@ SRCS+= cityhash.c \ zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ + zfs_valstr.c \ zpool_prop.c \ zprop_common.c diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c new file mode 100644 index 000000000000..e2d4d1aefefb --- /dev/null +++ b/module/zcommon/zfs_valstr.c @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. + */ + +#include +#include +#include +#include +#include +#include "zfs_valstr.h" + +/* + * Each bit in a bitfield has three possible string representations: + * - single char + * - two-char pair + * - full name + */ +typedef struct { + const char vb_bit; + const char vb_pair[2]; + const char *vb_name; +} valstr_bit_t; + +/* + * Emits a character for each bit in `bits`, up to the number of elements + * in the table. Set bits get the character in vb_bit, clear bits get a + * space. This results in all strings having the same width, for easier + * visual comparison. + */ +static size_t +valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + out[n++] = (bits & mask) ? table[b].vb_bit : ' '; + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and + * separated by a `|` character. This gives a concise representation of the + * whole value. + */ +static size_t +valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = (n > 0) ? 
3 : 2; + if (n > outlen-len) + break; + if (n > 0) + out[n++] = '|'; + out[n++] = table[b].vb_pair[0]; + out[n++] = table[b].vb_pair[1]; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the full name for each bit set in `bits`, taken from vb_name, and + * separated by a space. This unambiguously shows the entire set of bits, but + * can get very long. + */ +static size_t +valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = strlen(table[b].vb_name); + if (n > 0) + len++; + if (n > outlen-len) + break; + if (n > 0) { + out[n++] = ' '; + len--; + } + memcpy(&out[n], table[b].vb_name, len); + n += len; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the name of the given enum value in the table. + */ +static size_t +valstr_enum_str(const char **table, const size_t nelems, + int v, char *out, size_t outlen) +{ + ASSERT(out); + ASSERT3U(v, <, nelems); + if (v >= nelems) + return (0); + return (MIN(strlcpy(out, table[v], outlen), outlen)); +} + +/* + * These macros create the string tables for the given name, and implement + * the public functions described in zfs_valstr.h. + */ +#define _VALSTR_BITFIELD_IMPL(name, ...) 
\ +static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\ +size_t \ +zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_bits(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_pairs(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + +#define _VALSTR_ENUM_IMPL(name, ...) \ +static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \ +size_t \ +zfs_valstr_ ## name(int v, char *out, size_t outlen) \ +{ \ + return (valstr_enum_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \ +} \ + + +/* String tables */ + +/* ZIO flags: zio_flag_t, typically zio->io_flags */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_flag, + { '.', "DA", "DONT_AGGREGATE" }, + { '.', "RP", "IO_REPAIR" }, + { '.', "SH", "SELF_HEAL" }, + { '.', "RS", "RESILVER" }, + { '.', "SC", "SCRUB" }, + { '.', "ST", "SCAN_THREAD" }, + { '.', "PH", "PHYSICAL" }, + { '.', "CF", "CANFAIL" }, + { '.', "SP", "SPECULATIVE" }, + { '.', "CW", "CONFIG_WRITER" }, + { '.', "DR", "DONT_RETRY" }, + { '?', "??", "[UNUSED 11]" }, + { '.', "ND", "NODATA" }, + { '.', "ID", "INDUCE_DAMAGE" }, + { '.', "AL", "IO_ALLOCATING" }, + { '.', "RE", "IO_RETRY" }, + { '.', "PR", "PROBE" }, + { '.', "TH", "TRYHARD" }, + { '.', "OP", "OPTIONAL" }, + { '.', "DQ", "DONT_QUEUE" }, + { '.', "DP", "DONT_PROPAGATE" }, + { '.', "BY", "IO_BYPASS" }, + { '.', "RW", "IO_REWRITE" }, + { '.', "CM", "RAW_COMPRESS" }, + { '.', "EN", "RAW_ENCRYPT" }, + { '.', "GG", "GANG_CHILD" }, + { '.', "DD", "DDT_CHILD" }, + { 
'.', "GF", "GODFATHER" }, + { '.', "NP", "NOPWRITE" }, + { '.', "EX", "REEXECUTED" }, + { '.', "DG", "DELEGATED" }, +) +/* END CSTYLED */ + +/* + * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or + * zio->io_pipeline. + */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_stage, + { 'O', "O ", "OPEN" }, + { 'I', "RI", "READ_BP_INIT" }, + { 'I', "WI", "WRITE_BP_INIT" }, + { 'I', "FI", "FREE_BP_INIT" }, + { 'A', "IA", "ISSUE_ASYNC" }, + { 'W', "WC", "WRITE_COMPRESS" }, + { 'E', "EN", "ENCRYPT" }, + { 'C', "CG", "CHECKSUM_GENERATE" }, + { 'N', "NW", "NOP_WRITE" }, + { 'B', "BF", "BRT_FREE" }, + { 'd', "dS", "DDT_READ_START" }, + { 'd', "dD", "DDT_READ_DONE" }, + { 'd', "dW", "DDT_WRITE" }, + { 'd', "dF", "DDT_FREE" }, + { 'G', "GA", "GANG_ASSEMBLE" }, + { 'G', "GI", "GANG_ISSUE" }, + { 'D', "DT", "DVA_THROTTLE" }, + { 'D', "DA", "DVA_ALLOCATE" }, + { 'D', "DF", "DVA_FREE" }, + { 'D', "DC", "DVA_CLAIM" }, + { 'R', "R ", "READY" }, + { 'V', "VS", "VDEV_IO_START" }, + { 'V', "VD", "VDEV_IO_DONE" }, + { 'V', "VA", "VDEV_IO_ASSESS" }, + { 'C', "CV", "CHECKSUM_VERIFY" }, + { 'X', "X ", "DONE" }, +) +/* END CSTYLED */ + +/* ZIO priority: zio_priority_t, typically zio->io_priority */ +/* BEGIN CSTYLED */ +_VALSTR_ENUM_IMPL(zio_priority, + "SYNC_READ", + "SYNC_WRITE", + "ASYNC_READ", + "ASYNC_WRITE", + "SCRUB", + "REMOVAL", + "INITIALIZING", + "TRIM", + "REBUILD", + "[NUM_QUEUEABLE]", + "NOW", +) +/* END CSTYLED */ + +#undef _VALSTR_BITFIELD_IMPL +#undef _VALSTR_ENUM_IMPL From 17dd66dedab9f9bebc823cca3eae3405ef28c7ef Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 29 Feb 2024 15:00:25 +1100 Subject: [PATCH 05/11] zpool events: expand value strings for ZIO error values Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- cmd/zpool/zpool_main.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ce859226c215..349c208c521b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -75,6 +75,7 @@ #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" +#include "zfs_valstr.h" #include "statcommon.h" @@ -11936,6 +11937,7 @@ static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; + static char flagstr[256]; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { @@ -11995,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); - printf(gettext("0x%x"), i32); + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) { + zfs_valstr_zio_stage(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) { + zfs_valstr_zio_priority(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else { + printf(gettext("0x%x"), i32); + } break; case DATA_TYPE_INT64: @@ -12016,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) { + zfs_valstr_zio_flag(i64, flagstr, + sizeof (flagstr)); + printf(gettext("0x%llx [%s]"), + (u_longlong_t)i64, flagstr); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } From b109925820fb79db3e37670c159977f03edd950f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 7 Sep 2024 01:45:58 +1000 Subject: [PATCH 06/11] spa_prop_get: require caller to supply output nvlist All callers to spa_prop_get() and spa_prop_get_nvlist() supplied 
their own preallocated nvlist (except ztest), so we can remove the option to have them allocate one if none is supplied. This sidesteps a bug in spa_prop_get(), where the error var wasn't initialised, which could lead to the provided nvlist being freed at the end. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Jorgen Lundman Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16505 --- cmd/ztest.c | 5 ++- include/sys/spa.h | 4 +- module/zfs/spa.c | 100 ++++++++++++++++++----------------------- module/zfs/zfs_ioctl.c | 10 ++--- 4 files changed, 53 insertions(+), 66 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index a7843d338834..ce031632e758 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6215,13 +6215,14 @@ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; - nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); - VERIFY0(spa_prop_get(ztest_spa, &props)); + nvlist_t *props = fnvlist_alloc(); + + VERIFY0(spa_prop_get(ztest_spa, props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); diff --git a/include/sys/spa.h b/include/sys/spa.h index 93f381affd95..aa66d489ef1a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1201,9 +1201,9 @@ extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); -extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get_nvlist(spa_t *spa, char **props, - unsigned int n_props, nvlist_t **outnvl); + unsigned int n_props, nvlist_t *outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d51cc4fcd09a..1a68a0953565 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ 
-366,21 +366,15 @@ spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, - nvlist_t **outnvl) + nvlist_t *outnvl) { int err = 0; if (props == NULL) return (0); - if (*outnvl == NULL) { - err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); - } - for (unsigned int i = 0; i < n_props && err == 0; i++) { - err = spa_prop_add(spa, props[i], *outnvl); + err = spa_prop_add(spa, props[i], outnvl); } return (err); @@ -406,7 +400,7 @@ spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, * Get property values from the spa configuration. */ static void -spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +spa_prop_get_config(spa_t *spa, nvlist_t *nv) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; @@ -428,48 +422,48 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, size - alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); - 
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, brt_get_used(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, brt_get_saved(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, ddt_get_ddt_dsize(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } - spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, + spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } @@ -479,62 +473,62 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) * when opening pools before this version freedir will be NULL. 
*/ if (pool->dp_free_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 0, src); } } - spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, + spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) - spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, 
NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } @@ -544,19 +538,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) * Get zpool property values. */ int -spa_prop_get(spa_t *spa, nvlist_t **nvp) +spa_prop_get(spa_t *spa, nvlist_t *nv) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; - int err; - - if (*nvp == NULL) { - err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); - } + int err = 0; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); @@ -565,7 +553,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) /* * Get properties from the spa config. */ - spa_prop_get_config(spa, nvp); + spa_prop_get_config(spa, nv); /* If no pool property object, no more prop to get. 
*/ if (mos == NULL || spa->spa_pool_props_object == 0) @@ -610,7 +598,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) intval = za.za_first_integer; } - spa_prop_add_list(*nvp, prop, strval, intval, src); + spa_prop_add_list(nv, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); @@ -627,10 +615,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) break; } if (prop != ZPOOL_PROP_INVAL) { - spa_prop_add_list(*nvp, prop, strval, 0, src); + spa_prop_add_list(nv, prop, strval, 0, src); } else { src = ZPROP_SRC_LOCAL; - spa_prop_add_user(*nvp, za.za_name, strval, + spa_prop_add_user(nv, za.za_name, strval, src); } kmem_free(strval, za.za_num_integers); @@ -644,11 +632,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); - if (err && err != ENOENT) { - nvlist_free(*nvp); - *nvp = NULL; + + if (err && err != ENOENT) return (err); - } return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 55bf9b683f1a..53366ad49781 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3050,7 +3050,6 @@ static const zfs_ioc_key_t zfs_keys_get_props[] = { static int zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { - nvlist_t *nvp = outnvl; spa_t *spa; char **props = NULL; unsigned int n_props = 0; @@ -3069,16 +3068,17 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) != NULL) { - error = spa_prop_get(spa, &nvp); + error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, - &nvp); + outnvl); } mutex_exit(&spa_namespace_lock); } else { - error = spa_prop_get(spa, &nvp); + error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) - error = spa_prop_get_nvlist(spa, props, n_props, &nvp); + error = spa_prop_get_nvlist(spa, props, n_props, + outnvl); spa_close(spa, FTAG); } 
From 5c67820265c18e82c2d79c77ab09fc46904b58b8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 10 Sep 2024 07:13:27 +1000 Subject: [PATCH 07/11] libzstd: also build with LIBZPOOL_CPPFLAGS libzstd now also allocates its own abd_t, and so has the same issue as zstream did, so this applies the same workaround: compile it with ZFS_DEBUG. See 92fca1c2d. This looks weird, because libzstd doesn't appear to be related to the ZFS kernel, but there is already a cross-dependency there: zstd needs zfs_lz4_compress, and zfs needs zfs_zstd_compress (and others), so the two can never really be separated without more work. Another job for another time. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Mark Maybee Signed-off-by: Rob Norris Closes #16489 --- lib/libzstd/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am index 49bfb328a6f7..856175137906 100644 --- a/lib/libzstd/Makefile.am +++ b/lib/libzstd/Makefile.am @@ -1,4 +1,6 @@ libzstd_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libzstd_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) + # -fno-tree-vectorize is set for gcc in zstd/common/compiler.h # Set it for other compilers, too. libzstd_la_CFLAGS += -fno-tree-vectorize From 8be2f4c3d2c43b031fd568240beebae1b0bc7423 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 10 Sep 2024 10:21:20 +1000 Subject: [PATCH 08/11] zio_resume: log when unsuspending the pool (#16485) When reviewing logs after a failure, it's useful to see where unsuspend/resume was requested. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- module/zfs/zio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e4ccd144f091..53992931e049 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2553,7 +2553,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) if (reason != ZIO_SUSPEND_MMP) { cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " - "I/O failure and has been suspended.\n", spa_name(spa)); + "I/O failure and has been suspended.", spa_name(spa)); } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, @@ -2589,6 +2589,10 @@ zio_resume(spa_t *spa) * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); + if (spa->spa_suspended != ZIO_SUSPEND_NONE) + cmn_err(CE_WARN, "Pool '%s' was suspended and is being " + "resumed. Failed I/O will be retried.", + spa_name(spa)); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; From 88433e640ddbf390bcbed5ff79478f7ac985f161 Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Tue, 10 Sep 2024 01:37:12 +0100 Subject: [PATCH 09/11] sys/types32.h: Remove struct timeval32 from libspl's header (#16491) macOS Sequoia's sys/sockio.h, as included by various bootstrap tools whilst building FreeBSD, has started to include net/if.h, which then includes sys/_types/_timeval32.h and provide a conflicting definition for struct timeval32. Since this type is entirely unused within OpenZFS, simply delete the type rather than adding in some kind of OS detection. This fixes building FreeBSD on macOS Sequoia (Beta). 
Signed-off-by: Jessica Clarke Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- lib/libspl/include/sys/types32.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/libspl/include/sys/types32.h b/lib/libspl/include/sys/types32.h index eadc67c7122a..d065ebed03b7 100644 --- a/lib/libspl/include/sys/types32.h +++ b/lib/libspl/include/sys/types32.h @@ -65,11 +65,6 @@ typedef int32_t ssize32_t; typedef int32_t time32_t; typedef int32_t clock32_t; -struct timeval32 { - time32_t tv_sec; /* seconds */ - int32_t tv_usec; /* and microseconds */ -}; - typedef struct timespec32 { time32_t tv_sec; /* seconds */ int32_t tv_nsec; /* and nanoseconds */ From 63253dbf4f8611a657474fd2e065f960374bbc35 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 10 Sep 2024 10:49:14 +1000 Subject: [PATCH 10/11] zts-report: don't crash on non-UTF-8 chars in the log (#16497) The report generator expects the log to be clean and tidy UTF-8. That can be a problem if you use some of the verbose/debug test runner options, which send all sorts of weird output from arbitrary programs to the log. This just makes Python a little more relaxed about such things. It shouldn't matter in practice, as those lines didn't match the test result regex anyway, and are discarded immediately.
Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- tests/test-runner/bin/zts-report.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 1177e80e1a75..6db10b91de05 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -389,7 +389,7 @@ if os.environ.get('CI') == 'true': def process_results(pathname): try: - f = open(pathname) + f = open(pathname, errors='replace') except IOError as e: print('Error opening file:', e) sys.exit(1) From 927f6eae697ba327a98a5d061879ac0df98dafe3 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 6 Sep 2024 21:03:50 +0500 Subject: [PATCH 11/11] Revert "Revert "Make mount.zfs(8) calling zfs_mount_at for legacy mounts"" This reverts commit f91496453c6bb53fe04d21d9f0e5e59c54e80624. Signed-off-by: Umer Saleem --- cmd/mount_zfs.c | 5 ++--- module/os/linux/zfs/zfs_ctldir.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index fc9220950647..283074daf717 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,8 +269,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (!zfsutil || sloppy || - libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -337,7 +336,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (zfsutil && !sloppy && + if (!remount && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 54ed70d0394f..e042116333fb 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 
+1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", + NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; + argv[6] = full_name; + argv[7] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) {