From 3570f1f122ba3d8da1f7b889e675859331f72e6e Mon Sep 17 00:00:00 2001 From: Auke Kok Date: Wed, 25 Mar 2026 13:53:29 -0700 Subject: [PATCH] data_prealloc_blocks_min mount+sysfs option. Adds an accompanying option to set a data preallocation minimum threshold value. The value can be set through sysfs or at mount time. data_prealloc_blocks_min cannot be larger than data_prealloc_blocks, and this is enforced. This should be fine for all common use cases where the _min option is expected to be less than 2048, the default of data_prealloc_blocks. Extra test cases are added to validate bad mount option values and sysfs value writes, as well as tests that validate that the minimum threshold is set and honored as expected. Preallocation scales with scoutfs_inode_get_onoff() online values, so that new extents double the online size every allocation until it reaches data_prealloc_blocks. The _onoff() value is only fetched once if possible. Signed-off-by: Auke Kok --- kmod/src/data.c | 28 +++++++++++++++ kmod/src/options.c | 60 +++++++++++++++++++++++++++++++++ kmod/src/options.h | 1 + tests/funcs/filter.sh | 4 ++- tests/golden/basic-bad-mounts | 3 ++ tests/golden/data-prealloc | 15 +++++++++ tests/tests/basic-bad-mounts.sh | 9 +++++ tests/tests/data-prealloc.sh | 54 ++++++++++++++++++++++++++++- 8 files changed, 172 insertions(+), 2 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 0abb48c0c..747470516 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -384,6 +384,15 @@ static inline u64 ext_last(struct scoutfs_extent *ext) * This can waste a lot of space for small or sparse files but is * reasonable when a file population is known to be large and dense but * known to be written with non-streaming write patterns. + * + * In either strategy, a minimum block count threshold can be configured + * to suppress preallocation for small files and then ramp up + * preallocation proportionally with the file's online block count.
+ * Files below the threshold get single block allocations. Files above + * the threshold limit preallocation to their current online block count, + * regardless of write offset. This avoids overallocation for small + * files in mixed-size workloads while still allowing large files to + * benefit from full preallocation. */ static int alloc_block(struct super_block *sb, struct inode *inode, struct scoutfs_extent *ext, u64 iblock, @@ -400,6 +409,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode, struct scoutfs_extent found; struct scoutfs_extent pre = {0,}; bool undo_pre = false; + bool have_onoff = false; u64 blkno = 0; u64 online; u64 offline; @@ -445,6 +455,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode, * blocks. */ scoutfs_inode_get_onoff(inode, &online, &offline); + have_onoff = true; if (iblock > 1 && iblock == online) { ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found); @@ -491,6 +502,23 @@ static int alloc_block(struct super_block *sb, struct inode *inode, /* overall prealloc limit */ count = min_t(u64, count, opts.data_prealloc_blocks); + /* + * Limit preallocation based on the number of blocks already + * allocated in the file. Files with fewer online blocks than + * the configured minimum get no preallocation. Once past the + * minimum, preallocation ramps up proportionally with the + * file's online block count rather than jumping to the full + * prealloc size. 
+ */ + if (opts.data_prealloc_blocks_min > 0 && !ext->len) { + if (!have_onoff) + scoutfs_inode_get_onoff(inode, &online, &offline); + if (online < opts.data_prealloc_blocks_min) + count = 1; + else + count = min(count, online); + } + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, &datinf->dalloc, count, &blkno, &count); if (ret < 0) diff --git a/kmod/src/options.c b/kmod/src/options.c index b7565d76d..e244be9c9 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -32,6 +32,7 @@ enum { Opt_acl, Opt_data_prealloc_blocks, + Opt_data_prealloc_blocks_min, Opt_data_prealloc_contig_only, Opt_ino_alloc_per_lock, Opt_lock_idle_count, @@ -48,6 +49,7 @@ enum { static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"}, + {Opt_data_prealloc_blocks_min, "data_prealloc_blocks_min=%s"}, {Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"}, {Opt_ino_alloc_per_lock, "ino_alloc_per_lock=%s"}, {Opt_lock_idle_count, "lock_idle_count=%s"}, @@ -252,6 +254,18 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m opts->data_prealloc_blocks = nr64; break; + case Opt_data_prealloc_blocks_min: + ret = match_u64(args, &nr64); + if (ret < 0 || nr64 > MAX_DATA_PREALLOC_BLOCKS) { + scoutfs_err(sb, "invalid data_prealloc_blocks_min option, must be between 0 and %llu", + MAX_DATA_PREALLOC_BLOCKS); + if (ret == 0) + ret = -EINVAL; + return ret; + } + opts->data_prealloc_blocks_min = nr64; + break; + case Opt_data_prealloc_contig_only: ret = match_int(args, &nr); if (ret < 0 || nr < 0 || nr > 1) { @@ -366,6 +380,12 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m return -EINVAL; } + if (opts->data_prealloc_blocks_min > opts->data_prealloc_blocks) { + scoutfs_err(sb, "data_prealloc_blocks_min %llu must not exceed data_prealloc_blocks %llu", + opts->data_prealloc_blocks_min, opts->data_prealloc_blocks); + return -EINVAL; + } + return 0; } @@ -437,6 +457,7 @@ 
int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 	if (is_acl)
 		seq_puts(seq, ",acl");
 	seq_printf(seq, ",data_prealloc_blocks=%llu", opts.data_prealloc_blocks);
+	seq_printf(seq, ",data_prealloc_blocks_min=%llu", opts.data_prealloc_blocks_min);
 	seq_printf(seq, ",data_prealloc_contig_only=%u", opts.data_prealloc_contig_only);
 	seq_printf(seq, ",ino_alloc_per_lock=%u", opts.ino_alloc_per_lock);
 	seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
@@ -480,6 +501,11 @@ static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attr
 			    MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS);
 		return -EINVAL;
 	}
 
 	write_seqlock(&optinf->seqlock);
+	/* checked under the seqlock so it can't race with a blocks_min store */
+	if (val < optinf->opts.data_prealloc_blocks_min) {
+		write_sequnlock(&optinf->seqlock);
+		return -EINVAL;
+	}
 	optinf->opts.data_prealloc_blocks = val;
@@ -489,6 +515,57 @@ static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attr
 }
 SCOUTFS_ATTR_RW(data_prealloc_blocks);
 
+/*
+ * Show the minimum online block count a file needs before data
+ * preallocation is used.
+ */
+static ssize_t data_prealloc_blocks_min_show(struct kobject *kobj, struct kobj_attribute *attr,
+					     char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%llu", opts.data_prealloc_blocks_min);
+}
+
+/*
+ * Set data_prealloc_blocks_min.  The new value is compared against
+ * data_prealloc_blocks while holding the seqlock so that a concurrent
+ * store to data_prealloc_blocks can't leave min greater than blocks.
+ */
+static ssize_t data_prealloc_blocks_min_store(struct kobject *kobj, struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[30]; /* more than enough for octal U64_MAX */
+	u64 val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	/* val is unsigned, parse it as such rather than through kstrtoll() */
+	ret = kstrtoull(nullterm, 0, &val);
+	if (ret < 0)
+		return -EINVAL;
+
+	write_seqlock(&optinf->seqlock);
+	if (val > optinf->opts.data_prealloc_blocks) {
+		write_sequnlock(&optinf->seqlock);
+		return -EINVAL;
+	}
+	optinf->opts.data_prealloc_blocks_min = val;
+	write_sequnlock(&optinf->seqlock);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(data_prealloc_blocks_min);
+
 static ssize_t
data_prealloc_contig_only_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -742,6 +801,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr); static struct attribute *options_attrs[] = { SCOUTFS_ATTR_PTR(data_prealloc_blocks), + SCOUTFS_ATTR_PTR(data_prealloc_blocks_min), SCOUTFS_ATTR_PTR(data_prealloc_contig_only), SCOUTFS_ATTR_PTR(ino_alloc_per_lock), SCOUTFS_ATTR_PTR(lock_idle_count), diff --git a/kmod/src/options.h b/kmod/src/options.h index b37bbd705..ca5444329 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -7,6 +7,7 @@ struct scoutfs_mount_options { u64 data_prealloc_blocks; + u64 data_prealloc_blocks_min; bool data_prealloc_contig_only; unsigned int ino_alloc_per_lock; int lock_idle_count; diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh index cb766300f..e413abbd8 100644 --- a/tests/funcs/filter.sh +++ b/tests/funcs/filter.sh @@ -83,8 +83,10 @@ t_filter_dmesg() re="$re|device-mapper:.*uevent:.*version" re="$re|device-mapper:.*ioctl:.*initialised" - # some tests try invalid devices + # some tests try invalid devices and options re="$re|scoutfs .* error reading super block" + re="$re|scoutfs .* error.*invalid data_prealloc_blocks" + re="$re|scoutfs .* error.*data_prealloc_blocks_min .* must not exceed" re="$re| EXT4-fs (.*): get root inode failed" re="$re| EXT4-fs (.*): mount failed" re="$re| EXT4-fs (.*): no journal found" diff --git a/tests/golden/basic-bad-mounts b/tests/golden/basic-bad-mounts index 6c84453ff..c19cc42c0 100644 --- a/tests/golden/basic-bad-mounts +++ b/tests/golden/basic-bad-mounts @@ -4,3 +4,6 @@ == both meta devices == both data devices == good volume, bad option and good options +== blocks_min greater than blocks +== blocks_min greater than default blocks +== blocks_min greater than blocks, reversed order diff --git a/tests/golden/data-prealloc b/tests/golden/data-prealloc index 19efd20a1..41872df0b 100644 --- a/tests/golden/data-prealloc +++ b/tests/golden/data-prealloc @@ -24,6 +24,21 @@ 
/mnt/test/test/data-prealloc/file-2: extents: 6 /mnt/test/test/data-prealloc/file-1: extents: 3 /mnt/test/test/data-prealloc/file-2: extents: 3 +== blocks_min rejects values greater than blocks +blocks_min after rejected write: 0 +== blocks rejects values less than blocks_min +blocks after rejected write: 32 +== blocks_min accepts value equal to blocks +blocks_min after accepted write: 32 +== blocks_min suppresses prealloc for small files +/mnt/test/test/data-prealloc/file-1: extents: 8 +/mnt/test/test/data-prealloc/file-2: extents: 8 +== blocks_min prealloc ramps up past threshold +/mnt/test/test/data-prealloc/file-1: extents: 8 +/mnt/test/test/data-prealloc/file-2: extents: 8 +== blocks_min with region prealloc +/mnt/test/test/data-prealloc/file-1: extents: 12 +/mnt/test/test/data-prealloc/file-2: extents: 12 == block writes into region allocs hole wrote blk 24 wrote blk 32 diff --git a/tests/tests/basic-bad-mounts.sh b/tests/tests/basic-bad-mounts.sh index a20f36b6c..409310e19 100644 --- a/tests/tests/basic-bad-mounts.sh +++ b/tests/tests/basic-bad-mounts.sh @@ -30,4 +30,13 @@ mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_ echo "== good volume, bad option and good options" mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR" +echo "== blocks_min greater than blocks" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks=32,data_prealloc_blocks_min=33 "$T_EX_DATA_DEV" "$T_MSCR" + +echo "== blocks_min greater than default blocks" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks_min=2049 "$T_EX_DATA_DEV" "$T_MSCR" + +echo "== blocks_min greater than blocks, reversed order" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks_min=33,data_prealloc_blocks=32 "$T_EX_DATA_DEV" "$T_MSCR" + t_pass diff --git a/tests/tests/data-prealloc.sh b/tests/tests/data-prealloc.sh index a76909d77..0447d741b 100644 --- 
a/tests/tests/data-prealloc.sh +++ b/tests/tests/data-prealloc.sh @@ -90,10 +90,17 @@ print_extents_found() done } +read_opt() +{ + cat "$(t_sysfs_path 0)/mount_options/$1" +} + t_save_all_sysfs_mount_options data_prealloc_blocks +t_save_all_sysfs_mount_options data_prealloc_blocks_min t_save_all_sysfs_mount_options data_prealloc_contig_only restore_options() { + t_restore_all_sysfs_mount_options data_prealloc_blocks_min t_restore_all_sysfs_mount_options data_prealloc_blocks t_restore_all_sysfs_mount_options data_prealloc_contig_only } @@ -153,6 +160,50 @@ t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 write_forwards $prefix 3 print_extents_found $prefix +echo "== blocks_min rejects values greater than blocks" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 33 +echo "blocks_min after rejected write: $(read_opt data_prealloc_blocks_min)" + +echo "== blocks rejects values less than blocks_min" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 16 +t_set_sysfs_mount_option 0 data_prealloc_blocks 8 +echo "blocks after rejected write: $(read_opt data_prealloc_blocks)" + +echo "== blocks_min accepts value equal to blocks" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 32 +echo "blocks_min after accepted write: $(read_opt data_prealloc_blocks_min)" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 + +echo "== blocks_min suppresses prealloc for small files" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 16 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 +write_forwards $prefix 8 +print_extents_found $prefix + 
+echo "== blocks_min prealloc ramps up past threshold" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 4 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 +write_forwards $prefix 64 +print_extents_found $prefix + +echo "== blocks_min with region prealloc" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 16 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 8 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 +write_forwards $prefix 64 +print_extents_found $prefix + # # prepare aligned regions of 8 blocks that we'll write into. # We'll right into the first, last, and middle block of each @@ -163,7 +214,8 @@ print_extents_found $prefix # through. The correct output is tied to preallocation strategy so it # has to be verified each time we change preallocation. # -echo "== block writes into region allocs hole" +echo "== block writes into region allocs hole" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 t_set_sysfs_mount_option 0 data_prealloc_blocks 8 t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 touch "$prefix"