diff --git a/kmod/src/data.c b/kmod/src/data.c index 0abb48c0..74747051 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -384,6 +384,15 @@ static inline u64 ext_last(struct scoutfs_extent *ext) * This can waste a lot of space for small or sparse files but is * reasonable when a file population is known to be large and dense but * known to be written with non-streaming write patterns. + * + * In either strategy, a minimum block count threshold can be configured + * to suppress preallocation for small files and then ramp up + * preallocation proportionally with the file's online block count. + * Files below the threshold get single block allocations. Files above + * the threshold limit preallocation to their current online block count, + * regardless of write offset. This avoids overallocation for small + * files in mixed-size workloads while still allowing large files to + * benefit from full preallocation. */ static int alloc_block(struct super_block *sb, struct inode *inode, struct scoutfs_extent *ext, u64 iblock, @@ -400,6 +409,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode, struct scoutfs_extent found; struct scoutfs_extent pre = {0,}; bool undo_pre = false; + bool have_onoff = false; u64 blkno = 0; u64 online; u64 offline; @@ -445,6 +455,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode, * blocks. */ scoutfs_inode_get_onoff(inode, &online, &offline); + have_onoff = true; if (iblock > 1 && iblock == online) { ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found); @@ -491,6 +502,23 @@ static int alloc_block(struct super_block *sb, struct inode *inode, /* overall prealloc limit */ count = min_t(u64, count, opts.data_prealloc_blocks); + /* + * Limit preallocation based on the number of blocks already + * allocated in the file. Files with fewer online blocks than + * the configured minimum get no preallocation. 
Once past the + * minimum, preallocation ramps up proportionally with the + * file's online block count rather than jumping to the full + * prealloc size. + */ + if (opts.data_prealloc_blocks_min > 0 && !ext->len) { + if (!have_onoff) + scoutfs_inode_get_onoff(inode, &online, &offline); + if (online < opts.data_prealloc_blocks_min) + count = 1; + else + count = min(count, online); + } + ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri, &datinf->dalloc, count, &blkno, &count); if (ret < 0) diff --git a/kmod/src/options.c b/kmod/src/options.c index b7565d76..e244be9c 100644 --- a/kmod/src/options.c +++ b/kmod/src/options.c @@ -32,6 +32,7 @@ enum { Opt_acl, Opt_data_prealloc_blocks, + Opt_data_prealloc_blocks_min, Opt_data_prealloc_contig_only, Opt_ino_alloc_per_lock, Opt_lock_idle_count, @@ -48,6 +49,7 @@ enum { static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"}, + {Opt_data_prealloc_blocks_min, "data_prealloc_blocks_min=%s"}, {Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"}, {Opt_ino_alloc_per_lock, "ino_alloc_per_lock=%s"}, {Opt_lock_idle_count, "lock_idle_count=%s"}, @@ -252,6 +254,18 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m opts->data_prealloc_blocks = nr64; break; + case Opt_data_prealloc_blocks_min: + ret = match_u64(args, &nr64); + if (ret < 0 || nr64 > MAX_DATA_PREALLOC_BLOCKS) { + scoutfs_err(sb, "invalid data_prealloc_blocks_min option, must be between 0 and %llu", + MAX_DATA_PREALLOC_BLOCKS); + if (ret == 0) + ret = -EINVAL; + return ret; + } + opts->data_prealloc_blocks_min = nr64; + break; + case Opt_data_prealloc_contig_only: ret = match_int(args, &nr); if (ret < 0 || nr < 0 || nr > 1) { @@ -366,6 +380,12 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m return -EINVAL; } + if (opts->data_prealloc_blocks_min > opts->data_prealloc_blocks) { + scoutfs_err(sb, "data_prealloc_blocks_min %llu must not 
exceed data_prealloc_blocks %llu", + opts->data_prealloc_blocks_min, opts->data_prealloc_blocks); + return -EINVAL; + } + return 0; } @@ -437,6 +457,7 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root) if (is_acl) seq_puts(seq, ",acl"); seq_printf(seq, ",data_prealloc_blocks=%llu", opts.data_prealloc_blocks); + seq_printf(seq, ",data_prealloc_blocks_min=%llu", opts.data_prealloc_blocks_min); seq_printf(seq, ",data_prealloc_contig_only=%u", opts.data_prealloc_contig_only); seq_printf(seq, ",ino_alloc_per_lock=%u", opts.ino_alloc_per_lock); seq_printf(seq, ",metadev_path=%s", opts.metadev_path); @@ -480,6 +501,8 @@ static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attr MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS); return -EINVAL; } + if (val < optinf->opts.data_prealloc_blocks_min) + return -EINVAL; write_seqlock(&optinf->seqlock); optinf->opts.data_prealloc_blocks = val; @@ -489,6 +512,53 @@ static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attr } SCOUTFS_ATTR_RW(data_prealloc_blocks); +static ssize_t data_prealloc_blocks_min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + struct scoutfs_mount_options opts; + + scoutfs_options_read(sb, &opts); + + return snprintf(buf, PAGE_SIZE, "%llu", opts.data_prealloc_blocks_min); +} + +/* + * Update data_prealloc_blocks_min from sysfs. The input is parsed as an + * unsigned value to match the u64 it is stored in, and is rejected with + * -EINVAL if it does not parse or exceeds the current + * data_prealloc_blocks. + * + * NOTE(review): the comparison with data_prealloc_blocks happens before + * the seqlock is taken; confirm that concurrent writers of the two + * options cannot interleave and break the min <= blocks invariant. + */ +static ssize_t data_prealloc_blocks_min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj); + DECLARE_OPTIONS_INFO(sb, optinf); + char nullterm[30]; /* more than enough for octal -U64_MAX */ + u64 val; + int len; + int ret; + + len = min(count, sizeof(nullterm) - 1); + memcpy(nullterm, buf, len); + nullterm[len] = '\0'; + + ret = kstrtoull(nullterm, 0, &val); + if (ret < 0 || val > optinf->opts.data_prealloc_blocks) + return -EINVAL; + + write_seqlock(&optinf->seqlock); + 
optinf->opts.data_prealloc_blocks_min = val; + write_sequnlock(&optinf->seqlock); + + return count; +} +SCOUTFS_ATTR_RW(data_prealloc_blocks_min); + static ssize_t data_prealloc_contig_only_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -742,6 +812,7 @@ SCOUTFS_ATTR_RO(quorum_slot_nr); static struct attribute *options_attrs[] = { SCOUTFS_ATTR_PTR(data_prealloc_blocks), + SCOUTFS_ATTR_PTR(data_prealloc_blocks_min), SCOUTFS_ATTR_PTR(data_prealloc_contig_only), SCOUTFS_ATTR_PTR(ino_alloc_per_lock), SCOUTFS_ATTR_PTR(lock_idle_count), diff --git a/kmod/src/options.h b/kmod/src/options.h index b37bbd70..ca544432 100644 --- a/kmod/src/options.h +++ b/kmod/src/options.h @@ -7,6 +7,7 @@ struct scoutfs_mount_options { u64 data_prealloc_blocks; + u64 data_prealloc_blocks_min; bool data_prealloc_contig_only; unsigned int ino_alloc_per_lock; int lock_idle_count; diff --git a/tests/funcs/filter.sh b/tests/funcs/filter.sh index cb766300..e413abbd 100644 --- a/tests/funcs/filter.sh +++ b/tests/funcs/filter.sh @@ -83,8 +83,10 @@ t_filter_dmesg() re="$re|device-mapper:.*uevent:.*version" re="$re|device-mapper:.*ioctl:.*initialised" - # some tests try invalid devices + # some tests try invalid devices and options re="$re|scoutfs .* error reading super block" + re="$re|scoutfs .* error.*invalid data_prealloc_blocks" + re="$re|scoutfs .* error.*data_prealloc_blocks_min .* must not exceed" re="$re| EXT4-fs (.*): get root inode failed" re="$re| EXT4-fs (.*): mount failed" re="$re| EXT4-fs (.*): no journal found" diff --git a/tests/golden/basic-bad-mounts b/tests/golden/basic-bad-mounts index 6c84453f..c19cc42c 100644 --- a/tests/golden/basic-bad-mounts +++ b/tests/golden/basic-bad-mounts @@ -4,3 +4,6 @@ == both meta devices == both data devices == good volume, bad option and good options +== blocks_min greater than blocks +== blocks_min greater than default blocks +== blocks_min greater than blocks, reversed order diff --git a/tests/golden/data-prealloc 
b/tests/golden/data-prealloc index 19efd20a..41872df0 100644 --- a/tests/golden/data-prealloc +++ b/tests/golden/data-prealloc @@ -24,6 +24,21 @@ /mnt/test/test/data-prealloc/file-2: extents: 6 /mnt/test/test/data-prealloc/file-1: extents: 3 /mnt/test/test/data-prealloc/file-2: extents: 3 +== blocks_min rejects values greater than blocks +blocks_min after rejected write: 0 +== blocks rejects values less than blocks_min +blocks after rejected write: 32 +== blocks_min accepts value equal to blocks +blocks_min after accepted write: 32 +== blocks_min suppresses prealloc for small files +/mnt/test/test/data-prealloc/file-1: extents: 8 +/mnt/test/test/data-prealloc/file-2: extents: 8 +== blocks_min prealloc ramps up past threshold +/mnt/test/test/data-prealloc/file-1: extents: 8 +/mnt/test/test/data-prealloc/file-2: extents: 8 +== blocks_min with region prealloc +/mnt/test/test/data-prealloc/file-1: extents: 12 +/mnt/test/test/data-prealloc/file-2: extents: 12 == block writes into region allocs hole wrote blk 24 wrote blk 32 diff --git a/tests/tests/basic-bad-mounts.sh b/tests/tests/basic-bad-mounts.sh index a20f36b6..409310e1 100644 --- a/tests/tests/basic-bad-mounts.sh +++ b/tests/tests/basic-bad-mounts.sh @@ -30,4 +30,13 @@ mount_fail -o metadev_path=$T_EX_DATA_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_ echo "== good volume, bad option and good options" mount_fail -o _bad,metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$T_MSCR" +echo "== blocks_min greater than blocks" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks=32,data_prealloc_blocks_min=33 "$T_EX_DATA_DEV" "$T_MSCR" + +echo "== blocks_min greater than default blocks" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks_min=2049 "$T_EX_DATA_DEV" "$T_MSCR" + +echo "== blocks_min greater than blocks, reversed order" +mount_fail -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0,data_prealloc_blocks_min=33,data_prealloc_blocks=32 
"$T_EX_DATA_DEV" "$T_MSCR" + t_pass diff --git a/tests/tests/data-prealloc.sh b/tests/tests/data-prealloc.sh index a76909d7..0447d741 100644 --- a/tests/tests/data-prealloc.sh +++ b/tests/tests/data-prealloc.sh @@ -90,10 +90,17 @@ print_extents_found() done } +read_opt() +{ + cat "$(t_sysfs_path 0)/mount_options/$1" +} + t_save_all_sysfs_mount_options data_prealloc_blocks +t_save_all_sysfs_mount_options data_prealloc_blocks_min t_save_all_sysfs_mount_options data_prealloc_contig_only restore_options() { + t_restore_all_sysfs_mount_options data_prealloc_blocks_min t_restore_all_sysfs_mount_options data_prealloc_blocks t_restore_all_sysfs_mount_options data_prealloc_contig_only } @@ -153,6 +160,50 @@ t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 write_forwards $prefix 3 print_extents_found $prefix +echo "== blocks_min rejects values greater than blocks" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 33 +echo "blocks_min after rejected write: $(read_opt data_prealloc_blocks_min)" + +echo "== blocks rejects values less than blocks_min" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 16 +t_set_sysfs_mount_option 0 data_prealloc_blocks 8 +echo "blocks after rejected write: $(read_opt data_prealloc_blocks)" + +echo "== blocks_min accepts value equal to blocks" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 32 +echo "blocks_min after accepted write: $(read_opt data_prealloc_blocks_min)" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 + +echo "== blocks_min suppresses prealloc for small files" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 
data_prealloc_blocks_min 16 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 +write_forwards $prefix 8 +print_extents_found $prefix + +echo "== blocks_min prealloc ramps up past threshold" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 32 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 4 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 +write_forwards $prefix 64 +print_extents_found $prefix + +echo "== blocks_min with region prealloc" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 +t_set_sysfs_mount_option 0 data_prealloc_blocks 16 +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 8 +t_set_sysfs_mount_option 0 data_prealloc_contig_only 0 +write_forwards $prefix 64 +print_extents_found $prefix + # # prepare aligned regions of 8 blocks that we'll write into. # We'll right into the first, last, and middle block of each @@ -163,7 +214,8 @@ print_extents_found $prefix # through. The correct output is tied to preallocation strategy so it # has to be verified each time we change preallocation. # -echo "== block writes into region allocs hole" +echo "== block writes into region allocs hole" +t_set_sysfs_mount_option 0 data_prealloc_blocks_min 0 t_set_sysfs_mount_option 0 data_prealloc_blocks 8 t_set_sysfs_mount_option 0 data_prealloc_contig_only 1 touch "$prefix"