diff --git a/Documentation/btrfs-rescue.rst b/Documentation/btrfs-rescue.rst index f52e6c263..7fc2bde59 100644 --- a/Documentation/btrfs-rescue.rst +++ b/Documentation/btrfs-rescue.rst @@ -50,6 +50,34 @@ fix-device-size WARNING: CPU: 3 PID: 439 at fs/btrfs/ctree.h:1559 btrfs_update_device+0x1c5/0x1d0 [btrfs] +fix-data-checksum <device> + fix data checksum mismatches + + There is a long-standing problem: if a user space program is doing + direct IO and modifies the buffer before the writeback has finished, it + can lead to data checksum mismatches. + + This problem was known but not fixed until upstream release v6.15 + (backported to older kernels). So it's possible to hit false data + checksum mismatches on any long-running btrfs. + + In that case this command can be used to repair such problems. + + ``Options`` + + -r|--readonly + readonly mode, only scan and report data checksum mismatches, + do not repair + + -i|--interactive + interactive mode, ask how to repair, ignore the error by default + + -m|--mirror <num> + use the specified mirror to update the checksum item for all corrupted blocks. + + The value must be >= 1, and if the corrupted block has fewer mirrors than + the value, the mirror number will be `num % (num_mirrors + 1)`. + .. _man-rescue-clear-ino-cache: clear-ino-cache diff --git a/Documentation/mkfs.btrfs.rst b/Documentation/mkfs.btrfs.rst index 119e18b47..00fe418b3 100644 --- a/Documentation/mkfs.btrfs.rst +++ b/Documentation/mkfs.btrfs.rst @@ -213,6 +213,41 @@ OPTIONS :file:`hardlink1` and :file:`hardlink2` because :file:`hardlink3` will be inside a new subvolume. +--inode-flags <flags>:<path> + Specify that *path* should have the inode *flags*, rather than the default ones (which + imply data CoW and data checksum). The option *--rootdir* must also be + specified. This option can be specified multiple times. + + The supported flag(s) are: + + * *nodatacow*: disable data CoW, implies *nodatasum* for regular files. + * *nodatasum*: disable data checksum only. 
+ + *flags* can be separated by comma (','). + + Children inodes will inherit the flags from their parent inodes, like the + following case: + + .. code-block:: none + + rootdir/ + |- file1 + |- file2 + |- dir/ + |- file3 + + In that case, if *--inode-flags nodatacow:dir* is specified, both + :file:`dir` and :file:`file3` will have the *nodatacow* flag. + + And this option also works with *--subvol* option, but the inode flag of + each subvolume is independent and will not inherit from the parent directory. + (The same as the kernel behavior) + + .. note:: + Both *--inode-flags* and *--subvol* options are memory hungry, + will consume at least 8KiB for each option. + Please keep the usage of both options to minimal. + --shrink Shrink the filesystem to its minimal size, only works with *--rootdir* option. diff --git a/Makefile b/Makefile index 7e36aa425..523b83495 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,7 @@ cmds_objects = cmds/subvolume.o cmds/subvolume-list.o \ cmds/inspect.o cmds/balance.o cmds/send.o cmds/receive.o \ cmds/quota.o cmds/qgroup.o cmds/replace.o check/main.o \ cmds/restore.o cmds/rescue.o cmds/rescue-chunk-recover.o \ - cmds/rescue-super-recover.o \ + cmds/rescue-super-recover.o cmds/rescue-fix-data-checksum.o \ cmds/property.o cmds/filesystem-usage.o cmds/inspect-dump-tree.o \ cmds/inspect-dump-super.o cmds/inspect-tree-stats.o cmds/filesystem-du.o \ cmds/reflink.o \ diff --git a/cmds/replace.c b/cmds/replace.c index 5f1222b24..887c3251a 100644 --- a/cmds/replace.c +++ b/cmds/replace.c @@ -319,12 +319,11 @@ static int cmd_replace_start(const struct cmd_struct *cmd, ret = ioctl(fdmnt, BTRFS_IOC_DEV_REPLACE, &start_args); if (do_not_background) { if (ret < 0) { - error("ioctl(DEV_REPLACE_START) failed on \"%s\": %m", path); - if (start_args.result != BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_RESULT) - pr_stderr(LOG_DEFAULT, ", %s\n", - replace_dev_result2string(start_args.result)); + if (start_args.result == 
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_RESULT) + error("ioctl(DEV_REPLACE_START) failed on \"%s\": %m", path); else - pr_stderr(LOG_DEFAULT, "\n"); + error("ioctl(DEV_REPLACE_START) failed on \"%s\": %m, %s", + path, replace_dev_result2string(start_args.result)); if (errno == EOPNOTSUPP) warning("device replace of RAID5/6 not supported with this kernel"); diff --git a/cmds/rescue-fix-data-checksum.c b/cmds/rescue-fix-data-checksum.c new file mode 100644 index 000000000..23b59fffe --- /dev/null +++ b/cmds/rescue-fix-data-checksum.c @@ -0,0 +1,511 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include "kerncompat.h" +#include "kernel-shared/disk-io.h" +#include "kernel-shared/ctree.h" +#include "kernel-shared/volumes.h" +#include "kernel-shared/backref.h" +#include "kernel-shared/transaction.h" +#include "kernel-shared/file-item.h" +#include "common/messages.h" +#include "common/open-utils.h" +#include "cmds/rescue.h" + +/* + * Record one corrupted data blocks. + * + * We do not report immediately, this is for future file deleting support. + */ +struct corrupted_block { + struct list_head list; + /* The logical bytenr of the exact corrupted block. */ + u64 logical; + + /* The amount of mirrors above logical have. */ + unsigned int num_mirrors; + + /* + * Which mirror failed. 
+	 *
+	 * Note, bit 0 means mirror 1, since mirror 0 means choosing a
+	 * live mirror, and we never utilized that mirror 0.
+	 */
+	unsigned long *error_mirror_bitmap;
+};
+
+enum fix_data_checksum_action_value {
+	ACTION_IGNORE,
+	ACTION_UPDATE_CSUM,
+	ACTION_LAST,
+};
+
+static const struct fix_data_checksum_action {
+	enum fix_data_checksum_action_value value;
+	const char *string;
+} actions[] = {
+	[ACTION_IGNORE] = {
+		.value = ACTION_IGNORE,
+		.string = "ignore",
+	},
+	[ACTION_UPDATE_CSUM] = {
+		.value = ACTION_UPDATE_CSUM,
+		.string = "update-csum",
+	},
+};
+
+static int global_repair_mode;
+LIST_HEAD(corrupted_blocks);
+
+/*
+ * Record that @mirror (1-based) of the block at @logical failed.
+ *
+ * If the tail of the corrupted_blocks list already describes @logical, only
+ * its error bitmap is updated, otherwise a new entry is appended.
+ *
+ * Return 0 on success, -ENOMEM if an allocation failed.
+ */
+static int add_corrupted_block(struct btrfs_fs_info *fs_info, u64 logical,
+			       unsigned int mirror, unsigned int num_mirrors)
+{
+	struct corrupted_block *last;
+
+	if (list_empty(&corrupted_blocks))
+		goto add;
+
+	last = list_entry(corrupted_blocks.prev, struct corrupted_block, list);
+	/* The last entry is the same block, just update the error mirror bitmap. */
+	if (last->logical == logical) {
+		UASSERT(last->error_mirror_bitmap);
+		/* Bit 0 means mirror 1, the same mapping as for a new entry. */
+		set_bit(mirror - 1, last->error_mirror_bitmap);
+		return 0;
+	}
+add:
+	last = calloc(1, sizeof(*last));
+	if (!last)
+		return -ENOMEM;
+	/*
+	 * BITS_TO_LONGS() is a count of unsigned longs, not of bytes, and the
+	 * bitmap helpers access whole unsigned longs.
+	 */
+	last->error_mirror_bitmap = calloc(BITS_TO_LONGS(num_mirrors),
+					   sizeof(unsigned long));
+	if (!last->error_mirror_bitmap) {
+		free(last);
+		return -ENOMEM;
+	}
+	set_bit(mirror - 1, last->error_mirror_bitmap);
+	last->logical = logical;
+	last->num_mirrors = num_mirrors;
+
+	list_add_tail(&last->list, &corrupted_blocks);
+	return 0;
+}
+
+/*
+ * Verify all mirrors for @logical.
+ *
+ * If something critical happened, return <0 and should end the run immediately.
+ * Otherwise return 0, including data checksum mismatch or read failure.
+ */
+static int verify_one_data_block(struct btrfs_fs_info *fs_info,
+				 struct extent_buffer *leaf,
+				 unsigned long leaf_offset, u64 logical,
+				 unsigned int num_mirrors)
+{
+	const u32 blocksize = fs_info->sectorsize;
+	const u32 csum_size = fs_info->csum_size;
+	u8 *buf;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u8 csum_expected[BTRFS_CSUM_SIZE];
+	int ret = 0;
+
+	buf = malloc(blocksize);
+	if (!buf)
+		return -ENOMEM;
+
+	/* The expected checksum does not depend on the mirror, read it once. */
+	read_extent_buffer(leaf, csum_expected, leaf_offset, csum_size);
+
+	for (unsigned int mirror = 1; mirror <= num_mirrors; mirror++) {
+		u64 read_len = blocksize;
+
+		ret = read_data_from_disk(fs_info, buf, logical, &read_len, mirror);
+		if (ret < 0) {
+			/* IO error, add one record. */
+			ret = add_corrupted_block(fs_info, logical, mirror, num_mirrors);
+			if (ret < 0)
+				break;
+			/*
+			 * @buf holds no valid data for this mirror, so there
+			 * is nothing to checksum.
+			 */
+			continue;
+		}
+		/* Verify the data checksum. */
+		btrfs_csum_data(fs_info, fs_info->csum_type, buf, csum, blocksize);
+		if (memcmp(csum_expected, csum, csum_size) != 0) {
+			ret = add_corrupted_block(fs_info, logical, mirror, num_mirrors);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	free(buf);
+	return ret;
+}
+
+static int iterate_one_csum_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	const unsigned long item_ptr_off = btrfs_item_ptr_offset(path->nodes[0],
+								 path->slots[0]);
+	const u32 blocksize = fs_info->sectorsize;
+	int num_mirrors;
+	u64 data_size;
+	u64 cur;
+	char *buf;
+	int ret = 0;
+
+	buf = malloc(blocksize);
+	if (!buf)
+		return -ENOMEM;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+	data_size = btrfs_item_size(path->nodes[0], path->slots[0]) /
+		    fs_info->csum_size * blocksize;
+	num_mirrors = btrfs_num_copies(fs_info, key.offset, data_size);
+
+	for (cur = 0; cur < data_size; cur += blocksize) {
+		const unsigned long leaf_offset = item_ptr_off +
+			cur / blocksize * fs_info->csum_size;
+
+		ret = verify_one_data_block(fs_info, path->nodes[0], leaf_offset,
+					    key.offset + cur, num_mirrors);
+		if (ret < 0)
+			break;
+	}
+	free(buf);
+	return
ret;
+}
+
+/*
+ * Callback for iterate_inodes_from_logical(): print every path that resolves
+ * to inode @ino in subvolume @rootid.  @ctx is the btrfs_fs_info.
+ *
+ * Return 0 on success, <0 on error.
+ */
+static int print_filenames(u64 ino, u64 offset, u64 rootid, void *ctx)
+{
+	struct btrfs_fs_info *fs_info = ctx;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct inode_fs_paths *ipath;
+	struct btrfs_path path = { 0 };
+	int ret;
+
+	key.objectid = rootid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	root = btrfs_read_fs_root(fs_info, &key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		errno = -ret;
+		error("failed to get subvolume %llu: %m", rootid);
+		return ret;
+	}
+	ipath = init_ipath(128 * BTRFS_PATH_NAME_MAX, root, &path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		errno = -ret;
+		error("failed to initialize ipath: %m");
+		return ret;
+	}
+	ret = paths_from_inode(ino, ipath);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to resolve root %llu ino %llu to paths: %m", rootid, ino);
+		goto out;
+	}
+	for (int i = 0; i < ipath->fspath->elem_cnt; i++)
+		printf(" (subvolume %llu)/%s\n", rootid, (char *)ipath->fspath->val[i]);
+	if (ipath->fspath->elem_missed)
+		printf(" (subvolume %llu) %d files not printed\n", rootid,
+		       ipath->fspath->elem_missed);
+out:
+	free_ipath(ipath);
+	return ret;
+}
+
+/*
+ * Walk every item of @csum_root and verify the data blocks covered by each
+ * BTRFS_EXTENT_CSUM_KEY item.
+ *
+ * Return 0 after the whole tree has been walked, <0 on the first error.
+ */
+static int iterate_csum_root(struct btrfs_fs_info *fs_info, struct btrfs_root *csum_root)
+{
+	struct btrfs_path path = { 0 };
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to get the first tree block of csum tree: %m");
+		return ret;
+	}
+	UASSERT(ret > 0);
+	while (true) {
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.type != BTRFS_EXTENT_CSUM_KEY)
+			goto next;
+		ret = iterate_one_csum_item(fs_info, &path);
+		if (ret < 0)
+			break;
+next:
+		ret = btrfs_next_item(csum_root, &path);
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
+		if (ret < 0) {
+			errno = -ret;
+			error("failed to get next csum item: %m");
+			/* The path is not usable after a failure, stop here. */
+			break;
+		}
+	}
+	btrfs_release_path(&path);
+	return
ret; +} + +#define ASK_ACTION_BUFSIZE (32) +static enum fix_data_checksum_action_value ask_action(unsigned int num_mirrors, + unsigned int *mirror_ret) +{ + unsigned long ret; + char buf[ASK_ACTION_BUFSIZE] = { 0 }; + bool printed; + char *endptr; + +again: + printed = false; + for (int i = 0; i < ACTION_LAST; i++) { + if (printed) + printf("/"); + /* Mark Ignore as default */ + if (i == ACTION_IGNORE) { + printf("<<%c>>%s", toupper(actions[i].string[0]), + actions[i].string + 1); + } else if (i == ACTION_UPDATE_CSUM) { + /* + * For update-csum action, we need a mirror number, + * so output all valid mirrors numbers instead. + */ + for (int cur_mirror = 1; cur_mirror <= num_mirrors; + cur_mirror++) + printf("<%u>", cur_mirror); + } else { + printf("<%c>%s", toupper(actions[i].string[0]), + actions[i].string + 1); + } + printed = true; + } + printf(":"); + fflush(stdout); + /* Default to Ignore if no action provided. */ + if (!fgets(buf, sizeof(buf) - 1, stdin)) + return ACTION_IGNORE; + if (buf[0] == '\n') + return ACTION_IGNORE; + /* Check exact match or matching the initial letter. */ + for (int i = 0; i < ACTION_LAST; i++) { + if ((strncasecmp(buf, actions[i].string, 1) == 0 || + strncasecmp(buf, actions[i].string, ASK_ACTION_BUFSIZE) == 0) && + actions[i].value != ACTION_UPDATE_CSUM) + return actions[i].value; + } + /* No match, check if it's some numeric string. */ + ret = strtoul(buf, &endptr, 10); + if (endptr == buf || ret == ULONG_MAX) { + /* No valid action found, retry. 
*/
+		warning("invalid action, please retry");
+		goto again;
+	}
+	if (ret > num_mirrors || ret == 0) {
+		warning("invalid mirror number %lu, must be in range [1, %d], please retry",
+			ret, num_mirrors);
+		goto again;
+	}
+	*mirror_ret = ret;
+	return ACTION_UPDATE_CSUM;
+}
+
+/*
+ * Recompute the checksum of the block at @logical from the data of @mirror
+ * and store it in the csum item, inside its own transaction.
+ *
+ * Return 0 on success, <0 on error.
+ */
+static int update_csum_item(struct btrfs_fs_info *fs_info, u64 logical,
+			    unsigned int mirror)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+	struct btrfs_path path = { 0 };
+	struct btrfs_csum_item *citem;
+	u64 read_len = fs_info->sectorsize;
+	u8 csum[BTRFS_CSUM_SIZE] = { 0 };
+	u8 *buf;
+	int ret;
+
+	buf = malloc(fs_info->sectorsize);
+	if (!buf)
+		return -ENOMEM;
+	ret = read_data_from_disk(fs_info, buf, logical, &read_len, mirror);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to read block at logical %llu mirror %u: %m",
+		      logical, mirror);
+		goto out;
+	}
+	trans = btrfs_start_transaction(csum_root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		errno = -ret;
+		error_msg(ERROR_MSG_START_TRANS, "%m");
+		goto out;
+	}
+	citem = btrfs_lookup_csum(trans, csum_root, &path, logical,
+				  BTRFS_EXTENT_CSUM_OBJECTID, fs_info->csum_type, 1);
+	if (IS_ERR(citem)) {
+		ret = PTR_ERR(citem);
+		errno = -ret;
+		error("failed to find csum item for logical %llu: %m", logical);
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
+	btrfs_csum_data(fs_info, fs_info->csum_type, buf, csum, fs_info->sectorsize);
+	write_extent_buffer(path.nodes[0], csum, (unsigned long)citem, fs_info->csum_size);
+	btrfs_release_path(&path);
+	ret = btrfs_commit_transaction(trans, csum_root);
+	if (ret < 0) {
+		errno = -ret;
+		error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
+		/* Nothing reached the disk, do not claim success. */
+		goto out;
+	}
+	printf("Csum item for logical %llu updated using data from mirror %u\n",
+	       logical, mirror);
+out:
+	free(buf);
+	btrfs_release_path(&path);
+	return ret;
+}
+
+static void report_corrupted_blocks(struct btrfs_fs_info *fs_info,
+				    enum btrfs_fix_data_checksum_mode mode,
+				    unsigned int
mirror)
+{
+	struct corrupted_block *entry;
+	struct btrfs_path path = { 0 };
+
+	if (list_empty(&corrupted_blocks)) {
+		printf("No data checksum mismatch found\n");
+		return;
+	}
+
+	list_for_each_entry(entry, &corrupted_blocks, list) {
+		/* Ignore unless the mode below decides otherwise. */
+		enum fix_data_checksum_action_value action = ACTION_IGNORE;
+		/*
+		 * Mirror used for this block only, so that the choice made for
+		 * one block never changes the mirror used for the next one.
+		 */
+		unsigned int use_mirror = mirror;
+		bool has_printed = false;
+		int ret;
+
+		printf("logical=%llu corrupted mirrors=", entry->logical);
+		/* Poor man's bitmap print. */
+		for (unsigned int i = 0; i < entry->num_mirrors; i++) {
+			if (test_bit(i, entry->error_mirror_bitmap)) {
+				if (has_printed)
+					printf(",");
+				/*
+				 * Bit 0 means mirror 1, thus we need to increase
+				 * the value by 1.
+				 */
+				printf("%u", i + 1);
+				has_printed = true;
+			}
+		}
+		printf(" affected files:\n");
+		ret = iterate_inodes_from_logical(entry->logical, fs_info, &path,
+						  print_filenames, fs_info);
+		if (ret < 0) {
+			errno = -ret;
+			error("failed to iterate involved files: %m");
+			break;
+		}
+		switch (mode) {
+		case BTRFS_FIX_DATA_CSUMS_INTERACTIVE:
+			action = ask_action(entry->num_mirrors, &use_mirror);
+			break;
+		case BTRFS_FIX_DATA_CSUMS_READONLY:
+			action = ACTION_IGNORE;
+			break;
+		case BTRFS_FIX_DATA_CSUMS_UPDATE_CSUM_ITEM:
+			action = ACTION_UPDATE_CSUM;
+			use_mirror = mirror % (entry->num_mirrors + 1);
+			/*
+			 * The modulo yields 0 when the requested mirror is a
+			 * multiple of (num_mirrors + 1).  Mirror 0 is not a
+			 * real mirror, use the first one instead.
+			 */
+			if (use_mirror == 0)
+				use_mirror = 1;
+			break;
+		default:
+			UASSERT(0);
+		}
+
+		switch (action) {
+		case ACTION_IGNORE:
+			break;
+		case ACTION_UPDATE_CSUM:
+			UASSERT(use_mirror > 0 && use_mirror <= entry->num_mirrors);
+			ret = update_csum_item(fs_info, entry->logical, use_mirror);
+			break;
+		default:
+			UASSERT(0);
+		}
+	}
+}
+
+static void free_corrupted_blocks(void)
+{
+	while (!list_empty(&corrupted_blocks)) {
+		struct corrupted_block *entry;
+
+		entry = list_entry(corrupted_blocks.next, struct corrupted_block, list);
+		list_del_init(&entry->list);
+		free(entry->error_mirror_bitmap);
+		free(entry);
+	}
+}
+
+int btrfs_recover_fix_data_checksum(const char *path,
+				    enum btrfs_fix_data_checksum_mode mode,
+				    unsigned int mirror)
+{
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *csum_root;
+	struct
open_ctree_args oca = { 0 }; + int ret; + + if (mode >= BTRFS_FIX_DATA_CSUMS_LAST) + return -EINVAL; + + if (mode == BTRFS_FIX_DATA_CSUMS_UPDATE_CSUM_ITEM) + UASSERT(mirror > 0); + ret = check_mounted(path); + if (ret < 0) { + errno = -ret; + error("could not check mount status: %m"); + return ret; + } + if (ret > 0) { + error("%s is currently mounted", path); + return -EBUSY; + } + + global_repair_mode = mode; + oca.filename = path; + oca.flags = OPEN_CTREE_WRITES; + fs_info = open_ctree_fs_info(&oca); + if (!fs_info) { + error("failed to open btrfs at %s", path); + return -EIO; + } + csum_root = btrfs_csum_root(fs_info, 0); + if (!csum_root) { + error("failed to get csum root"); + ret = -EIO; + goto out_close; + } + ret = iterate_csum_root(fs_info, csum_root); + if (ret) { + errno = -ret; + error("failed to iterate csum tree: %m"); + } + report_corrupted_blocks(fs_info, mode, mirror); +out_close: + free_corrupted_blocks(); + close_ctree_fs_info(fs_info); + return ret; +} diff --git a/cmds/rescue.c b/cmds/rescue.c index c60bf1167..f575646c7 100644 --- a/cmds/rescue.c +++ b/cmds/rescue.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "kernel-lib/list.h" #include "kernel-shared/ctree.h" #include "kernel-shared/volumes.h" @@ -30,6 +31,7 @@ #include "kernel-shared/extent_io.h" #include "kernel-shared/accessors.h" #include "kernel-shared/uapi/btrfs_tree.h" +#include "common/string-utils.h" #include "common/messages.h" #include "common/utils.h" #include "common/help.h" @@ -275,6 +277,68 @@ static int cmd_rescue_fix_device_size(const struct cmd_struct *cmd, } static DEFINE_SIMPLE_COMMAND(rescue_fix_device_size, "fix-device-size"); +static const char * const cmd_rescue_fix_data_checksum_usage[] = { + "btrfs rescue fix-data-checksum ", + "Fix data checksum mismatches.", + "", + OPTLINE("-r|--readonly", "readonly mode, only report errors without repair"), + OPTLINE("-i|--interactive", "interactive mode, ignore the error by default."), + OPTLINE("-m|--mirror 
", "update csum item using specified mirror"),
+	HELPINFO_INSERT_GLOBALS,
+	HELPINFO_INSERT_VERBOSE,
+	NULL
+};
+
+/*
+ * Entry point of "btrfs rescue fix-data-checksum".
+ *
+ * Parses the mode options (the last one given wins, readonly is the default)
+ * and runs btrfs_recover_fix_data_checksum() on the first non-option argument.
+ *
+ * Return 0 on success, 1 on error.
+ */
+static int cmd_rescue_fix_data_checksum(const struct cmd_struct *cmd,
+					int argc, char **argv)
+{
+	enum btrfs_fix_data_checksum_mode mode = BTRFS_FIX_DATA_CSUMS_READONLY;
+	unsigned int mirror = 0;
+	u64 mirror_arg;
+	int ret;
+	optind = 0;
+
+	while (1) {
+		int c;
+		static const struct option long_options[] = {
+			{"readonly", no_argument, NULL, 'r'},
+			{"interactive", no_argument, NULL, 'i'},
+			{"mirror", required_argument, NULL, 'm'},
+			/* getopt_long() needs an all-zero terminating entry. */
+			{NULL, 0, NULL, 0},
+		};
+		c = getopt_long(argc, argv, "rim:", long_options, NULL);
+		if (c < 0)
+			break;
+		switch (c) {
+		case 'r':
+			mode = BTRFS_FIX_DATA_CSUMS_READONLY;
+			break;
+		case 'i':
+			mode = BTRFS_FIX_DATA_CSUMS_INTERACTIVE;
+			break;
+		case 'm':
+			mode = BTRFS_FIX_DATA_CSUMS_UPDATE_CSUM_ITEM;
+			mirror_arg = arg_strtou64(optarg);
+			/* Reject 0 and values that would be truncated in @mirror. */
+			if (mirror_arg == 0 || mirror_arg > (unsigned int)-1) {
+				error("invalid mirror number %llu, must be in range [1, %u]",
+				      (unsigned long long)mirror_arg, (unsigned int)-1);
+				return 1;
+			}
+			mirror = mirror_arg;
+			break;
+		default:
+			usage_unknown_option(cmd, argv);
+		}
+	}
+	if (check_argc_min(argc - optind, 1))
+		return 1;
+	ret = btrfs_recover_fix_data_checksum(argv[optind], mode, mirror);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to fix data checksums: %m");
+	}
+	return !!ret;
+}
+static DEFINE_SIMPLE_COMMAND(rescue_fix_data_checksum, "fix-data-checksum");
+
 static const char * const cmd_rescue_create_control_device_usage[] = {
 	"btrfs rescue create-control-device",
 	"Create /dev/btrfs-control (see 'CONTROL DEVICE' in btrfs(5))",
@@ -527,6 +591,7 @@ static const struct cmd_group rescue_cmd_group = {
 		&cmd_struct_rescue_super_recover,
 		&cmd_struct_rescue_zero_log,
 		&cmd_struct_rescue_fix_device_size,
+		&cmd_struct_rescue_fix_data_checksum,
 		&cmd_struct_rescue_create_control_device,
 		&cmd_struct_rescue_clear_ino_cache,
 		&cmd_struct_rescue_clear_space_cache,
diff --git a/cmds/rescue.h b/cmds/rescue.h
index 5a9e46b7a..f78ec436a 100644
--- a/cmds/rescue.h
+++ 
b/cmds/rescue.h @@ -20,7 +20,17 @@ #ifndef __BTRFS_RESCUE_H__ #define __BTRFS_RESCUE_H__ +enum btrfs_fix_data_checksum_mode { + BTRFS_FIX_DATA_CSUMS_READONLY, + BTRFS_FIX_DATA_CSUMS_INTERACTIVE, + BTRFS_FIX_DATA_CSUMS_UPDATE_CSUM_ITEM, + BTRFS_FIX_DATA_CSUMS_LAST, +}; + int btrfs_recover_superblocks(const char *path, int yes); int btrfs_recover_chunk_tree(const char *path, int yes); +int btrfs_recover_fix_data_checksum(const char *path, + enum btrfs_fix_data_checksum_mode mode, + unsigned int mirror); #endif diff --git a/kernel-shared/ctree.c b/kernel-shared/ctree.c index 3184c9161..f90de606e 100644 --- a/kernel-shared/ctree.c +++ b/kernel-shared/ctree.c @@ -1246,6 +1246,17 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } +/* + * Find the first key in @fs_root that matches all the following conditions: + * + * - key.objectid == @iobjectid + * - key.type == @key_type + * - key.offset >= @ioff + * + * Return 0 if such key can be found, and @found_key is updated. + * Return >0 if no such key can be found. + * Return <0 for critical errors. 
+ */ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) @@ -1280,10 +1291,10 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); if (found_key->type != key.type || - found_key->objectid != key.objectid) { + found_key->objectid != key.objectid) ret = 1; - goto out; - } + else + ret = 0; out: if (path != found_path) diff --git a/kernel-shared/file-item.c b/kernel-shared/file-item.c index 18791c064..503ad657c 100644 --- a/kernel-shared/file-item.c +++ b/kernel-shared/file-item.c @@ -112,7 +112,7 @@ int btrfs_insert_inline_extent(struct btrfs_trans_handle *trans, return err; } -static struct btrfs_csum_item * +struct btrfs_csum_item * btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, diff --git a/kernel-shared/file-item.h b/kernel-shared/file-item.h index cab0bc4e9..5a5d8da10 100644 --- a/kernel-shared/file-item.h +++ b/kernel-shared/file-item.h @@ -89,6 +89,11 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *stack_fi); int btrfs_csum_file_block(struct btrfs_trans_handle *trans, u64 logical, u64 csum_objectid, u32 csum_type, const char *data); +struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 csum_objectid, u16 csum_type, int cow); int btrfs_insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, const char *buffer, size_t size, diff --git a/kernel-shared/inode.c b/kernel-shared/inode.c index 97bcbf822..f77157a2d 100644 --- a/kernel-shared/inode.c +++ b/kernel-shared/inode.c @@ -158,6 +158,59 @@ int btrfs_check_dir_conflict(struct btrfs_root *root, const char *name, return ret; } +/* Similar to btrfs_inherit_iflags(), but different interfaces. 
*/ +static int inherit_inode_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 ino, u64 parent_ino) +{ + struct btrfs_path path = { 0 }; + struct btrfs_key key; + struct btrfs_inode_item *iitem; + u64 parent_inode_flags; + u64 child_inode_flags; + int ret; + + key.objectid = parent_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + iitem = btrfs_item_ptr(path.nodes[0], path.slots[0], struct btrfs_inode_item); + parent_inode_flags = btrfs_inode_flags(path.nodes[0], iitem); + btrfs_release_path(&path); + + key.objectid = ino; + + ret = btrfs_search_slot(trans, root, &key, &path, 0, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + iitem = btrfs_item_ptr(path.nodes[0], path.slots[0], struct btrfs_inode_item); + child_inode_flags = btrfs_inode_flags(path.nodes[0], iitem); + + if (parent_inode_flags & BTRFS_INODE_NOCOMPRESS) { + child_inode_flags &= ~BTRFS_INODE_COMPRESS; + child_inode_flags |= BTRFS_INODE_NOCOMPRESS; + } else if (parent_inode_flags & BTRFS_INODE_COMPRESS){ + child_inode_flags &= ~BTRFS_INODE_NOCOMPRESS; + child_inode_flags |= BTRFS_INODE_COMPRESS; + } + if (parent_inode_flags & BTRFS_INODE_NODATACOW) { + child_inode_flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(btrfs_inode_mode(path.nodes[0], iitem))) + child_inode_flags |= BTRFS_INODE_NODATASUM; + } + btrfs_set_inode_flags(path.nodes[0], iitem, child_inode_flags); +out: + btrfs_release_path(&path); + return ret; +} + /* * Add dir_item/index for 'parent_ino' if add_backref is true, also insert a * backref from the ino to parent dir and update the nlink(Kernel version does @@ -220,6 +273,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, nlink); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); + /* + * If this is the first nlink of the inode, meaning the + * inode is newly created under the 
parent inode, this
+			 * new child inode should inherit the inode flags from
+			 * the parent.
+			 */
+			if (nlink == 1) {
+				ret = inherit_inode_flags(trans, root, ino, parent_ino);
+				if (ret < 0)
+					goto out;
+			}
 		}
 	}
 
diff --git a/mkfs/main.c b/mkfs/main.c
index 4c2ce98c7..872f6872d 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1164,6 +1164,63 @@ static int parse_subvolume(const char *path, struct list_head *subvols,
 	return 0;
 }
 
+/*
+ * Parse one --inode-flags argument and append the result to
+ * @inode_flags_list.
+ *
+ * @option is a comma separated flag list ("nodatacow", "nodatasum"), then a
+ * colon, then the path.
+ *
+ * Return 0 on success, in which case the list owns the new entry.
+ * Return <0 on error, after the error has been reported.
+ */
+static int parse_inode_flags(const char *option, struct list_head *inode_flags_list)
+{
+	struct rootdir_inode_flags_entry *entry = NULL;
+	char *colon;
+	char *flags_str = NULL;
+	char *token;
+	int ret;
+
+	/* strtok() modifies its input, work on a private copy of @option. */
+	flags_str = strdup(option);
+	if (!flags_str) {
+		ret = -ENOMEM;
+		error_msg(ERROR_MSG_MEMORY, NULL);
+		goto cleanup;
+	}
+	entry = calloc(1, sizeof(*entry));
+	if (!entry) {
+		ret = -ENOMEM;
+		error_msg(ERROR_MSG_MEMORY, NULL);
+		goto cleanup;
+	}
+	colon = strstr(flags_str, ":");
+	if (!colon) {
+		error("invalid option: %s", option);
+		ret = -EINVAL;
+		goto cleanup;
+	}
+	/* Split the copy into the flag list and the path. */
+	*colon = '\0';
+
+	for (token = strtok(flags_str, ","); token; token = strtok(NULL, ",")) {
+		if (strcmp(token, "nodatacow") == 0) {
+			entry->nodatacow = true;
+		} else if (strcmp(token, "nodatasum") == 0) {
+			entry->nodatasum = true;
+		} else {
+			error("unknown flag: %s", token);
+			ret = -EINVAL;
+			goto cleanup;
+		}
+	}
+
+	if (arg_copy_path(entry->inode_path, colon + 1, sizeof(entry->inode_path))) {
+		error("--inode-flags path too long");
+		ret = -E2BIG;
+		goto cleanup;
+	}
+	list_add_tail(&entry->list, inode_flags_list);
+	/* The list owns @entry now, only the scratch copy has to go. */
+	free(flags_str);
+	return 0;
+cleanup:
+	free(flags_str);
+	free(entry);
+	return ret;
+}
+
 int BOX_MAIN(mkfs)(int argc, char **argv)
 {
 	char *file;
@@ -1206,10 +1263,12 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
 	int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
 	char *source_dir = NULL;
 	struct rootdir_subvol *rds;
+	struct rootdir_inode_flags_entry *rif;
 	bool has_default_subvol = false;
 	enum btrfs_compression_type compression =
BTRFS_COMPRESS_NONE; unsigned int compression_level = 0; LIST_HEAD(subvols); + LIST_HEAD(inode_flags_list); cpu_detect_flags(); hash_init_accel(); @@ -1223,6 +1282,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv) GETOPT_VAL_CHECKSUM, GETOPT_VAL_GLOBAL_ROOTS, GETOPT_VAL_DEVICE_UUID, + GETOPT_VAL_INODE_FLAGS, GETOPT_VAL_COMPRESS, }; static const struct option long_options[] = { @@ -1241,6 +1301,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv) { "version", no_argument, NULL, 'V' }, { "rootdir", required_argument, NULL, 'r' }, { "subvol", required_argument, NULL, 'u' }, + { "inode-flags", required_argument, NULL, GETOPT_VAL_INODE_FLAGS }, { "nodiscard", no_argument, NULL, 'K' }, { "features", required_argument, NULL, 'O' }, { "runtime-features", required_argument, NULL, 'R' }, @@ -1374,6 +1435,11 @@ int BOX_MAIN(mkfs)(int argc, char **argv) case 'q': bconf_be_quiet(); break; + case GETOPT_VAL_INODE_FLAGS: + ret = parse_inode_flags(optarg, &inode_flags_list); + if (ret) + goto error; + break; case GETOPT_VAL_COMPRESS: if (parse_compression(optarg, &compression, &compression_level)) { ret = 1; @@ -1438,6 +1504,11 @@ int BOX_MAIN(mkfs)(int argc, char **argv) ret = 1; goto error; } + if (!list_empty(&inode_flags_list) && source_dir == NULL) { + error("option --inode-flags must be used with --rootdir"); + ret = 1; + goto error; + } if (source_dir) { char *canonical = realpath(source_dir, NULL); @@ -1503,6 +1574,41 @@ int BOX_MAIN(mkfs)(int argc, char **argv) } } + list_for_each_entry(rif, &inode_flags_list, list) { + char path[PATH_MAX]; + struct rootdir_inode_flags_entry *rif2; + + if (path_cat_out(path, source_dir, rif->inode_path)) { + ret = -EINVAL; + error("path invalid: %s", path); + goto error; + } + if (!realpath(path, rif->full_path)) { + ret = -errno; + error("could not get canonical path: %s: %m", path); + goto error; + } + if (!path_exists(rif->full_path)) { + ret = -ENOENT; + error("inode path does not exist: %s", rif->full_path); + goto error; + } + 
list_for_each_entry(rif2, &inode_flags_list, list) { + /* + * Only compare entryies before us. So we won't compare + * the same pair twice. + */ + if (rif2 == rif) + break; + if (strcmp(rif2->full_path, rif->full_path) == 0) { + error("duplicated inode flag entries for %s", + rif->full_path); + ret = -EEXIST; + goto error; + } + } + } + if (*fs_uuid) { uuid_t dummy_uuid; @@ -2084,9 +2190,15 @@ int BOX_MAIN(mkfs)(int argc, char **argv) rds->is_default ? "" : " ", rds->dir); } + list_for_each_entry(rif, &inode_flags_list, list) { + pr_verbose(LOG_DEFAULT, " Inode flags (%s): %s\n", + rif->nodatacow ? "NODATACOW" : "", + rif->inode_path); + } ret = btrfs_mkfs_fill_dir(trans, source_dir, root, - &subvols, compression, + &subvols, &inode_flags_list, + compression, compression_level); if (ret) { errno = -ret; @@ -2229,6 +2341,12 @@ int BOX_MAIN(mkfs)(int argc, char **argv) list_del(&head->list); free(head); } + while (!list_empty(&inode_flags_list)) { + rif = list_entry(inode_flags_list.next, + struct rootdir_inode_flags_entry, list); + list_del(&rif->list); + free(rif); + } return !!ret; diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c index 5f4cfb93c..361ac9b72 100644 --- a/mkfs/rootdir.c +++ b/mkfs/rootdir.c @@ -153,6 +153,7 @@ static struct rootdir_path current_path = { static struct btrfs_trans_handle *g_trans = NULL; static struct list_head *g_subvols; +static struct list_head *g_inode_flags_list; static u64 next_subvol_id = BTRFS_FIRST_FREE_OBJECTID; static u64 default_subvol_id; static enum btrfs_compression_type g_compression; @@ -716,12 +717,19 @@ static int add_file_item_extent(struct btrfs_trans_handle *trans, u64 buf_size; char *write_buf; bool do_comp = g_compression != BTRFS_COMPRESS_NONE; + bool datasum = true; ssize_t comp_ret; u64 flags = btrfs_stack_inode_flags(btrfs_inode); if (flags & BTRFS_INODE_NOCOMPRESS) do_comp = false; + if (flags & BTRFS_INODE_NODATACOW || + flags & BTRFS_INODE_NODATASUM) { + datasum = false; + do_comp = false; + } + buf_size = 
do_comp ? BTRFS_MAX_COMPRESSED : MAX_EXTENT_SIZE; to_read = min(file_pos + buf_size, source->size) - file_pos; @@ -852,13 +860,15 @@ static int add_file_item_extent(struct btrfs_trans_handle *trans, return ret; } - for (unsigned int i = 0; i < to_write / sectorsize; i++) { - ret = btrfs_csum_file_block(trans, first_block + (i * sectorsize), + if (datasum) { + for (unsigned int i = 0; i < to_write / sectorsize; i++) { + ret = btrfs_csum_file_block(trans, first_block + (i * sectorsize), BTRFS_EXTENT_CSUM_OBJECTID, root->fs_info->csum_type, write_buf + (i * sectorsize)); - if (ret) - return ret; + if (ret) + return ret; + } } btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -1287,6 +1297,40 @@ static u8 ftype_to_btrfs_type(mode_t ftype) return BTRFS_FT_UNKNOWN; } +static void update_inode_flags(const struct rootdir_inode_flags_entry *rif, + struct btrfs_inode_item *stack_inode) +{ + u64 inode_flags; + + inode_flags = btrfs_stack_inode_flags(stack_inode); + if (rif->nodatacow) { + inode_flags |= BTRFS_INODE_NODATACOW; + + if (S_ISREG(btrfs_stack_inode_mode(stack_inode))) + inode_flags |= BTRFS_INODE_NODATASUM; + } + if (rif->nodatasum) + inode_flags |= BTRFS_INODE_NODATASUM; + + btrfs_set_stack_inode_flags(stack_inode, inode_flags); +} + +static void search_and_update_inode_flags(struct btrfs_inode_item *stack_inode, + const char *full_path) +{ + struct rootdir_inode_flags_entry *rif; + + list_for_each_entry(rif, g_inode_flags_list, list) { + if (strcmp(rif->full_path, full_path) == 0) { + update_inode_flags(rif, stack_inode); + + list_del(&rif->list); + free(rif); + return; + } + } +} + static int ftw_add_subvol(const char *full_path, const struct stat *st, int typeflag, struct FTW *ftwbuf, struct rootdir_subvol *subvol) @@ -1345,6 +1389,7 @@ static int ftw_add_subvol(const char *full_path, const struct stat *st, } stat_to_inode_item(&inode_item, st); + search_and_update_inode_flags(&inode_item, full_path); 
btrfs_set_stack_inode_nlink(&inode_item, 1); ret = update_inode_item(g_trans, new_root, &inode_item, ino); if (ret < 0) { @@ -1364,6 +1409,31 @@ static int ftw_add_subvol(const char *full_path, const struct stat *st, return 0; } +static int read_inode_item(struct btrfs_root *root, struct btrfs_inode_item *inode_item, + u64 ino) +{ + struct btrfs_path path = { 0 }; + struct btrfs_key key; + int ret; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + read_extent_buffer(path.nodes[0], inode_item, + btrfs_item_ptr_offset(path.nodes[0], path.slots[0]), + sizeof(*inode_item)); +out: + btrfs_release_path(&path); + return ret; +} + static int ftw_add_inode(const char *full_path, const struct stat *st, int typeflag, struct FTW *ftwbuf) { @@ -1511,6 +1581,7 @@ static int ftw_add_inode(const char *full_path, const struct stat *st, return ret; } stat_to_inode_item(&inode_item, st); + search_and_update_inode_flags(&inode_item, full_path); ret = btrfs_insert_inode(g_trans, root, ino, &inode_item); if (ret < 0) { @@ -1543,11 +1614,17 @@ static int ftw_add_inode(const char *full_path, const struct stat *st, } /* - * btrfs_add_link() has increased the nlink to 1 in the metadata. - * Also update the value in case we need to update the inode item - * later. + * btrfs_add_link() has increased the nlink, and may even have updated the + * inode flags (inherited from the parent). + * Read out the latest version of inode item. 
*/ - btrfs_set_stack_inode_nlink(&inode_item, 1); + ret = read_inode_item(root, &inode_item, ino); + if (ret < 0) { + errno = -ret; + error("failed to read inode item for subvol %llu inode %llu ('%s'): %m", + btrfs_root_id(root), ino, full_path); + return ret; + } ret = add_xattr_item(g_trans, root, ino, full_path); if (ret < 0) { @@ -1640,6 +1717,7 @@ static int set_default_subvolume(struct btrfs_trans_handle *trans) int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir, struct btrfs_root *root, struct list_head *subvols, + struct list_head *inode_flags_list, enum btrfs_compression_type compression, unsigned int compression_level) { @@ -1686,6 +1764,7 @@ int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir g_trans = trans; g_subvols = subvols; + g_inode_flags_list = inode_flags_list; g_compression = compression; g_compression_level = compression_level; INIT_LIST_HEAD(&current_path.inode_list); diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h index b32fda5bf..f8b959f7a 100644 --- a/mkfs/rootdir.h +++ b/mkfs/rootdir.h @@ -45,8 +45,23 @@ struct rootdir_subvol { bool readonly; }; +/* + * Represent the flags requested for the inode at @full_path. + */ +struct rootdir_inode_flags_entry { + struct list_head list; + /* Fully canonicalized path to the source file. */ + char full_path[PATH_MAX]; + /* Path inside the source directory. 
*/ + char inode_path[PATH_MAX]; + + bool nodatacow; + bool nodatasum; +}; + int btrfs_mkfs_fill_dir(struct btrfs_trans_handle *trans, const char *source_dir, struct btrfs_root *root, struct list_head *subvols, + struct list_head *inode_flags_list, enum btrfs_compression_type compression, unsigned int compression_level); u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size, diff --git a/tests/mkfs-tests/038-inode-flags/test.sh b/tests/mkfs-tests/038-inode-flags/test.sh new file mode 100755 index 000000000..bb2f61c55 --- /dev/null +++ b/tests/mkfs-tests/038-inode-flags/test.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Basic test for mkfs.btrfs --inode-flags --rootdir. Create a dataset and use it as +# rootdir, then apply various inode-flags and verify the flags are properly set. + +source "$TEST_TOP/common" || exit + +check_prereq mkfs.btrfs +check_prereq btrfs +check_global_prereq lsattr + +setup_root_helper +prepare_test_dev + +tmp=$(_mktemp_dir mkfs-rootdir) + +write_file() +{ + local path="$1" + local size="$2" + + run_check dd if=/dev/zero of="$path" bs="$size" count=1 status=noxfer > /dev/null 2>&1 +} + +check_nodatacow() +{ + local path="$1" + + lsattr -d "$TEST_MNT"/"$path" | cut -d' ' -f1 | grep -q C || _fail "missing NODATACOW flag for $path" +} + +write_file "$tmp/file1" 64K +write_file "$tmp/file2" 64K +run_check mkdir -p "$tmp/subv" "$tmp/nocow_subv" "$tmp/nocow_dir/dir2" +write_file "$tmp/subv/file3" 64K +write_file "$tmp/nocow_subv/file4" 64K +write_file "$tmp/nocow_dir/dir2/file5" 64K +write_file "$tmp/nocow_dir/file6" 64K +write_file "$tmp/nocow_file1" 64K + +run_check_mkfs_test_dev --rootdir "$tmp" --inode-flags nodatacow:nocow_subv \ + --subvol nocow_subv --inode-flags nodatacow:nocow_dir \ + --inode-flags nodatacow:nocow_file1 + +run_check $SUDO_HELPER "$TOP/btrfs" check "$TEST_DEV" + +run_check_mount_test_dev +check_nodatacow "nocow_subv" +check_nodatacow "nocow_subv/file4" +check_nodatacow "nocow_dir" +check_nodatacow "nocow_dir/file6" +check_nodatacow 
"nocow_dir/dir2/file5" +check_nodatacow "nocow_file1" +run_check lsattr -R "$TEST_MNT" +run_check_umount_test_dev +run_check rm -rf -- "$tmp"