Home Home > GIT Browse > SLE15-AZURE
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-05-21 11:46:03 +0100
committerFilipe Manana <fdmanana@suse.com>2019-05-21 11:46:03 +0100
commit5aa67027f859d1e60ecf5996188aab0d2fec1b80 (patch)
tree9263210b5e621c939c330899ec7c5c51ce522ff0
parent64bec9fcaa2719cde1311ed871ea1811cf3016a5 (diff)
Btrfs: improve performance on fsync of files with multiple
hardlinks (bsc#1123454). suse-commit: 19a09ba6f064c03c926fabdaaed44ea1955bdda5
-rw-r--r--fs/btrfs/btrfs_inode.h6
-rw-r--r--fs/btrfs/inode.c17
-rw-r--r--fs/btrfs/tree-log.c228
3 files changed, 188 insertions, 63 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7f895b673c4f..19e4eecd67ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -160,12 +160,6 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
- * Track the transaction id of the last transaction used to create a
- * hard link for the inode. This is used by the log tree (fsync).
- */
- u64 last_link_trans;
-
- /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0038dab54f40..b3e279e68eaa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3668,21 +3668,6 @@ cache_index:
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
- /*
- * Similar reasoning for last_link_trans, needs to be set otherwise
- * for a case like the following:
- *
- * mkdir A
- * touch foo
- * ln foo A/bar
- * echo 2 > /proc/sys/vm/drop_caches
- * fsync foo
- * <power failure>
- *
- * Would result in link bar and directory A not existing after the power
- * failure.
- */
- BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -6494,7 +6479,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
- BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
@@ -9202,7 +9186,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
- ei->last_link_trans = 0;
ei->last_log_commit = 0;
ei->delayed_iput_count = 0;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 27c47acde982..5c0fd8b848ec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6008,6 +6008,190 @@ out:
return ret;
}
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ while (true) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u64 last_committed = fs_info->last_trans_committed;
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key search_key;
+ struct inode *inode;
+ int ret = 0;
+
+ btrfs_release_path(path);
+
+ search_key.objectid = found_key.offset;
+ search_key.type = BTRFS_INODE_ITEM_KEY;
+ search_key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (BTRFS_I(inode)->generation > last_committed)
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ search_key.type = BTRFS_INODE_REF_KEY;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return -ENOENT;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != search_key.objectid ||
+ found_key.type != BTRFS_INODE_REF_KEY)
+ return -ENOENT;
+ }
+ return 0;
+}
+
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct dentry *old_parent = NULL;
+ struct super_block *sb = inode->vfs_inode.i_sb;
+ int ret = 0;
+
+ while (true) {
+ if (!parent || d_really_is_negative(parent) ||
+ sb != parent->d_sb)
+ break;
+
+ inode = BTRFS_I(d_inode(parent));
+ if (root != inode->root)
+ break;
+
+ if (inode->generation > fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ if (ret)
+ break;
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ }
+ dput(old_parent);
+
+ return ret;
+}
+
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct dentry *parent,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = inode->root;
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ /*
+ * For a single hard link case, go through a fast path that does not
+ * need to iterate the fs/subvolume tree.
+ */
+ if (inode->vfs_inode.i_nlink < 2)
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ path->slots[0]++;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_key found_key;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != ino ||
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ /*
+ * Don't deal with extended references because they are rare
+ * cases and too complex to deal with (we would need to keep
+ * track of which subitem we are processing for each item in
+ * this loop, etc). So just return some error to fallback to
+ * a transaction commit.
+ */
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = -EMLINK;
+ goto out;
+ }
+
+ /*
+ * Logging ancestors needs to do more searches on the fs/subvol
+ * tree, so it releases the path as needed to avoid deadlocks.
+ * Keep track of the last inode ref key and resume from that key
+ * after logging all new ancestors for the current hard link.
+ */
+ memcpy(&search_key, &found_key, sizeof(search_key));
+
+ ret = log_new_ancestors(trans, root, path, ctx);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ goto again;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -6026,11 +6210,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
- struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
- struct btrfs_inode *orig_inode = inode;
sb = inode->vfs_inode.i_sb;
@@ -6136,54 +6318,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and has a link count of 2.
*/
if (inode->last_unlink_trans > last_committed) {
- ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
}
- /*
- * If a new hard link was added to the inode in the current transaction
- * and its link count is now greater than 1, we need to fallback to a
- * transaction commit, otherwise we can end up not logging all its new
- * parents for all the hard links. Here just from the dentry used to
- * fsync, we can not visit the ancestor inodes for all the other hard
- * links to figure out if any is new, so we fallback to a transaction
- * commit (instead of adding a lot of complexity of scanning a btree,
- * since this scenario is not a common use case).
- */
- if (inode->vfs_inode.i_nlink > 1 &&
- inode->last_link_trans > last_committed) {
- ret = -EMLINK;
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
+ if (ret)
goto end_trans;
- }
-
- while (1) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
-
- inode = BTRFS_I(d_inode(parent));
- if (root != inode->root)
- break;
- if (inode->generation > last_committed) {
- ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
- if (ret)
- goto end_trans;
- }
- if (IS_ROOT(parent))
- break;
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- }
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ ret = log_new_dir_dentries(trans, root, inode, ctx);
else
ret = 0;
end_trans:
- dput(old_parent);
if (ret < 0) {
btrfs_set_log_full_commit(fs_info, trans);
ret = 1;