Home Home > GIT Browse > SLE12-SP4
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-05-21 11:46:03 +0100
committerFilipe Manana <fdmanana@suse.com>2019-05-21 11:46:03 +0100
commit19a09ba6f064c03c926fabdaaed44ea1955bdda5 (patch)
tree6a924a577421f7675695bfd019483dce4938cee1
parent9e6fc7e1ea30fb7f03a76a818b884770e8d4d87b (diff)
Btrfs: improve performance on fsync of files with multiple
hardlinks (bsc#1123454).
-rw-r--r--patches.suse/btrfs-improve-performance-on-fsync-of-files-with-mul.patch362
-rw-r--r--series.conf1
2 files changed, 363 insertions, 0 deletions
diff --git a/patches.suse/btrfs-improve-performance-on-fsync-of-files-with-mul.patch b/patches.suse/btrfs-improve-performance-on-fsync-of-files-with-mul.patch
new file mode 100644
index 0000000000..9f8bac3ea2
--- /dev/null
+++ b/patches.suse/btrfs-improve-performance-on-fsync-of-files-with-mul.patch
@@ -0,0 +1,362 @@
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 17 Apr 2019 11:31:06 +0100
+Git-commit: b8aa330d2acb122563be87c42d82c5c8649cf658
+Patch-mainline: 5.2-rc1
+References: bsc#1123454
+Subject: [PATCH] Btrfs: improve performance on fsync of files with
+ multiple hardlinks
+
+Commit 41bd6067692382 ("Btrfs: fix fsync of files with multiple hard links
+in new directories") introduced a path that makes fsync fallback to a full
+transaction commit in order to avoid losing hard links and new ancestors
+of the fsynced inode. That path is triggered only when the inode has more
+than one hard link and either has a new hard link created in the current
+transaction or the inode was evicted and reloaded in the current
+transaction.
+
+That path ends up getting triggered very often (hundreds of times) during
+the course of pgbench benchmarks, resulting in performance drops of about
+20%.
+
+This change restores the performance by not triggering the full transaction
+commit in those cases, and instead iterate the fs/subvolume tree in search
+of all possible new ancestors, for all hard links, to log them.
+
+Reported-by: Zhao Yuhu <zyuhu@suse.com>
+Tested-by: James Wang <jnwang@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/btrfs_inode.h | 6 --
+ fs/btrfs/inode.c | 17 ---
+ fs/btrfs/tree-log.c | 228 +++++++++++++++++++++++++++++++++--------
+ 3 files changed, 188 insertions(+), 63 deletions(-)
+
+diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
+index 7f895b673c4f..19e4eecd67ce 100644
+--- a/fs/btrfs/btrfs_inode.h
++++ b/fs/btrfs/btrfs_inode.h
+@@ -159,12 +159,6 @@ struct btrfs_inode {
+ */
+ u64 last_unlink_trans;
+
+- /*
+- * Track the transaction id of the last transaction used to create a
+- * hard link for the inode. This is used by the log tree (fsync).
+- */
+- u64 last_link_trans;
+-
+ /*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 0038dab54f40..b3e279e68eaa 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3668,21 +3668,6 @@ static int btrfs_read_locked_inode(struct inode *inode,
+ * inode is not a directory, logging its parent unnecessarily.
+ */
+ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+- /*
+- * Similar reasoning for last_link_trans, needs to be set otherwise
+- * for a case like the following:
+- *
+- * mkdir A
+- * touch foo
+- * ln foo A/bar
+- * echo 2 > /proc/sys/vm/drop_caches
+- * fsync foo
+- * <power failure>
+- *
+- * Would result in link bar and directory A not existing after the power
+- * failure.
+- */
+- BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
+
+ path->slots[0]++;
+ if (inode->i_nlink != 1 ||
+@@ -6494,7 +6479,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ if (err)
+ goto fail;
+ }
+- BTRFS_I(inode)->last_link_trans = trans->transid;
+ d_instantiate(dentry, inode);
+ ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
+ true, NULL);
+@@ -9202,7 +9186,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
+ ei->index_cnt = (u64)-1;
+ ei->dir_index = 0;
+ ei->last_unlink_trans = 0;
+- ei->last_link_trans = 0;
+ ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 27c47acde982..5c0fd8b848ec 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -6008,6 +6008,190 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ return ret;
+ }
+
++static int log_new_ancestors(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_path *path,
++ struct btrfs_log_ctx *ctx)
++{
++ struct btrfs_key found_key;
++
++ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
++
++ while (true) {
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ const u64 last_committed = fs_info->last_trans_committed;
++ struct extent_buffer *leaf = path->nodes[0];
++ int slot = path->slots[0];
++ struct btrfs_key search_key;
++ struct inode *inode;
++ int ret = 0;
++
++ btrfs_release_path(path);
++
++ search_key.objectid = found_key.offset;
++ search_key.type = BTRFS_INODE_ITEM_KEY;
++ search_key.offset = 0;
++ inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
++ if (IS_ERR(inode))
++ return PTR_ERR(inode);
++
++ if (BTRFS_I(inode)->generation > last_committed)
++ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
++ LOG_INODE_EXISTS,
++ 0, LLONG_MAX, ctx);
++ iput(inode);
++ if (ret)
++ return ret;
++
++ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
++ break;
++
++ search_key.type = BTRFS_INODE_REF_KEY;
++ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
++ if (ret < 0)
++ return ret;
++
++ leaf = path->nodes[0];
++ slot = path->slots[0];
++ if (slot >= btrfs_header_nritems(leaf)) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0)
++ return ret;
++ else if (ret > 0)
++ return -ENOENT;
++ leaf = path->nodes[0];
++ slot = path->slots[0];
++ }
++
++ btrfs_item_key_to_cpu(leaf, &found_key, slot);
++ if (found_key.objectid != search_key.objectid ||
++ found_key.type != BTRFS_INODE_REF_KEY)
++ return -ENOENT;
++ }
++ return 0;
++}
++
++static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
++ struct btrfs_inode *inode,
++ struct dentry *parent,
++ struct btrfs_log_ctx *ctx)
++{
++ struct btrfs_root *root = inode->root;
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct dentry *old_parent = NULL;
++ struct super_block *sb = inode->vfs_inode.i_sb;
++ int ret = 0;
++
++ while (true) {
++ if (!parent || d_really_is_negative(parent) ||
++ sb != parent->d_sb)
++ break;
++
++ inode = BTRFS_I(d_inode(parent));
++ if (root != inode->root)
++ break;
++
++ if (inode->generation > fs_info->last_trans_committed) {
++ ret = btrfs_log_inode(trans, root, inode,
++ LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
++ if (ret)
++ break;
++ }
++ if (IS_ROOT(parent))
++ break;
++
++ parent = dget_parent(parent);
++ dput(old_parent);
++ old_parent = parent;
++ }
++ dput(old_parent);
++
++ return ret;
++}
++
++static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
++ struct btrfs_inode *inode,
++ struct dentry *parent,
++ struct btrfs_log_ctx *ctx)
++{
++ struct btrfs_root *root = inode->root;
++ const u64 ino = btrfs_ino(inode);
++ struct btrfs_path *path;
++ struct btrfs_key search_key;
++ int ret;
++
++ /*
++ * For a single hard link case, go through a fast path that does not
++ * need to iterate the fs/subvolume tree.
++ */
++ if (inode->vfs_inode.i_nlink < 2)
++ return log_new_ancestors_fast(trans, inode, parent, ctx);
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ search_key.objectid = ino;
++ search_key.type = BTRFS_INODE_REF_KEY;
++ search_key.offset = 0;
++again:
++ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
++ if (ret < 0)
++ goto out;
++ if (ret == 0)
++ path->slots[0]++;
++
++ while (true) {
++ struct extent_buffer *leaf = path->nodes[0];
++ int slot = path->slots[0];
++ struct btrfs_key found_key;
++
++ if (slot >= btrfs_header_nritems(leaf)) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0)
++ goto out;
++ else if (ret > 0)
++ break;
++ continue;
++ }
++
++ btrfs_item_key_to_cpu(leaf, &found_key, slot);
++ if (found_key.objectid != ino ||
++ found_key.type > BTRFS_INODE_EXTREF_KEY)
++ break;
++
++ /*
++ * Don't deal with extended references because they are rare
++ * cases and too complex to deal with (we would need to keep
++ * track of which subitem we are processing for each item in
++ * this loop, etc). So just return some error to fallback to
++ * a transaction commit.
++ */
++ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
++ ret = -EMLINK;
++ goto out;
++ }
++
++ /*
++ * Logging ancestors needs to do more searches on the fs/subvol
++ * tree, so it releases the path as needed to avoid deadlocks.
++ * Keep track of the last inode ref key and resume from that key
++ * after logging all new ancestors for the current hard link.
++ */
++ memcpy(&search_key, &found_key, sizeof(search_key));
++
++ ret = log_new_ancestors(trans, root, path, ctx);
++ if (ret)
++ goto out;
++ btrfs_release_path(path);
++ goto again;
++ }
++ ret = 0;
++out:
++ btrfs_free_path(path);
++ return ret;
++}
++
+ /*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+@@ -6026,11 +6210,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
+ struct super_block *sb;
+- struct dentry *old_parent = NULL;
+ int ret = 0;
+ u64 last_committed = fs_info->last_trans_committed;
+ bool log_dentries = false;
+- struct btrfs_inode *orig_inode = inode;
+
+ sb = inode->vfs_inode.i_sb;
+
+@@ -6136,54 +6318,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ * and has a link count of 2.
+ */
+ if (inode->last_unlink_trans > last_committed) {
+- ret = btrfs_log_all_parents(trans, orig_inode, ctx);
++ ret = btrfs_log_all_parents(trans, inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
+- /*
+- * If a new hard link was added to the inode in the current transaction
+- * and its link count is now greater than 1, we need to fallback to a
+- * transaction commit, otherwise we can end up not logging all its new
+- * parents for all the hard links. Here just from the dentry used to
+- * fsync, we can not visit the ancestor inodes for all the other hard
+- * links to figure out if any is new, so we fallback to a transaction
+- * commit (instead of adding a lot of complexity of scanning a btree,
+- * since this scenario is not a common use case).
+- */
+- if (inode->vfs_inode.i_nlink > 1 &&
+- inode->last_link_trans > last_committed) {
+- ret = -EMLINK;
++ ret = log_all_new_ancestors(trans, inode, parent, ctx);
++ if (ret)
+ goto end_trans;
+- }
+-
+- while (1) {
+- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
+- break;
+-
+- inode = BTRFS_I(d_inode(parent));
+- if (root != inode->root)
+- break;
+
+- if (inode->generation > last_committed) {
+- ret = btrfs_log_inode(trans, root, inode,
+- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+- if (ret)
+- goto end_trans;
+- }
+- if (IS_ROOT(parent))
+- break;
+-
+- parent = dget_parent(parent);
+- dput(old_parent);
+- old_parent = parent;
+- }
+ if (log_dentries)
+- ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
++ ret = log_new_dir_dentries(trans, root, inode, ctx);
+ else
+ ret = 0;
+ end_trans:
+- dput(old_parent);
+ if (ret < 0) {
+ btrfs_set_log_full_commit(fs_info, trans);
+ ret = 1;
+--
+2.19.0
+
diff --git a/series.conf b/series.conf
index 731e2bdfc2..68cacf60aa 100644
--- a/series.conf
+++ b/series.conf
@@ -21905,6 +21905,7 @@
patches.suse/0008-btrfs-extent-tree-Use-btrfs_ref-to-refactor-btrfs_fr.patch
patches.suse/0009-btrfs-qgroup-Don-t-scan-leaf-if-we-re-modifying-relo.patch
patches.suse/btrfs-send-flush-dellaloc-in-order-to-avoid-data-los.patch
+ patches.suse/btrfs-improve-performance-on-fsync-of-files-with-mul.patch
patches.drivers/mmc-core-fix-possible-use-after-free-of-host.patch
patches.drivers/phy-sun4i-usb-Make-sure-to-disable-PHY0-passby-for-p.patch
patches.drivers/stm-class-Fix-channel-free-in-stm-output-free-path.patch