author     Kernel Build Daemon <kbuild@suse.de>    2019-04-23 07:22:31 +0200
committer  Kernel Build Daemon <kbuild@suse.de>    2019-04-23 07:22:31 +0200
commit     6bab347d311f43292edb4152d21dfe3f5b38cb72 (patch)
tree       4f8b72215ee8724fe826871c53465f2b8b3739d3
parent     3d0b964e2ba9541acf5acdbc81d9eaf3a5a15007 (diff)
parent     dbfe46bbc7bcefe8360003cf4919f6c25e469e9e (diff)

Merge branch 'SLE12-SP3' into openSUSE-42.3
-rw-r--r--  Documentation/kernel-parameters.txt    32
-rw-r--r--  arch/powerpc/kernel/security.c          5
-rw-r--r--  arch/powerpc/kernel/setup_64.c          2
-rw-r--r--  arch/s390/kernel/nospec-branch.c        3
-rw-r--r--  arch/x86/kernel/cpu/bugs.c             11
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c   35
-rw-r--r--  arch/x86/mm/kaiser.c                    4
-rw-r--r--  fs/btrfs/ctree.c                        8
-rw-r--r--  fs/btrfs/ctree.h                       24
-rw-r--r--  fs/btrfs/disk-io.c                      2
-rw-r--r--  fs/btrfs/extent-tree.c                  8
-rw-r--r--  fs/btrfs/qgroup.c                     603
-rw-r--r--  fs/btrfs/qgroup.h                      95
-rw-r--r--  fs/btrfs/relocation.c                 110
-rw-r--r--  fs/btrfs/transaction.c                  1
-rw-r--r--  fs/fuse/dev.c                          12
-rw-r--r--  fs/fuse/dir.c                           2
-rw-r--r--  fs/fuse/file.c                         19
-rw-r--r--  fs/fuse/fuse_i.h                        2
-rw-r--r--  include/linux/cpu.h                    24
-rw-r--r--  include/trace/events/btrfs.h           21
-rw-r--r--  kernel/cpu.c                           15
-rw-r--r--  net/core/net-sysfs.c                    6
23 files changed, 977 insertions(+), 67 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 39f3bafc2f93..1ebb189f1dfa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2320,6 +2320,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
in the "bleeding edge" mini2440 support kernel at
http://repo.or.cz/w/linux-2.6/mini2440.git
+ mitigations=
+ [X86,PPC,S390] Control optional mitigations for CPU
+ vulnerabilities. This is a set of curated,
+ arch-independent options, each of which is an
+ aggregation of existing arch-specific options.
+
+ off
+ Disable all optional CPU mitigations. This
+ improves system performance, but it may also
+ expose users to several CPU vulnerabilities.
+ Equivalent to: nopti [X86,PPC]
+ nospectre_v1 [PPC]
+ nobp=0 [S390]
+ nospectre_v2 [X86,PPC,S390]
+ spectre_v2_user=off [X86]
+ spec_store_bypass_disable=off [X86,PPC]
+ l1tf=off [X86]
+
+ auto (default)
+ Mitigate all CPU vulnerabilities, but leave SMT
+ enabled, even if it's vulnerable. This is for
+ users who don't want to be surprised by SMT
+ getting disabled across kernel upgrades, or who
+ have other ways of avoiding SMT-based attacks.
+ Equivalent to: (default behavior)
+
+ auto,nosmt
+ Mitigate all CPU vulnerabilities, disabling SMT
+ if needed. This is for users who always want to
+ be fully mitigated, even if it means losing SMT.
+ Equivalent to: l1tf=flush,nosmt [X86]
+
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
parameter allows control of the logging verbosity for
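For example, a machine that should always be fully mitigated, accepting SMT
being turned off when a mitigation requires it, would boot with the single
option below (a hedged illustration of the documented values; how the option
reaches the kernel command line, e.g. via the bootloader configuration, is up
to the distribution):

    mitigations=auto,nosmt

Per the equivalence table above, on x86 this behaves like passing
l1tf=flush,nosmt directly, while mitigations=off trades the listed
protections for performance.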
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 4661079969f2..3274ee8de4f4 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -8,6 +8,7 @@
#include <linux/device.h>
#include <linux/seq_buf.h>
#include <linux/debugfs.h>
+#include <linux/cpu.h>
#include <asm/asm-prototypes.h>
#include <asm/code-patching.h>
@@ -52,7 +53,7 @@ void setup_barrier_nospec(void)
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);
- if (!no_nospec)
+ if (!no_nospec && !cpu_mitigations_off())
enable_barrier_nospec(enable);
}
@@ -274,7 +275,7 @@ void setup_stf_barrier(void)
stf_enabled_flush_types = type;
- if (!no_stf_barrier)
+ if (!no_stf_barrier && !cpu_mitigations_off())
stf_barrier_enable(enable);
}
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index dd41a3a1d7ce..7210db3b578c 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -974,7 +974,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable)
enabled_flush_types = types;
- if (!no_rfi_flush)
+ if (!no_rfi_flush && !cpu_mitigations_off())
rfi_flush_enable(enable);
}
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index acc9957a59d9..04f0061b1fcc 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -2,6 +2,7 @@
#include <linux/module.h>
#include <linux/device.h>
#include <asm/facility.h>
+#include <linux/cpu.h>
#include <asm/nospec-branch.h>
static int __init nobp_setup_early(char *str)
@@ -59,7 +60,7 @@ early_param("nospectre_v2", nospectre_v2_setup_early);
void __init nospec_auto_detect(void)
{
- if (test_facility(156)) {
+ if (test_facility(156) || cpu_mitigations_off()) {
/*
* The machine supports etokens.
* Disable expolines and disable nobp.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e159f743a2cd..e9e2f0769233 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -294,7 +294,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
int ret, i;
enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
+ cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
else {
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
@@ -473,7 +474,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
char arg[20];
int ret, i;
- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
+ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
+ cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
} else {
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
@@ -717,6 +719,11 @@ static void __init l1tf_select_mitigation(void)
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
+ if (cpu_mitigations_off())
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 076b7d750980..6c4db0207a35 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -113,22 +113,39 @@ static __initconst const u64 amd_hw_cache_event_ids
};
/*
- * AMD Performance Monitor K7 and later.
+ * AMD Performance Monitor K7 and later, up to and including Family 16h:
*/
static const u64 amd_perfmon_event_map[] =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+};
+
+/*
+ * AMD Performance Monitor Family 17h and later:
+ */
+static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
};
static u64 amd_pmu_event_map(int hw_event)
{
+ if (boot_cpu_data.x86 >= 0x17)
+ return amd_f17h_perfmon_event_map[hw_event];
+
return amd_perfmon_event_map[hw_event];
}
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 36e0d67bb046..5bfaa8c43be8 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/cpu.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -291,7 +292,8 @@ void __init kaiser_check_boottime_disable(void)
goto skip;
}
- if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
+ cpu_mitigations_off())
goto disable;
skip:
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e6c50b7992c0..d1799ccc4423 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -25,6 +25,7 @@
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
+#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -1554,6 +1555,13 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(parent);
btrfs_set_lock_blocking(buf);
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also, we don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fdf79eaa2c88..29516613dfe9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1122,6 +1122,19 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_MULTI_LOG_TASKS 8
#define BTRFS_ROOT_DIRTY 9
+#define BTRFS_ROOT_DEAD_RELOC_TREE 11
+
+/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RB_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
+};
+
/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
@@ -1182,6 +1195,9 @@ struct btrfs_root {
u64 highest_objectid;
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
@@ -1248,6 +1264,14 @@ struct btrfs_root {
u64 nr_ordered_extents;
/*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 01941d53fb05..488ed32612dc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1247,6 +1247,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->orphan_lock);
@@ -1290,6 +1291,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
root->root_key.objectid = objectid;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 688ae254ef67..5f769de98da5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8815,7 +8815,13 @@ skip:
parent = 0;
}
- if (need_account) {
+ /*
+ * The reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted for them at merge time (replace_path),
+ * so we can skip the expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
ret = btrfs_qgroup_trace_subtree(trans, root, next,
generation, level - 1);
if (ret) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e9ad124f266c..49c160ed86f8 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1629,6 +1629,358 @@ static int adjust_slots_upwards(struct btrfs_root *root,
return 0;
}
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen at the highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It acts as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should have been marked during the previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have a good way to pick out only the new file extents,
+ * so we still follow the old method of scanning all file extents
+ * in the leaf.
+ *
+ * This function frees us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->tree_root->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ extent_buffer_get(src_eb);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ struct extent_buffer *eb;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ eb = read_tree_block(fs_info->tree_root, child_bytenr,
+ child_gen, cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now that both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, fs_info,
+ src_path->nodes[dst_level]->start, nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans, fs_info,
+ dst_path->nodes[dst_level]->start, nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, fs_info->tree_root,
+ src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, fs_info->tree_root,
+ dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do a recursive, generation-aware depth-first search to
+ * locate all new tree blocks in a subtree of the reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b) and NN(d) and inform qgroup to
+ * trace the above tree blocks along with their counterparts in the file tree.
+ * During the search, old tree blocks such as OO(c) will be skipped, as the
+ * tree block swap won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = read_tree_block(fs_info->tree_root, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counterpart for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
+ return -EUCLEAN;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* For dst_path */
+ extent_buffer_get(dst_eb);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation-aware depth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *root_eb,
@@ -2049,6 +2401,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
+ u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -2058,6 +2411,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
@@ -2102,6 +2456,8 @@ cleanup:
kfree(record);
}
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
return ret;
}
@@ -2609,7 +2965,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
bool done;
int slot;
@@ -2646,7 +3001,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_header_nritems(path->nodes[0]) - 1);
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
if (!scratch_leaf) {
ret = -ENOMEM;
@@ -2685,7 +3039,6 @@ out:
btrfs_tree_read_unlock_blocking(scratch_leaf);
free_extent_buffer(scratch_leaf);
}
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
if (done && !ret) {
ret = 1;
@@ -2706,6 +3059,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
+ /*
+ * Rescan should only search the commit root; any later difference
+ * will be recorded by qgroup.
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
err = 0;
while (!err && !btrfs_fs_closing(fs_info)) {
@@ -3315,3 +3674,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped-block records of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree that got swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicate but mismatched entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info->tree_root, block->reloc_bytenr,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ return ret;
+}
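
As an aside, the recursion in qgroup_trace_new_subtree_blocks() above prunes
every child whose generation is older than last_snapshot. A minimal standalone
sketch of that pruning rule, using a made-up toy tree rather than btrfs extent
buffers, would be:

    #include <stdio.h>

    /* Toy tree node: 'gen' stands in for btrfs_node_ptr_generation(). */
    struct toy_node {
            unsigned long long gen;
            int nr_children;
            struct toy_node *children[4];
    };

    /*
     * Depth-first walk that skips any subtree whose root generation is older
     * than @last_snapshot -- the same pruning rule the patch applies to
     * reloc-tree blocks, since older blocks cannot have been swapped.
     */
    static void trace_new_blocks(const struct toy_node *node,
                                 unsigned long long last_snapshot)
    {
            int i;

            if (!node || node->gen < last_snapshot)
                    return;

            printf("trace block gen=%llu\n", node->gen);
            for (i = 0; i < node->nr_children; i++)
                    trace_new_blocks(node->children[i], last_snapshot);
    }

    int main(void)
    {
            struct toy_node old_leaf = { .gen = 5 };
            struct toy_node new_leaf = { .gen = 12 };
            struct toy_node root = {
                    .gen = 12, .nr_children = 2,
                    .children = { &old_leaf, &new_leaf },
            };

            trace_new_blocks(&root, 10);    /* only gen >= 10 is traced */
            return 0;
    }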
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index b8a6561922cf..ee4d53ba9c60 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,6 +19,8 @@
#ifndef __BTRFS_QGROUP__
#define __BTRFS_QGROUP__
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -62,6 +64,66 @@ enum btrfs_qgroup_rsv_type {
};
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However, since balance has ensured that both subtrees contain the
+ * same contents and have the same tree structure, such a swap won't
+ * cause a qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance we use delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root blocks that get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't match any record, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do a subtree scan on both subtrees OA and NA, resulting in
+ * 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees and thus no need to trigger a
+ * heavy qgroup subtree rescan for them.
+ */
+
+/*
* Represents how many bytes we have reserved for this qgroup.
*
* Each type should have different reservation behavior.
@@ -75,6 +137,24 @@ struct btrfs_qgroup_rsv {
u64 values[BTRFS_QGROUP_RSV_LAST];
};
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* one struct for each qgroup, organized in fs_info->qgroup_tree.
*/
@@ -292,4 +372,19 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+
#endif /* __BTRFS_QGROUP__ */
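
The comment block above describes a record-at-swap, trace-at-CoW,
drop-at-commit lifecycle keyed by the subvolume-side bytenr. A rough
standalone model of that bookkeeping, using a plain linked list instead of the
per-level rb-trees and entirely made-up names, might look like:

    #include <stdbool.h>
    #include <stdlib.h>

    /* One recorded swap: subvolume-side bytenr -> reloc-side bytenr. */
    struct swapped_rec {
            unsigned long long subvol_bytenr;
            unsigned long long reloc_bytenr;
            struct swapped_rec *next;
    };

    /* Stands in for btrfs_qgroup_swapped_blocks (one list per subvolume root). */
    static struct swapped_rec *records;

    /* Step 1: remember the pair when the subtree roots are swapped. */
    static int record_swap(unsigned long long subvol_bytenr,
                           unsigned long long reloc_bytenr)
    {
            struct swapped_rec *rec = malloc(sizeof(*rec));

            if (!rec)
                    return -1;
            rec->subvol_bytenr = subvol_bytenr;
            rec->reloc_bytenr = reloc_bytenr;
            rec->next = records;
            records = rec;
            return 0;
    }

    /* Step 3: on CoW of @bytenr, consume a matching record, if any. */
    static bool consume_on_cow(unsigned long long bytenr,
                               unsigned long long *reloc_bytenr)
    {
            struct swapped_rec **p = &records;

            while (*p) {
                    if ((*p)->subvol_bytenr == bytenr) {
                            struct swapped_rec *rec = *p;

                            *reloc_bytenr = rec->reloc_bytenr;
                            *p = rec->next;
                            free(rec);
                            return true;    /* caller now traces both subtrees */
                    }
                    p = &(*p)->next;
            }
            return false;   /* not a swapped subtree root, nothing to do */
    }

    /* Step 4: at transaction commit, drop whatever was never CoWed. */
    static void clean_all(void)
    {
            while (records) {
                    struct swapped_rec *rec = records;

                    records = rec->next;
                    free(rec);
            }
    }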
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 04617352250a..d26b0a3d75b6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -174,6 +174,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@@ -1505,15 +1507,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (!root->reloc_root)
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+ !root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /* root->reloc_root will stay until the current relocation finishes */
if (root->fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
- root->reloc_root = NULL;
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@@ -1781,7 +1785,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* errors, a negative error number is returned.
*/
static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
@@ -1921,21 +1925,18 @@ again:
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
- * and tree block numbers, if current trans doesn't free
- * data reloc tree inode.
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have a new
+ * CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_trace_subtree(trans, src, parent,
- btrfs_header_generation(parent),
- btrfs_header_level(parent));
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
if (ret < 0)
break;
- ret = btrfs_qgroup_trace_subtree(trans, dest,
- path->nodes[level],
- btrfs_header_generation(path->nodes[level]),
- btrfs_header_level(path->nodes[level]));
- if (ret < 0)
- break;
-
/*
* swap blocks in fs tree and reloc tree.
*/
@@ -2160,6 +2161,58 @@ static int find_next_key(struct btrfs_path *path, int level,
}
/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ reloc_root_item->drop_level = 0;
+ btrfs_set_root_refs(reloc_root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_fs_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
+ int ret2;
+
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ btrfs_put_fs_root(root);
+ }
+ return ret;
+}
+
+/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
@@ -2247,7 +2300,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
- ret = replace_path(trans, root, reloc_root, path,
+ ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0) {
@@ -2297,13 +2350,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (err == 0) {
- memset(&root_item->drop_progress, 0,
- sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
- btrfs_set_root_refs(root_item, 0);
- btrfs_update_reloc_root(trans, root);
- }
+ if (err == 0)
+ insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans, root);
@@ -2458,14 +2506,6 @@ again:
last_snap = btrfs_root_rtransid(&reloc_root->root_item);
otransid = btrfs_root_otransid(&reloc_root->root_item);
objectid = reloc_root->root_key.offset;
-
- ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
- if (ret < 0) {
- if (list_empty(&reloc_root->root_list))
- list_add_tail(&reloc_root->root_list,
- &reloc_roots);
- goto out;
- }
}
if (found) {
@@ -4189,6 +4229,9 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans, rc->extent_root);
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
btrfs_free_path(path);
@@ -4283,6 +4326,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks,
@@ -4557,6 +4601,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans, rc->extent_root);
+
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7ae81b05a708..68f08d4a8b30 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -140,6 +140,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
if (is_fstree(root->objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 341196338e48..72fee351bd0e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -145,9 +145,13 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
static void fuse_drop_waiting(struct fuse_conn *fc)
{
- if (fc->connected) {
- atomic_dec(&fc->num_waiting);
- } else if (atomic_dec_and_test(&fc->num_waiting)) {
+ /*
+ * A lockless check of fc->connected is okay, because atomic_dec_and_test()
+ * provides a memory barrier matched with the one in fuse_wait_aborted()
+ * to ensure no wake-up is missed.
+ */
+ if (atomic_dec_and_test(&fc->num_waiting) &&
+ !READ_ONCE(fc->connected)) {
/* wake up aborters */
wake_up_all(&fc->blocked_waitq);
}
@@ -2222,6 +2226,8 @@ EXPORT_SYMBOL_GPL(fuse_abort_conn);
void fuse_wait_aborted(struct fuse_conn *fc)
{
+ /* matches implicit memory barrier in fuse_drop_waiting() */
+ smp_mb();
wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
}
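
The barrier pairing relied on above (the implicit full barrier in
atomic_dec_and_test() against the explicit smp_mb() before the waiter-count
check) can be modeled in userspace with C11 sequentially consistent atomics;
the names below are illustrative only, not the FUSE code:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int num_waiting;
    static atomic_bool connected = true;

    /* Waiter side, mirroring fuse_drop_waiting(). */
    static void drop_waiting(void)
    {
            /*
             * seq_cst read-modify-write: either this thread sees
             * connected == false and performs the wake-up, or the
             * aborter's later read of num_waiting sees the decrement,
             * so no wake-up can be missed.
             */
            if (atomic_fetch_sub(&num_waiting, 1) == 1 &&
                !atomic_load(&connected)) {
                    /* wake up aborters, e.g. signal a condition variable */
            }
    }

    /* Aborter side, mirroring fuse_abort_conn() + fuse_wait_aborted(). */
    static void wait_aborted(void)
    {
            atomic_store(&connected, false);
            /* seq_cst ordering stands in for the explicit smp_mb() above */
            while (atomic_load(&num_waiting) != 0) {
                    /* the kernel sleeps on blocked_waitq here instead */
            }
    }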
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 328f9b45701a..fc5c6a9a37d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1403,7 +1403,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
static int fuse_dir_release(struct inode *inode, struct file *file)
{
- fuse_release_common(file, FUSE_RELEASEDIR);
+ fuse_release_common(file, true);
return 0;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1524f9bc8ed0..7346b599cbc5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,12 +86,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
iput(req->misc.release.inode);
}
-static void fuse_file_put(struct fuse_file *ff, bool sync)
+static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
if (atomic_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (ff->fc->no_open) {
+ if (ff->fc->no_open && !isdir) {
/*
* Drop the release request when client does not
* implement 'open'
@@ -244,10 +244,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
req->in.args[0].value = inarg;
}
-void fuse_release_common(struct file *file, int opcode)
+void fuse_release_common(struct file *file, bool isdir)
{
struct fuse_file *ff;
struct fuse_req *req;
+ int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
ff = file->private_data;
if (unlikely(!ff))
@@ -274,7 +275,7 @@ void fuse_release_common(struct file *file, int opcode)
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
- fuse_file_put(ff, ff->fc->destroy_req != NULL);
+ fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
@@ -290,7 +291,7 @@ static int fuse_release(struct inode *inode, struct file *file)
if (fc->writeback_cache)
write_inode_now(inode, 1);
- fuse_release_common(file, FUSE_RELEASE);
+ fuse_release_common(file, false);
/* return value is ignored by VFS */
return 0;
@@ -817,7 +818,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
page_cache_release(page);
}
if (req->ff)
- fuse_file_put(req->ff, false);
+ fuse_file_put(req->ff, false, false);
}
static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1467,7 +1468,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
__free_page(req->pages[i]);
if (req->ff)
- fuse_file_put(req->ff, false);
+ fuse_file_put(req->ff, false, false);
}
static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1624,7 +1625,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
ff = __fuse_write_file_get(fc, fi);
err = fuse_flush_times(inode, ff);
if (ff)
- fuse_file_put(ff, 0);
+ fuse_file_put(ff, false, false);
return err;
}
@@ -1937,7 +1938,7 @@ static int fuse_writepages(struct address_space *mapping,
err = 0;
}
if (data.ff)
- fuse_file_put(data.ff, false);
+ fuse_file_put(data.ff, false, false);
kfree(data.orig_pages);
out:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 576f84b0e9b5..ebe2decbd187 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -732,7 +732,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags);
/**
* Send RELEASE or RELEASEDIR request
*/
-void fuse_release_common(struct file *file, int opcode);
+void fuse_release_common(struct file *file, bool isdir);
/**
* Send FSYNC or FSYNCDIR request
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 8c1e3ec85c74..01530b4c557d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -314,4 +314,28 @@ static inline void cpu_smt_check_topology_early(void) { }
static inline void cpu_smt_check_topology(void) { }
#endif
+/*
+ * These are used for a global "mitigations=" cmdline option for toggling
+ * optional CPU mitigations.
+ */
+enum cpu_mitigations {
+ CPU_MITIGATIONS_OFF,
+ CPU_MITIGATIONS_AUTO,
+ CPU_MITIGATIONS_AUTO_NOSMT,
+};
+
+extern enum cpu_mitigations cpu_mitigations;
+
+/* mitigations=off */
+static inline bool cpu_mitigations_off(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_OFF;
+}
+
+/* mitigations=auto,nosmt */
+static inline bool cpu_mitigations_auto_nosmt(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
+}
+
#endif /* _LINUX_CPU_H_ */
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index c4f2796298d3..be32b3b99736 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1439,6 +1439,27 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
TP_ARGS(fs_info, rec)
);
+TRACE_EVENT(qgroup_num_dirty_extents,
+
+ TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid,
+ u64 num_dirty_extents),
+
+ TP_ARGS(fs_info, transid, num_dirty_extents),
+
+ TP_STRUCT__entry_btrfs(
+ __field( u64, transid )
+ __field( u64, num_dirty_extents )
+ ),
+
+ TP_fast_assign_btrfs(fs_info,
+ __entry->transid = transid;
+ __entry->num_dirty_extents = num_dirty_extents;
+ ),
+
+ TP_printk_btrfs("transid=%llu num_dirty_extents=%llu",
+ __entry->transid, __entry->num_dirty_extents)
+);
+
TRACE_EVENT(btrfs_qgroup_account_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f7452fb803b7..2d1bb9aca618 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1206,3 +1206,18 @@ void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ if (!strcmp(arg, "off"))
+ cpu_mitigations = CPU_MITIGATIONS_OFF;
+ else if (!strcmp(arg, "auto"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO;
+ else if (!strcmp(arg, "auto,nosmt"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+
+ return 0;
+}
+early_param("mitigations", mitigations_parse_cmdline);
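
For reference, the consumption pattern the arch hunks above share can be
condensed into a small standalone sketch; fancy_select_mitigation() is a
hypothetical hook, not a real kernel function, and the helpers are copied from
the include/linux/cpu.h hunk:

    #include <stdbool.h>

    enum cpu_mitigations {
            CPU_MITIGATIONS_OFF,
            CPU_MITIGATIONS_AUTO,
            CPU_MITIGATIONS_AUTO_NOSMT,
    };

    /* In the kernel this is set once by mitigations_parse_cmdline() above. */
    static enum cpu_mitigations cpu_mitigations = CPU_MITIGATIONS_AUTO;

    static bool cpu_mitigations_off(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_OFF;
    }

    static bool cpu_mitigations_auto_nosmt(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
    }

    /*
     * Hypothetical per-arch selection hook following the same shape as
     * l1tf_select_mitigation() in the x86 hunk: consult the global switch
     * first, then fall back to the mitigation's own default.
     */
    enum fancy_mitigation { FANCY_OFF, FANCY_FLUSH, FANCY_FLUSH_NOSMT };

    static enum fancy_mitigation fancy_select_mitigation(void)
    {
            if (cpu_mitigations_off())
                    return FANCY_OFF;
            if (cpu_mitigations_auto_nosmt())
                    return FANCY_FLUSH_NOSMT;
            return FANCY_FLUSH;     /* default: mitigate but keep SMT */
    }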
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 37488e5130fd..edafd949ec64 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -898,6 +898,8 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
if (error)
return error;
+ dev_hold(queue->dev);
+
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error) {
@@ -907,7 +909,6 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
}
kobject_uevent(kobj, KOBJ_ADD);
- dev_hold(queue->dev);
return error;
}
@@ -1289,6 +1290,8 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
if (error)
return error;
+ dev_hold(queue->dev);
+
#ifdef CONFIG_BQL
error = sysfs_create_group(kobj, &dql_group);
if (error) {
@@ -1298,7 +1301,6 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
#endif
kobject_uevent(kobj, KOBJ_ADD);
- dev_hold(queue->dev);
return 0;
}