Home Home > GIT Browse > SLE11-SP4
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2018-06-05 16:17:56 +0200
committerJiri Kosina <jkosina@suse.cz>2018-06-05 16:17:56 +0200
commit93b0e34053ea44f25445271c1ee6af84e341e1bd (patch)
treee1b690d857e0c6e1390925a131ee0c7e5c72c146
parentf27dc40ebb1f58ea6490b4259f41cbb201df3c7d (diff)
parent45ad29f7425a445f71833d45c089ede2e61828d4 (diff)
Merge remote-tracking branch 'origin/users/jeffm/SLE11-SP4/for-next' into SLE11-SP4
Pull btrfs fixes from Jeff Mahoney suse-commit: d343b0a947deafd25fb32616b9298544075f5153
-rw-r--r--arch/powerpc/mm/stab.c298
-rw-r--r--fs/btrfs/Kconfig9
-rw-r--r--fs/btrfs/async-thread.c25
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/backref.c93
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/btrfs_inode.h25
-rw-r--r--fs/btrfs/check-integrity.c15
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c152
-rw-r--r--fs/btrfs/ctree.h87
-rw-r--r--fs/btrfs/dev-replace.c69
-rw-r--r--fs/btrfs/disk-io.c176
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c767
-rw-r--r--fs/btrfs/extent_io.c11
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c368
-rw-r--r--fs/btrfs/free-space-cache.c70
-rw-r--r--fs/btrfs/inode-item.c9
-rw-r--r--fs/btrfs/inode-map.c14
-rw-r--r--fs/btrfs/inode.c83
-rw-r--r--fs/btrfs/ioctl.c37
-rw-r--r--fs/btrfs/ordered-data.c77
-rw-r--r--fs/btrfs/ordered-data.h11
-rw-r--r--fs/btrfs/print-tree.c7
-rw-r--r--fs/btrfs/qgroup.c57
-rw-r--r--fs/btrfs/reada.c2
-rw-r--r--fs/btrfs/relocation.c70
-rw-r--r--fs/btrfs/scrub.c40
-rw-r--r--fs/btrfs/send.c63
-rw-r--r--fs/btrfs/super.c19
-rw-r--r--fs/btrfs/transaction.c113
-rw-r--r--fs/btrfs/transaction.h16
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c1215
-rw-r--r--fs/btrfs/tree-log.h22
-rw-r--r--fs/btrfs/volumes.c1104
-rw-r--r--fs/btrfs/volumes.h134
40 files changed, 3703 insertions, 1574 deletions
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
deleted file mode 100644
index c443fdcf1d16..000000000000
--- a/arch/powerpc/mm/stab.c
+++ /dev/null
@@ -1,298 +0,0 @@
-#ifndef CONFIG_BIGMEM
-/*
- * PowerPC64 Segment Translation Support.
- *
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- *
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/memblock.h>
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-#include <asm/prom.h>
-#include <asm/abs_addr.h>
-#include <asm/firmware.h>
-#include <asm/iseries/hv_call.h>
-
-struct stab_entry {
- unsigned long esid_data;
- unsigned long vsid_data;
-};
-
-#define NR_STAB_CACHE_ENTRIES 8
-static DEFINE_PER_CPU(long, stab_cache_ptr);
-static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
-
-/*
- * Create a segment table entry for the given esid/vsid pair.
- */
-static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
-{
- unsigned long esid_data, vsid_data;
- unsigned long entry, group, old_esid, castout_entry, i;
- unsigned int global_entry;
- struct stab_entry *ste, *castout_ste;
- unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET;
-
- vsid_data = vsid << STE_VSID_SHIFT;
- esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
- if (! kernel_segment)
- esid_data |= STE_ESID_KS;
-
- /* Search the primary group first. */
- global_entry = (esid & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
-
- /* Find an empty entry, if one exists. */
- for (group = 0; group < 2; group++) {
- for (entry = 0; entry < 8; entry++, ste++) {
- if (!(ste->esid_data & STE_ESID_V)) {
- ste->vsid_data = vsid_data;
- eieio();
- ste->esid_data = esid_data;
- return (global_entry | entry);
- }
- }
- /* Now search the secondary group. */
- global_entry = ((~esid) & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
- }
-
- /*
- * Could not find empty entry, pick one with a round robin selection.
- * Search all entries in the two groups.
- */
- castout_entry = get_paca()->stab_rr;
- for (i = 0; i < 16; i++) {
- if (castout_entry < 8) {
- global_entry = (esid & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
- castout_ste = ste + castout_entry;
- } else {
- global_entry = ((~esid) & 0x1f) << 3;
- ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
- castout_ste = ste + (castout_entry - 8);
- }
-
- /* Dont cast out the first kernel segment */
- if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET)
- break;
-
- castout_entry = (castout_entry + 1) & 0xf;
- }
-
- get_paca()->stab_rr = (castout_entry + 1) & 0xf;
-
- /* Modify the old entry to the new value. */
-
- /* Force previous translations to complete. DRENG */
- asm volatile("isync" : : : "memory");
-
- old_esid = castout_ste->esid_data >> SID_SHIFT;
- castout_ste->esid_data = 0; /* Invalidate old entry */
-
- asm volatile("sync" : : : "memory"); /* Order update */
-
- castout_ste->vsid_data = vsid_data;
- eieio(); /* Order update */
- castout_ste->esid_data = esid_data;
-
- asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
- /* Ensure completion of slbie */
- asm volatile("sync" : : : "memory");
-
- return (global_entry | (castout_entry & 0x7));
-}
-
-/*
- * Allocate a segment table entry for the given ea and mm
- */
-static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
-{
- unsigned long vsid;
- unsigned char stab_entry;
- unsigned long offset;
-
- /* Kernel or user address? */
- if (is_kernel_addr(ea)) {
- vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M);
- } else {
- if ((ea >= TASK_SIZE_USER64) || (! mm))
- return 1;
-
- vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M);
- }
-
- stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
-
- if (!is_kernel_addr(ea)) {
- offset = __get_cpu_var(stab_cache_ptr);
- if (offset < NR_STAB_CACHE_ENTRIES)
- __get_cpu_var(stab_cache[offset++]) = stab_entry;
- else
- offset = NR_STAB_CACHE_ENTRIES+1;
- __get_cpu_var(stab_cache_ptr) = offset;
-
- /* Order update */
- asm volatile("sync":::"memory");
- }
-
- return 0;
-}
-
-int ste_allocate(unsigned long ea)
-{
- return __ste_allocate(ea, current->mm);
-}
-
-/*
- * Do the segment table work for a context switch: flush all user
- * entries from the table, then preload some probably useful entries
- * for the new task
- */
-void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
-{
- struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
- struct stab_entry *ste;
- unsigned long offset;
- unsigned long pc = KSTK_EIP(tsk);
- unsigned long stack = KSTK_ESP(tsk);
- unsigned long unmapped_base;
-
- /* Force previous translations to complete. DRENG */
- asm volatile("isync" : : : "memory");
-
- /*
- * We need interrupts hard-disabled here, not just soft-disabled,
- * so that a PMU interrupt can't occur, which might try to access
- * user memory (to get a stack trace) and possible cause an STAB miss
- * which would update the stab_cache/stab_cache_ptr per-cpu variables.
- */
- hard_irq_disable();
-
- offset = __get_cpu_var(stab_cache_ptr);
- if (offset <= NR_STAB_CACHE_ENTRIES) {
- int i;
-
- for (i = 0; i < offset; i++) {
- ste = stab + __get_cpu_var(stab_cache[i]);
- ste->esid_data = 0; /* invalidate entry */
- }
- } else {
- unsigned long entry;
-
- /* Invalidate all entries. */
- ste = stab;
-
- /* Never flush the first entry. */
- ste += 1;
- for (entry = 1;
- entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
- entry++, ste++) {
- unsigned long ea;
- ea = ste->esid_data & ESID_MASK;
- if (!is_kernel_addr(ea)) {
- ste->esid_data = 0;
- }
- }
- }
-
- asm volatile("sync; slbia; sync":::"memory");
-
- __get_cpu_var(stab_cache_ptr) = 0;
-
- /* Now preload some entries for the new task */
- if (test_tsk_thread_flag(tsk, TIF_32BIT))
- unmapped_base = TASK_UNMAPPED_BASE_USER32;
- else
- unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
- __ste_allocate(pc, mm);
-
- if (GET_ESID(pc) == GET_ESID(stack))
- return;
-
- __ste_allocate(stack, mm);
-
- if ((GET_ESID(pc) == GET_ESID(unmapped_base))
- || (GET_ESID(stack) == GET_ESID(unmapped_base)))
- return;
-
- __ste_allocate(unmapped_base, mm);
-
- /* Order update */
- asm volatile("sync" : : : "memory");
-}
-
-/*
- * Allocate segment tables for secondary CPUs. These must all go in
- * the first (bolted) segment, so that do_stab_bolted won't get a
- * recursive segment miss on the segment table itself.
- */
-void __init stabs_alloc(void)
-{
- int cpu;
-
- if (mmu_has_feature(MMU_FTR_SLB))
- return;
-
- for_each_possible_cpu(cpu) {
- unsigned long newstab;
-
- if (cpu == 0)
- continue; /* stab for CPU 0 is statically allocated */
-
- newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
- 1<<SID_SHIFT);
- newstab = (unsigned long)__va(newstab);
-
- memset((void *)newstab, 0, HW_PAGE_SIZE);
-
- paca[cpu].stab_addr = newstab;
- paca[cpu].stab_real = virt_to_abs(newstab);
- printk(KERN_INFO "Segment table for CPU %d at 0x%llx "
- "virtual, 0x%llx absolute\n",
- cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
- }
-}
-
-/*
- * Build an entry for the base kernel segment and put it into
- * the segment table or SLB. All other segment table or SLB
- * entries are faulted in.
- */
-void stab_initialize(unsigned long stab)
-{
- unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M);
- unsigned long stabreal;
-
- asm volatile("isync; slbia; isync":::"memory");
- make_ste(stab, GET_ESID(PAGE_OFFSET), vsid);
-
- /* Order update */
- asm volatile("sync":::"memory");
-
- /* Set ASR */
- stabreal = get_paca()->stab_real | 0x1ul;
-
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES)) {
- HvCall1(HvCallBaseSetASR, stabreal);
- return;
- }
-#endif /* CONFIG_PPC_ISERIES */
-
- mtspr(SPRN_ASR, stabreal);
-}
-#endif
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..99e3d84e6b0b 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -50,3 +50,12 @@ config BTRFS_FS_CHECK_INTEGRITY
In most cases, unless you are a btrfs developer who needs
to verify the integrity of (super)-block write requests
during the run of a regression test, say N
+
+config BTRFS_ASSERT
+ bool "Btrfs assert support"
+ depends on BTRFS_FS
+ help
+ Enable run-time assertion checking. This will result in panics if
+ any of the assertions trip. This is meant for btrfs developers only.
+
+ If unsure, say N.
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5e0321b2605b..e4e78aaffe01 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -107,7 +107,8 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
worker->idle = 1;
/* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list)) {
+ if (!list_empty(&worker->worker_list) &&
+ !worker->workers->stopping) {
list_move(&worker->worker_list,
&worker->workers->idle_list);
}
@@ -127,7 +128,8 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 0;
- if (!list_empty(&worker->worker_list)) {
+ if (!list_empty(&worker->worker_list) &&
+ !worker->workers->stopping) {
list_move_tail(&worker->worker_list,
&worker->workers->worker_list);
}
@@ -417,6 +419,7 @@ void btrfs_stop_workers(struct btrfs_workers *workers)
unsigned long flags;
spin_lock_irqsave(&workers->lock, flags);
+ workers->stopping = 1;
list_splice_init(&workers->idle_list, &workers->worker_list);
while (!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
@@ -460,6 +463,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
workers->ordered = 0;
workers->atomic_start_pending = 0;
workers->atomic_worker_start = async_helper;
+ workers->stopping = 0;
}
/*
@@ -486,15 +490,19 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
atomic_set(&worker->num_pending, 0);
atomic_set(&worker->refs, 1);
worker->workers = workers;
- worker->task = kthread_run(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + 1);
+ worker->task = kthread_create(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+ workers->num_workers + 1);
if (IS_ERR(worker->task)) {
ret = PTR_ERR(worker->task);
- kfree(worker);
goto fail;
}
+
spin_lock_irqsave(&workers->lock, flags);
+ if (workers->stopping) {
+ spin_unlock_irq(&workers->lock);
+ goto fail_kthread;
+ }
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
@@ -502,8 +510,13 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irqrestore(&workers->lock, flags);
+ wake_up_process(worker->task);
return 0;
+
+fail_kthread:
+ kthread_stop(worker->task);
fail:
+ kfree(worker);
spin_lock_irqsave(&workers->lock, flags);
workers->num_workers_starting--;
spin_unlock_irqrestore(&workers->lock, flags);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 063698b90ce2..1f26792683ed 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -107,6 +107,8 @@ struct btrfs_workers {
/* extra name for this worker, used for current->name */
char *name;
+
+ int stopping;
};
void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 95886d51352c..e2dd44034ec7 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -268,13 +268,11 @@ next:
* to a logical address
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- u64 time_seq,
- struct __prelim_ref *ref,
- struct ulist *parents,
- const u64 *extent_item_pos)
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
- struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
struct extent_buffer *eb;
@@ -283,11 +281,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int level = ref->level;
int index;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
-
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
@@ -301,7 +294,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- root_level = btrfs_old_root_level(root, time_seq);
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -338,7 +334,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
time_seq, ref->wanted_disk_byte,
extent_item_pos);
out:
- btrfs_free_path(path);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
return ret;
}
@@ -346,7 +343,7 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root, u64 time_seq,
+ struct btrfs_path *path, u64 time_seq,
struct list_head *head,
const u64 *extent_item_pos)
{
@@ -373,9 +370,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
continue;
if (ref->count == 0)
continue;
- err = __resolve_indirect_ref(fs_info, search_commit_root,
- time_seq, ref, parents,
- extent_item_pos);
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -452,8 +448,10 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
continue;
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
- fs_info->tree_root->leafsize, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ 0);
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -830,7 +828,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int info_level = 0;
int ret;
- int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
struct list_head prefs_delayed;
struct list_head prefs;
struct __prelim_ref *ref;
@@ -848,7 +845,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->search_commit_root = !!search_commit_root;
+ if (!trans)
+ path->search_commit_root = 1;
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -863,7 +861,7 @@ again:
goto out;
BUG_ON(ret == 0);
- if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+ if (trans) {
/*
* look if there are updates for this ref queued and lock the
* head
@@ -929,8 +927,8 @@ again:
__merge_refs(&prefs, 1);
- ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
- &prefs, extent_item_pos);
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos);
if (ret)
goto out;
@@ -949,13 +947,14 @@ again:
struct extent_inode_elem *eie = NULL;
if (extent_item_pos && !ref->inode_list &&
ref->level == 0) {
- u32 bsz;
struct extent_buffer *eb;
- bsz = btrfs_level_size(fs_info->extent_root,
- ref->level);
+
eb = read_tree_block(fs_info->extent_root,
- ref->parent, bsz, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ ref->parent, 0);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1077,9 +1076,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*
* returns 0 on success, < 0 on error.
*/
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1146,6 +1145,20 @@ static int __inode_info(u64 inum, u64 ioff, u8 key_type,
return 0;
}
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ int ret;
+
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1376,7 +1389,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
- size = fs_info->extent_root->leafsize;
+ size = fs_info->extent_root->nodesize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
@@ -1552,7 +1565,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct ulist *refs = NULL;
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
@@ -1564,13 +1577,13 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
- if (search_commit_root) {
- trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
- } else {
+ if (!search_commit_root) {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1581,8 +1594,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots);
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
@@ -1604,6 +1617,8 @@ out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
}
return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 526d09e70c93..a3bd63b06044 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
#include "ulist.h"
#include "extent_io.h"
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 56efd8d44061..9c12395f908f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -64,7 +64,11 @@ struct btrfs_inode {
*/
struct btrfs_key location;
- /* Lock for counters */
+ /*
+ * Lock for counters and all fields used to determine if the inode is in
+ * the log or not (last_trans, last_sub_trans, last_log_commit,
+ * logged_trans).
+ */
spinlock_t lock;
/* the extent_tree has caches of all the extent mappings to disk */
@@ -242,13 +246,26 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
+ int ret = 0;
+
+ spin_lock(&BTRFS_I(inode)->lock);
if (BTRFS_I(inode)->logged_trans == generation &&
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
BTRFS_I(inode)->last_sub_trans <=
- BTRFS_I(inode)->root->last_log_commit)
- return 1;
- return 0;
+ BTRFS_I(inode)->root->last_log_commit) {
+ /*
+ * After a ranged fsync we might have left some extent maps
+ * (that fall outside the fsync's range). So return false
+ * here if the list isn't empty, to make sure btrfs_log_inode()
+ * will be called and process those extent maps.
+ */
+ smp_mb();
+ if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+ ret = 1;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return ret;
}
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 98ba85a16d31..2fa057b6dfdb 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -804,7 +804,7 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
bh = __bread(superblock_bdev, dev_bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
@@ -817,7 +817,6 @@ static int btrfsic_process_superblock_dev_mirror(
super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_leafsize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
@@ -3205,24 +3204,12 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize != root->leafsize) {
- printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
- root->nodesize, root->leafsize);
- return -1;
- }
if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
return -1;
}
- if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
- printk(KERN_INFO
- "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
- return -1;
- }
if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1b9034330eae..d6284c21ee2b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -162,6 +162,7 @@ static void end_compressed_bio_read(struct bio *bio, int err)
struct inode *inode;
struct page *page;
unsigned long index;
+ unsigned int mirror = (unsigned long)bio->bi_bdev;
int ret;
if (err)
@@ -173,6 +174,13 @@ static void end_compressed_bio_read(struct bio *bio, int err)
if (!atomic_dec_and_test(&cb->pending_bios))
goto out;
+ /*
+ * Record the correct mirror_num in cb->orig_bio so that
+ * read-repair can work properly.
+ */
+ cb->orig_bio->bi_bdev = (struct block_device *)(unsigned long)mirror;
+ cb->mirror_num = mirror;
+
inode = cb->inode;
ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
if (ret)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 36c51c38e098..25d975830343 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1299,7 +1299,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
- u32 blocksize;
eb_root = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1318,10 +1317,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- blocksize = btrfs_level_size(root, old_root->level);
- old = read_tree_block(root, logical, blocksize, 0);
- if (!old || !extent_buffer_uptodate(old)) {
- free_extent_buffer(old);
+ old = read_tree_block(root, logical, 0);
+ if (IS_ERR(old) || !extent_buffer_uptodate(old)) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
logical);
WARN_ON(1);
@@ -1524,7 +1523,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
- blocksize = btrfs_level_size(root, parent_level - 1);
+ blocksize = root->nodesize;
end_slot = parent_nritems;
if (parent_nritems == 1)
@@ -1565,9 +1564,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(root, blocknr,
- blocksize, gen);
- if (!cur || !extent_buffer_uptodate(cur)) {
+ cur = read_tree_block(root, blocknr, gen);
+ if (IS_ERR(cur)) {
+ return PTR_ERR(cur);
+ } else if (!extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
}
@@ -1745,10 +1745,10 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(parent, slot));
- if (eb && !extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
+ if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
eb = NULL;
}
@@ -2140,7 +2140,7 @@ static void reada_for_search(struct btrfs_root *root,
node = path->nodes[level];
search = btrfs_node_blockptr(node, slot);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
eb = btrfs_find_tree_block(root, search, blocksize);
if (eb) {
free_extent_buffer(eb);
@@ -2203,7 +2203,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2239,12 +2239,14 @@ static noinline int reada_for_balance(struct btrfs_root *root,
readahead_tree_block(root, block2, blocksize, 0);
if (block1) {
- eb = read_tree_block(root, block1, blocksize, 0);
- free_extent_buffer(eb);
+ eb = read_tree_block(root, block1, 0);
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
}
if (block2) {
- eb = read_tree_block(root, block2, blocksize, 0);
- free_extent_buffer(eb);
+ eb = read_tree_block(root, block2, 0);
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
}
}
return ret;
@@ -2356,7 +2358,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
if (tmp) {
@@ -2381,7 +2383,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
/* now we're allowed to do a blocking uptodate check */
- tmp = read_tree_block(root, blocknr, blocksize, gen);
+ tmp = read_tree_block(root, blocknr, gen);
if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
*eb_ret = tmp;
return 0;
@@ -2409,8 +2411,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, 0);
- if (tmp) {
+ tmp = read_tree_block(root, blocknr, 0);
+ if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
@@ -4064,13 +4066,13 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ right = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid,
&disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, root->leafsize);
+ root_add_used(root, root->nodesize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -4916,7 +4918,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
* was nothing in the tree that matched the search criteria.
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
struct btrfs_path *path,
u64 min_trans)
{
@@ -4927,8 +4928,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u32 nritems;
int level;
int ret = 1;
+ int keep_locks = path->keep_locks;
- WARN_ON(!path->keep_locks);
+ path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
@@ -4994,7 +4996,6 @@ find_next_key:
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
- unlock_up(path, level, 1, 0, NULL);
goto out;
}
btrfs_set_path_blocking(path);
@@ -5009,9 +5010,12 @@ find_next_key:
btrfs_clear_path_blocking(path, NULL, 0);
}
out:
- if (ret == 0)
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
- btrfs_set_path_blocking(path);
+ }
return ret;
}
@@ -5130,7 +5134,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
{
int ret;
int cmp;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *left_path = NULL;
struct btrfs_path *right_path = NULL;
struct btrfs_key left_key;
@@ -5148,9 +5151,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
- u64 left_start_ctransid;
- u64 right_start_ctransid;
- u64 ctransid;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -5163,7 +5163,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
@@ -5174,21 +5174,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->search_commit_root = 1;
right_path->skip_locking = 1;
- spin_lock(&left_root->root_item_lock);
- left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
-
- spin_lock(&right_root->root_item_lock);
- right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
-
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
/*
* Strategy: Go to the first items of both trees. Then do
*
@@ -5252,67 +5237,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. This means, we need to
- * join and leave transactions for every item that we process.
- */
- if (trans && btrfs_should_end_transaction(trans, left_root)) {
- btrfs_release_path(left_path);
- btrfs_release_path(right_path);
-
- ret = btrfs_end_transaction(trans, left_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- }
- /* now rejoin the transaction */
- if (!trans) {
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- spin_lock(&left_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
- if (ctransid != left_start_ctransid)
- left_start_ctransid = 0;
-
- spin_lock(&right_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
- if (ctransid != right_start_ctransid)
- right_start_ctransid = 0;
-
- if (!left_start_ctransid || !right_start_ctransid) {
- WARN(1, KERN_WARNING
- "btrfs: btrfs_compare_tree detected "
- "a change in one of the trees while "
- "iterating. This is probably a "
- "bug.\n");
- ret = -EIO;
- goto out;
- }
-
- /*
- * the commit root may have changed, so start again
- * where we stopped
- */
- left_path->lowest_level = left_level;
- right_path->lowest_level = right_level;
- ret = btrfs_search_slot(NULL, left_root,
- &left_key, left_path, 0, 0);
- if (ret < 0)
- goto out;
- ret = btrfs_search_slot(NULL, right_root,
- &right_key, right_path, 0, 0);
- if (ret < 0)
- goto out;
- }
-
if (advance_left && !left_end_reached) {
ret = tree_advance(left_root, left_path, &left_level,
left_root_level,
@@ -5441,14 +5365,6 @@ out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
kfree(tmp_buf);
-
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, left_root);
- else
- btrfs_end_transaction(trans, left_root);
- }
-
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b27a718da3ef..21216386181d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -379,7 +379,7 @@ struct btrfs_header {
sizeof(struct btrfs_header)) / \
sizeof(struct btrfs_key_ptr))
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) - \
sizeof(struct btrfs_file_extent_item))
@@ -462,7 +462,7 @@ struct btrfs_super_block {
__le64 num_devices;
__le32 sectorsize;
__le32 nodesize;
- __le32 leafsize;
+ __le32 __unused_leafsize;
__le32 stripesize;
__le32 sys_chunk_array_size;
__le64 chunk_root_generation;
@@ -1202,9 +1202,11 @@ struct btrfs_block_group_cache {
u64 flags;
u64 sectorsize;
u64 cache_generation;
- unsigned int ro:1;
+ unsigned int ro;
unsigned int dirty:1;
unsigned int iref:1;
+ unsigned int has_caching_ctl:1;
+ unsigned int removed:1;
int disk_cache_state;
@@ -1232,8 +1234,10 @@ struct btrfs_block_group_cache {
*/
struct list_head cluster_list;
- /* For delayed block group creation */
- struct list_head new_bg_list;
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
+
+ atomic_t trimming;
};
/* delayed seq elem */
@@ -1359,7 +1363,7 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
- struct rw_semaphore extent_commit_sem;
+ struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1456,6 +1460,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int open;
u64 total_pinned;
@@ -1595,6 +1600,17 @@ struct btrfs_fs_info {
struct btrfs_dev_replace dev_replace;
atomic_t mutually_exclusive_operation_running;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
+ struct mutex delete_unused_bgs_mutex;
+
+ /*
+ * Chunks that can't be freed yet (under a trim/discard operation)
+ * and will be latter freed. Protected by fs_info->chunk_mutex.
+ */
+ struct list_head pinned_chunks;
};
/*
@@ -1621,7 +1637,6 @@ struct btrfs_root {
struct btrfs_block_rsv *block_rsv;
/* free ino cache stuff */
- struct mutex fs_commit_mutex;
struct btrfs_free_space_ctl *free_ino_ctl;
enum btrfs_caching_type cached;
spinlock_t cache_lock;
@@ -1633,6 +1648,7 @@ struct btrfs_root {
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
@@ -1650,9 +1666,6 @@ struct btrfs_root {
/* node allocations are done in nodesize units */
u32 nodesize;
- /* leaf allocations are done in leafsize units */
- u32 leafsize;
-
u32 stripesize;
u32 type;
@@ -2741,8 +2754,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
sectorsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
nodesize, 32);
-BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
- leafsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
stripesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -2963,13 +2974,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
-{
- if (level == 0)
- return root->leafsize;
- return root->nodesize;
-}
-
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_leaf_data(leaf) + \
@@ -2999,7 +3003,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2 * num_items;
}
@@ -3010,8 +3014,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- num_items;
+ return root->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
@@ -3094,8 +3097,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start);
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3116,6 +3126,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3152,16 +3163,16 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
u64 start, u64 end);
-int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type);
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
@@ -3170,6 +3181,10 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const u64 type,
+ const bool is_allocation);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3187,7 +3202,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
struct btrfs_path *path,
u64 min_trans);
enum btrfs_compare_tree_result {
@@ -3672,10 +3686,27 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+ printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ BUG();
+}
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr) ((void)0)
+#endif
+
+#define btrfs_assert()
__printf(5, 6)
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+const char *btrfs_decode_error(int errno);
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7ba7b3900cb8..6a691088c80e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -162,8 +162,12 @@ no_valid_dev_replace_entry_found:
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
}
dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -325,30 +329,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
- &tgt_device);
- if (ret) {
- pr_err("btrfs: target device %s is invalid!\n",
- args->start.tgtdev_name);
- mutex_unlock(&fs_info->volume_mutex);
- return -EINVAL;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
}
+ /* the disk copy procedure reuses the scrub code */
+ mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
args->start.srcdev_name,
&src_device);
- mutex_unlock(&fs_info->volume_mutex);
if (ret) {
- ret = -EINVAL;
- goto leave_no_lock;
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
}
- if (tgt_device->total_bytes < src_device->total_bytes) {
- pr_err("btrfs: target device is smaller than source device!\n");
- ret = -EINVAL;
- goto leave_no_lock;
- }
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ src_device, &tgt_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
+ return ret;
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
@@ -376,10 +384,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
src_device->devid,
rcu_str_deref(tgt_device->name));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
-
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
@@ -410,7 +414,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
- src_device->total_bytes,
+ btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -422,9 +426,7 @@ leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
btrfs_dev_replace_unlock(dev_replace);
-leave_no_lock:
- if (tgt_device)
- btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -482,6 +484,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -499,6 +502,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->chunk_mutex);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -520,9 +524,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
- tgt_device->total_bytes = src_device->total_bytes;
- tgt_device->disk_total_bytes = src_device->disk_total_bytes;
- tgt_device->bytes_used = src_device->bytes_used;
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ ASSERT(list_empty(&src_device->resized_list));
+ tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+ tgt_device->commit_bytes_used = src_device->bytes_used;
if (fs_info->sb->s_bdev == src_device->bdev)
fs_info->sb->s_bdev = tgt_device->bdev;
if (fs_info->fs_devices->latest_bdev == src_device->bdev)
@@ -542,6 +550,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* belong to this filesystem.
*/
btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->chunk_mutex);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* write back the superblocks */
@@ -603,6 +612,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *srcdev;
btrfs_dev_replace_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
@@ -625,8 +635,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ srcdev = dev_replace->srcdev;
args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(dev_replace->srcdev->total_bytes, 1000));
+ div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -781,7 +792,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
- dev_replace->srcdev->total_bytes,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e64aba15089a..5a6d6f8aef0b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1112,19 +1112,19 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid)
+ u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
if (!buf)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
free_extent_buffer(buf);
- return NULL;
+ return ERR_PTR(ret);
}
return buf;
@@ -1150,16 +1150,14 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
}
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
- root->leafsize = leafsize;
root->stripesize = stripesize;
root->ref_cows = 0;
root->track_dirty = 0;
@@ -1190,6 +1188,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
@@ -1219,12 +1219,10 @@ static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
struct btrfs_root *root)
{
int ret;
- u32 blocksize;
u64 generation;
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
ret = btrfs_find_last_root(tree_root, objectid,
&root->root_item, &root->root_key);
if (ret > 0)
@@ -1233,11 +1231,14 @@ static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
return ret;
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
+ generation);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -1270,14 +1271,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+ leaf = btrfs_alloc_free_block(trans, root, root->nodesize,
0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
@@ -1351,9 +1351,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1366,7 +1366,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
root->ref_cows = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ leaf = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
0, 0, 0);
if (IS_ERR(leaf)) {
@@ -1419,7 +1419,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
inode_item->generation = cpu_to_le64(1);
inode_item->size = cpu_to_le64(3);
inode_item->nlink = cpu_to_le32(1);
- inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->nbytes = cpu_to_le64(root->nodesize);
inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1439,7 +1439,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_path *path;
struct extent_buffer *l;
u64 generation;
- u32 blocksize;
int ret = 0;
int slot;
@@ -1456,9 +1455,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
goto out;
}
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, location->objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, location->objectid);
path = btrfs_alloc_path();
if (!path) {
@@ -1481,15 +1479,16 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
- if (!root->node || !extent_buffer_uptodate(root->node)) {
- ret = (!root->node) ? -ENOMEM : -EIO;
-
- free_extent_buffer(root->node);
+ generation);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
kfree(root);
return ERR_PTR(ret);
+ } else if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(-EIO);
}
root->commit_root = btrfs_root_node(root);
@@ -1542,7 +1541,6 @@ again:
}
btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
@@ -1687,6 +1685,16 @@ static int cleaner_kthread(void *arg)
* needn't do anything special here.
*/
btrfs_run_defrag_inodes(root->fs_info);
+
+ /*
+ * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * with relocation (btrfs_relocate_chunk) and relocation
+ * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
+ * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * can't hold, nor need to, fs_info->cleaner_mutex when deleting
+ * unused block groups.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
sleep:
if (freezing(current)) {
refrigerator();
@@ -2059,8 +2067,6 @@ int open_ctree(struct super_block *sb,
{
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
- u32 blocksize;
u32 stripesize;
u64 generation;
u64 features;
@@ -2143,7 +2149,10 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
+ mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
seqlock_init(&fs_info->profiles_lock);
@@ -2151,6 +2160,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2259,7 +2269,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
fs_info->dev_replace.lock_owner = 0;
@@ -2286,7 +2296,9 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
- __setup_root(4096, 4096, 4096, 4096, tree_root,
+ INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
+ __setup_root(4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
@@ -2374,19 +2386,22 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) !=
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
- btrfs_super_leafsize(disk_super));
+ le32_to_cpu(disk_super->__unused_leafsize));
err = -EINVAL;
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
- btrfs_super_leafsize(disk_super));
+ btrfs_super_nodesize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
@@ -2403,7 +2418,7 @@ int open_ctree(struct super_block *sb,
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
@@ -2424,10 +2439,9 @@ int open_ctree(struct super_block *sb,
}
nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
- fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
@@ -2435,7 +2449,7 @@ int open_ctree(struct super_block *sb,
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != leafsize)) {
+ (sectorsize != nodesize)) {
printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
@@ -2558,7 +2572,6 @@ int open_ctree(struct super_block *sb,
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
- tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
tree_root->stripesize = stripesize;
@@ -2587,20 +2600,21 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_chunk_root_level(disk_super));
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
- chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
- blocksize, generation);
- if (!chunk_root->node ||
+ generation);
+ if (IS_ERR(chunk_root->node) ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
+ if (!IS_ERR(chunk_root->node))
+ free_extent_buffer(chunk_root->node);
+ chunk_root->node = NULL;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
@@ -2630,18 +2644,18 @@ int open_ctree(struct super_block *sb,
}
retry_root_backup:
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
- blocksize, generation);
- if (!tree_root->node ||
+ generation);
+ if (IS_ERR(tree_root->node) ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
-
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
+ tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2767,9 +2781,6 @@ retry_root_backup:
err = -EIO;
goto fail_qgroup;
}
- blocksize =
- btrfs_level_size(tree_root,
- btrfs_super_log_root_level(disk_super));
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
@@ -2777,16 +2788,16 @@ retry_root_backup:
goto fail_qgroup;
}
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ __setup_root(nodesize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(tree_root, bytenr,
- blocksize,
generation + 1);
- if (!log_tree_root->node ||
+ if (IS_ERR(log_tree_root->node) ||
!extent_buffer_uptodate(log_tree_root->node)) {
printk(KERN_ERR "btrfs: failed to read log tree\n");
- free_extent_buffer(log_tree_root->node);
+ if (!IS_ERR(log_tree_root->node))
+ free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
goto fail_trans_kthread;
}
@@ -2867,6 +2878,8 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
+ fs_info->open = 1;
+
return 0;
fail_qgroup:
@@ -3018,7 +3031,8 @@ static int write_dev_supers(struct btrfs_device *device,
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
break;
if (wait) {
@@ -3323,8 +3337,9 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
btrfs_set_stack_device_total_bytes(dev_item,
- dev->disk_total_bytes);
- btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3503,6 +3518,13 @@ int close_ctree(struct btrfs_root *root)
btrfs_cleanup_defrag_inodes(fs_info);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ /*
+ * If the cleaner thread is stopped and there are
+ * block groups queued for removal, the deletion will be
+ * skipped when we quit the cleaner thread.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
+
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
@@ -3537,6 +3559,7 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
+ fs_info->open = 0;
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
@@ -3557,6 +3580,17 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
+ lock_chunks(root);
+ while (!list_empty(&fs_info->pinned_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&fs_info->pinned_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
+ unlock_chunks(root);
+
return 0;
}
@@ -3923,8 +3957,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
eb = btrfs_find_tree_block(root, start,
- root->leafsize);
- start += root->leafsize;
+ root->nodesize);
+ start += root->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 29a28e6c2ad0..c7f2380e3abf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -37,7 +37,7 @@ struct btrfs_device;
struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid);
+ u64 parent_transid);
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
u64 parent_transid);
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d9683146e47f..7e5bfff8f463 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -417,7 +417,7 @@ static noinline void caching_thread(struct btrfs_work *work)
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->extent_commit_sem);
+ down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
@@ -442,7 +442,7 @@ again:
if (need_resched()) {
caching_ctl->progress = last;
btrfs_release_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
goto again;
@@ -474,7 +474,7 @@ again:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->leafsize;
+ fs_info->tree_root->nodesize;
else
last = key.objectid + key.offset;
@@ -499,7 +499,7 @@ again:
err:
btrfs_free_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
free_excluded_extents(extent_root, block_group);
@@ -589,6 +589,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
}
spin_unlock(&cache->lock);
@@ -609,6 +610,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_NO;
} else {
cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
}
spin_unlock(&cache->lock);
wake_up(&caching_ctl->wait);
@@ -619,10 +621,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
return 0;
}
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
btrfs_get_block_group(cache);
@@ -741,7 +743,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* different
*/
if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->leafsize;
+ offset = root->nodesize;
metadata = 0;
}
@@ -774,7 +776,7 @@ search_again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->leafsize)
+ key.offset == root->nodesize)
ret = 0;
}
}
@@ -1859,14 +1861,81 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_issue_discard(struct block_device *bdev,
- u64 start, u64 len)
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
{
- return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ if (WARN_ON(start != aligned_start)) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
}
-static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes)
{
int ret;
u64 discarded_bytes = 0;
@@ -1883,14 +1952,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ u64 bytes;
if (!stripe->dev->can_discard)
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
stripe->physical,
- stripe->length);
+ stripe->length,
+ &bytes);
if (!ret)
- discarded_bytes += stripe->length;
+ discarded_bytes += bytes;
else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
@@ -2558,7 +2629,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
num_heads = heads_to_leaves(root, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->leafsize;
+ num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
global_rsv = &root->fs_info->global_block_rsv;
@@ -3025,7 +3096,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = btrfs_level_size(root, level - 1);
+ num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
for_cow);
@@ -3736,7 +3807,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
{
u64 num_dev;
@@ -3748,24 +3819,47 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
else
num_dev = 1; /* DUP or single */
- /* metadata for updaing devices and chunk tree */
- return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+ return num_dev;
}
-static void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 type,
+ const bool is_allocation)
{
struct btrfs_space_info *info;
u64 left;
u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly;
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use;
spin_unlock(&info->lock);
- thresh = get_system_chunk_thresh(root, type);
+ num_devs = get_profile_num_devs(root, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ if (is_allocation)
+ thresh = btrfs_calc_trans_metadata_size(root, num_devs + 1);
+ else
+ thresh = btrfs_calc_trans_metadata_size(root, num_devs) +
+ btrfs_calc_trunc_metadata_size(root, 1);
+
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
btrfs_debug(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
@@ -3776,7 +3870,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;
flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
- btrfs_alloc_chunk(trans, root, flags);
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, root, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+ &root->fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
}
}
@@ -3863,7 +3971,7 @@ again:
* Check if we have enough space in SYSTEM chunk because we may need
* to update devices.
*/
- check_system_chunk(trans, extent_root, flags);
+ check_system_chunk(trans, extent_root, flags, true);
ret = btrfs_alloc_chunk(trans, extent_root, flags);
trans->allocating_chunk = false;
@@ -4608,7 +4716,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
if (num_bytes * 3 > meta_used)
num_bytes = div64_u64(meta_used, 3);
- return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4704,6 +4812,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
@@ -4758,7 +4884,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->leafsize;
+ num_bytes = 3 * root->nodesize;
ret = btrfs_qgroup_reserve(root, num_bytes);
if (ret)
return ret;
@@ -4946,7 +5072,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -4955,7 +5081,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->leafsize);
+ nr_extents * root->nodesize);
goto out_fail;
}
@@ -5071,7 +5197,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
btrfs_qgroup_free(root, num_bytes +
- dropped * root->leafsize);
+ dropped * root->nodesize);
}
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5203,6 +5329,19 @@ static int update_block_group(struct btrfs_root *root,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -5436,7 +5575,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
@@ -5455,7 +5594,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
update_global_block_rsv(fs_info);
}
@@ -5525,24 +5664,26 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (trans->aborted)
- return 0;
-
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
- while (1) {
+ while (!trans->aborted) {
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start,
@@ -5550,9 +5691,38 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!trans->aborted)
+ ret = btrfs_discard_extent(root,
+ block_group->key.objectid,
+ block_group->key.offset,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group_trimming(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "Discard failed while removing blockgroup: errno=%d %s\n",
+ ret, errstr);
+ }
+ }
+
return 0;
}
@@ -6703,7 +6873,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+ ret = update_block_group(root, ins->objectid, root->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6981,7 +7151,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
- blocksize = btrfs_level_size(root, wc->level - 1);
+ blocksize = root->nodesize;
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
@@ -7155,7 +7325,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = btrfs_level_size(root, level - 1);
+ blocksize = root->nodesize;
next = btrfs_find_tree_block(root, bytenr, blocksize);
if (!next) {
@@ -7216,8 +7386,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, blocksize, generation);
- if (!next || !extent_buffer_uptodate(next)) {
+ next = read_tree_block(root, bytenr, generation);
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
@@ -7778,14 +7950,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 min_allocable_bytes;
int ret = -ENOSPC;
-
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
@@ -7802,6 +7973,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
spin_lock(&cache->lock);
if (cache->ro) {
+ cache->ro++;
ret = 0;
goto out;
}
@@ -7813,7 +7985,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- cache->ro = 1;
+ cache->ro++;
ret = 0;
}
out:
@@ -7822,7 +7994,7 @@ out:
return ret;
}
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -7830,21 +8002,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
u64 alloc_flags;
int ret;
- BUG_ON(cache->ro);
-
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
+ /*
+ * if we are changing raid levels, try to allocate a corresponding
+ * block group with the new raid level.
+ */
alloc_flags = update_block_group_flags(root, cache->flags);
if (alloc_flags != cache->flags) {
ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to
+ * carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
if (ret < 0)
goto out;
}
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -7852,8 +8033,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, root, alloc_flags, true);
+ unlock_chunks(root->fs_info->chunk_root);
+ }
+
btrfs_end_transaction(trans, root);
return ret;
}
@@ -7922,7 +8110,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -7932,10 +8120,12 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- cache->ro = 0;
+ if (!--cache->ro) {
+ num_bytes = cache->key.offset - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ }
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -7952,6 +8142,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
u64 min_free;
u64 dev_min = 1;
u64 dev_nr = 0;
@@ -8038,6 +8229,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
do_div(min_free, dev_min);
}
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 dev_offset;
@@ -8048,7 +8246,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free &&
!device->is_tgtdev_for_dev_replace) {
- ret = find_free_dev_extent(device, min_free,
+ ret = find_free_dev_extent(trans, device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -8060,6 +8258,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
}
}
mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
out:
btrfs_put_block_group(block_group);
return ret;
@@ -8142,14 +8341,24 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->extent_commit_sem);
+ down_write(&info->commit_root_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
- up_write(&info->extent_commit_sem);
+ up_write(&info->commit_root_sem);
+
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8369,8 +8578,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_ro(cache, 1);
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
+ inc_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
}
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -8384,9 +8603,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* mirrored block groups.
*/
list_for_each_entry(cache, &space_info->block_groups[3], list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
list_for_each_entry(cache, &space_info->block_groups[4], list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -8405,10 +8624,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret = 0;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
- new_bg_list) {
- list_del_init(&block_group->new_bg_list);
-
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ list_del_init(&block_group->bg_list);
if (ret)
continue;
@@ -8421,6 +8638,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
sizeof(item));
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
}
}
@@ -8457,9 +8678,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
+ INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -8511,7 +8733,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache);
- list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
@@ -8534,7 +8756,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start)
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em)
{
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
@@ -8545,6 +8768,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int ret;
int index;
int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
root = root->fs_info->extent_root;
@@ -8644,8 +8869,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
+ if (block_group->has_caching_ctl)
+ caching_ctl = get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&root->fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &root->fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ atomic_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&root->fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ put_caching_control(caching_ctl);
+ put_caching_control(caching_ctl);
+ }
+ }
btrfs_remove_free_space_cache(block_group);
@@ -8657,7 +8906,75 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- btrfs_clear_space_info_full(root->fs_info);
+ lock_chunks(root);
+ if (!list_empty(&em->list)) {
+ /* We're in the transaction->pending_chunks list. */
+ free_extent_map(em);
+ }
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ /*
+ * Make sure a trimmer task always sees the em in the pinned_chunks list
+ * if it sees block_group->removed == 1 (needs to lock block_group->lock
+ * before checking block_group->removed).
+ */
+ if (!remove_em) {
+ /*
+ * Our em might be in trans->transaction->pending_chunks which
+ * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+ * and so is the fs_info->pinned_chunks list.
+ *
+ * So at this point we must be holding the chunk_mutex to avoid
+ * any races with chunk allocation (more specifically at
+ * volumes.c:contains_pending_extent()), to ensure it always
+ * sees the em, either in the pending_chunks list or in the
+ * pinned_chunks list.
+ */
+ list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ }
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ /*
+ * The em might be in the pending_chunks list, so make sure the
+ * chunk mutex is locked, since remove_extent_mapping() will
+ * delete us from that list.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+ unlock_chunks(root);
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -8674,6 +8991,213 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = (struct map_lookup *)em->bdev;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+ int trimming;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ (block_group->ro && !block_group->removed) ||
+ list_is_singular(&block_group->list)) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = inc_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
+ if (IS_ERR(trans)) {
+ btrfs_dec_block_group_ro(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(root, block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_dec_block_group_ro(root, block_group);
+ goto end_trans;
+ }
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ space_info->bytes_pinned -= block_group->pinned;
+ space_info->bytes_readonly += block_group->pinned;
+ block_group->pinned = 0;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(root, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
+end_trans:
+ btrfs_end_transaction(trans, root);
+next:
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -8717,16 +9241,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
return unpin_extent_range(root, start, end, false);
}
-int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+ u64 minlen, u64 *trimmed)
{
- return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
+ u64 start = 0, len = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Not writeable = nothing to do. */
+ if (!device->writeable)
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_transaction *trans;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ return ret;
+
+ down_read(&fs_info->commit_root_sem);
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ret = find_free_dev_extent_start(trans, device, minlen, start,
+ &start, &len);
+ if (trans)
+ btrfs_put_transaction(trans);
+
+ if (ret) {
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
}
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_device *device;
+ struct list_head *devices;
u64 group_trimmed;
u64 start;
u64 end;
@@ -8781,6 +9388,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
cache = next_block_group(fs_info->tree_root, cache);
}
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ devices = &root->fs_info->fs_devices->alloc_list;
+ list_for_each_entry(device, devices, dev_alloc_list) {
+ ret = btrfs_trim_free_extents(device, range->minlen,
+ &group_trimmed);
+ if (ret)
+ break;
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
range->len = trimmed;
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 98c47c9782c4..0a6ed40d5fe4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2372,11 +2372,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct page *page = bvec->bv_page;
struct extent_state *cached = NULL;
struct extent_state *state;
+ struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
"mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
(long int)bio->bi_bdev);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2450,6 +2451,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
if (uptodate) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+
+ /* Zero out the end if this page straddles i_size */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && offset)
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8c6f3b4e1e60..c300e6f66958 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,6 +76,8 @@ void free_extent_map(struct extent_map *em)
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(em->in_tree);
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 61adc44b7805..9c8d504f632c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eec6443f0bdd..a311e61ac928 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1279,6 +1279,7 @@ again:
}
wait_on_page_writeback(pages[i]);
}
+ faili = num_pages - 1;
err = 0;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1297,8 +1298,10 @@ again:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos);
+ err = btrfs_wait_ordered_range(inode, start_pos,
+ last_pos - start_pos);
+ if (err)
+ goto fail;
goto again;
}
if (ordered)
@@ -1542,7 +1545,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
dirty_pages);
- if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1708,23 +1711,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
mutex_unlock(&inode->i_mutex);
/*
- * we want to make sure fsync finds this change
- * but we haven't joined a transaction running right now.
- *
- * Later on, someone is sure to update the inode and get the
- * real transid recorded.
- *
- * We set last_trans now to the fs_info generation + 1,
- * this will either be one more than the running transaction
- * or the generation used for the next transaction if there isn't
- * one running right now.
- *
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occured.
*/
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ spin_unlock(&BTRFS_I(inode)->lock);
if (num_written > 0 || num_written == -EIOCBQUEUED) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
@@ -1770,6 +1763,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+ return ret;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -1786,9 +1793,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
@@ -1798,48 +1807,101 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* multi-task, and make the performance up. See
* btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
- atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ ret = start_ordered_ops(inode, start, end);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
-
- /*
- * We flush the dirty pages again to avoid some dirty pages in the
- * range being left.
- */
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- if (full_sync)
- btrfs_wait_ordered_range(inode, start, end - start + 1);
- atomic_inc(&root->log_batch);
-
/*
- * check the transaction that last modified this inode
- * and see if its already been committed
+ * We might have have had more pages made dirty after calling
+ * start_ordered_ops and before acquiring the inode's i_mutex.
*/
- if (!BTRFS_I(inode)->last_trans) {
+ if (full_sync) {
+ /*
+ * For a full sync, we need to make sure any ordered operations
+ * start and finish before we start logging the inode, so that
+ * all extents are persisted and the respective file extent
+ * items are in the fs/subvol btree.
+ */
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Start any new ordered operations before starting to log the
+ * inode. We will wait for them to finish in btrfs_sync_log().
+ *
+ * Right before acquiring the inode's mutex, we might have new
+ * writes dirtying pages, which won't immediately start the
+ * respective ordered operations - that is done through the
+ * fill_delalloc callbacks invoked from the writepage and
+ * writepages address space operations. So make sure we start
+ * all ordered operations before starting to log our inode. Not
+ * doing this means that while logging the inode, writeback
+ * could start and invoke writepage/writepages, which would call
+ * the fill_delalloc callbacks (cow_file_range,
+ * submit_compressed_extents). These callbacks add first an
+ * extent map to the modified list of extents and then create
+ * the respective ordered operation, which means in
+ * tree-log.c:btrfs_log_inode() we might capture all existing
+ * ordered operations (with btrfs_get_logged_extents()) before
+ * the fill_delalloc callback adds its ordered operation, and by
+ * the time we visit the modified list of extent maps (with
+ * btrfs_log_changed_extents()), we see and process the extent
+ * map they created. We then use the extent map to construct a
+ * file extent item for logging without waiting for the
+ * respective ordered operation to finish - this file extent
+ * item points to a disk location that might not have yet been
+ * written to, containing random data - so after a crash a log
+ * replay will make our inode have file extent items that point
+ * to disk locations containing invalid data, as we returned
+ * success to userspace without waiting for the respective
+ * ordered operation to finish, because it wasn't captured by
+ * btrfs_get_logged_extents().
+ */
+ ret = start_ordered_ops(inode, start, end);
+ }
+ if (ret) {
mutex_unlock(&inode->i_mutex);
goto out;
}
+ atomic_inc(&root->log_batch);
/*
- * if the last transaction that changed this file was before
- * the current transaction, we can bail out now without any
- * syncing
+ * If the last transaction that changed this file was before the current
+ * transaction and we have the full sync flag set in our inode, we can
+ * bail out now without any syncing.
+ *
+ * Note that we can't bail out if the full sync flag isn't set. This is
+ * because when the full sync flag is set we start all ordered extents
+ * and wait for them to fully complete - when they complete they update
+ * the inode's last_trans field through:
+ *
+ * btrfs_finish_ordered_io() ->
+ * btrfs_update_inode_fallback() ->
+ * btrfs_update_inode() ->
+ * btrfs_set_inode_last_trans()
+ *
+ * So we are sure that last_trans is up to date and can do this check to
+ * bail out safely. For the fast path, when the full sync flag is not
+ * set in our inode, we can not do it because we start only our ordered
+ * extents and don't wait for them to complete (that is when
+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
+ * value might be less than or equals to fs_info->last_trans_committed,
+ * and setting a speculative last_trans for an inode when a buffered
+ * write is made (such as fs_info->generation + 1 for example) would not
+ * be reliable since after setting the value and before fsync is called
+ * any number of transactions can start and commit (transaction kthread
+ * commits the current transaction periodically), and a transaction
+ * commit does not start nor waits for ordered extents to complete.
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed) {
- BTRFS_I(inode)->last_trans = 0;
-
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -1864,10 +1926,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = btrfs_log_dentry_safe(trans, root, dentry);
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
- goto out;
+ /* Fallthrough and commit/free transaction. */
+ ret = 1;
}
/* we've logged all the items and now have a consistent
@@ -1883,27 +1947,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
if (ret != BTRFS_NO_LOG_SYNC) {
- if (ret > 0) {
- /*
- * If we didn't already wait for ordered extents we need
- * to do that now.
- */
- if (!full_sync)
- btrfs_wait_ordered_range(inode, start,
- end - start + 1);
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0) {
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
+ if (!ret) {
ret = btrfs_end_transaction(trans, root);
- } else {
- if (!full_sync)
- btrfs_wait_ordered_range(inode, start,
- end -
- start + 1);
- ret = btrfs_commit_transaction(trans, root);
+ goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
+ if (ret)
+ goto out;
+ }
+ ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_end_transaction(trans, root);
}
@@ -2097,6 +2154,39 @@ static void __truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_
truncate_inode_pages_range(mapping, lstart, lend);
}
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+ u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+
+ em = btrfs_get_extent(inode, NULL, 0,
+ round_down(*start, sectorsize),
+ round_up(*len, sectorsize), 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+ /* Hole or vacuum extent(only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
@@ -2105,20 +2195,40 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
struct btrfs_path *path;
struct btrfs_block_rsv *rsv;
struct btrfs_trans_handle *trans;
- u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
- u64 lockend = round_down(offset + len,
- BTRFS_I(inode)->root->sectorsize) - 1;
- u64 cur_offset = lockstart;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
u64 drop_end;
int ret = 0;
int err = 0;
- bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+ bool same_page;
+ bool truncated_page = false;
+ bool updated_inode = false;
- btrfs_wait_ordered_range(inode, offset, len);
+ ret = btrfs_wait_ordered_range(inode, offset, len);
+ if (ret)
+ return ret;
mutex_lock(&inode->i_mutex);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
/*
* We needn't truncate any page which is beyond the end of the file
* because we are sure there is no data there.
@@ -2128,14 +2238,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, len, 0);
- mutex_unlock(&inode->i_mutex);
- return ret;
+ } else {
+ ret = 0;
+ }
+ goto out_only_mutex;
}
/* zero back part of the first page */
if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2143,18 +2257,46 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
}
- /* zero the front end of the last page */
- if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
- ret = btrfs_truncate_page(inode, offset + len, 0, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ /* Check the aligned pages after the first unaligned page,
+ * if offset != orig_start, which means the first unaligned page
+ * including serveral following pages are already in holes,
+ * the extra check can be skipped */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ truncated_page = true;
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
}
}
if (lockend < lockstart) {
- mutex_unlock(&inode->i_mutex);
- return 0;
+ ret = 0;
+ goto out_only_mutex;
}
while (1) {
@@ -2185,8 +2327,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, &cached_state, GFP_NOFS);
- btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
}
path = btrfs_alloc_path();
@@ -2219,6 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
BUG_ON(ret);
trans->block_rsv = rsv;
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
while (cur_offset < lockend) {
ret = __btrfs_drop_extents(trans, root, inode, path,
cur_offset, lockend + 1,
@@ -2256,6 +2404,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
rsv, min_size);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
}
if (ret) {
@@ -2279,6 +2435,7 @@ out_trans:
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
+ updated_inode = true;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
out_free:
@@ -2287,6 +2444,23 @@ out_free:
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
+out_only_mutex:
+ if (!updated_inode && truncated_page && !ret && !err) {
+ /*
+ * If we only end up zeroing part of a page, we still need to
+ * update the inode item, so that all the time fields are
+ * updated as well as the necessary btrfs inode in memory fields
+ * for detecting, at fsync time, if the inode isn't yet in the
+ * log tree or it's there but not up to date.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ } else {
+ err = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_end_transaction(trans, root);
+ }
+ }
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
@@ -2338,12 +2512,6 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out_reserve_fail;
}
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
@@ -2354,8 +2522,26 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start);
if (ret)
goto out;
+ } else if (offset + len > inode->i_size) {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
}
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -2378,8 +2564,10 @@ static long btrfs_fallocate(struct file *file, int mode,
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a12d0ab374d5..1eedd67e2515 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,6 +27,7 @@
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
+#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -1012,8 +1013,13 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
goto out;
-
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto out;
+ }
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
@@ -2608,8 +2614,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- ret = btrfs_error_discard_extent(fs_info->extent_root,
- start, bytes, &trimmed);
+ ret = btrfs_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
if (!ret)
*total_trimmed += trimmed;
@@ -2773,6 +2779,49 @@ next:
return ret;
}
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
+{
+ atomic_inc(&cache->trimming);
+}
+
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
+
+ spin_lock(&block_group->lock);
+ cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed);
+ spin_unlock(&block_group->lock);
+
+ if (cleanup) {
+ lock_chunks(block_group->fs_info->chunk_root);
+ em_tree = &block_group->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ /*
+ * remove_extent_mapping() will delete us from the pinned_chunks
+ * list, which is protected by the chunk mutex.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ unlock_chunks(block_group->fs_info->chunk_root);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We've left one free space entry and other tasks trimming
+ * this block group have left 1 entry each one. Free them.
+ */
+ __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ }
+}
+
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
@@ -2780,12 +2829,21 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
*trimmed = 0;
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
if (ret)
- return ret;
+ goto out;
ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-
+out:
+ btrfs_put_block_group_trimming(block_group);
return ret;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index e0b7034d6343..6b823205a100 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -409,6 +409,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->leave_spinning = 1;
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -427,8 +428,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
- if (ret == -EOVERFLOW)
- ret = -EMLINK;
+ if (ret == -EOVERFLOW) {
+ if (find_name_in_backref(path, name, name_len, &ref))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2c66ddbbe670..3b4bacb7a267 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
- mutex_lock(&root->fs_commit_mutex);
+ down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -90,7 +90,7 @@ again:
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->cache_progress = last;
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
@@ -129,7 +129,7 @@ next:
btrfs_unpin_free_ino(root);
out:
wake_up(&root->cache_wait);
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -225,11 +225,11 @@ again:
* or the caching work is done.
*/
- mutex_lock(&root->fs_commit_mutex);
+ down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->cache_lock);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->cache_lock);
@@ -242,7 +242,7 @@ again:
else
__btrfs_add_free_space(pinned, objectid, 1);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
}
}
@@ -252,7 +252,7 @@ again:
* and others will just be dropped, because the commit root we were
* searching has changed.
*
- * Must be called with root->fs_commit_mutex held
+ * Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0da60d5d1da9..d45043c40f3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3703,9 +3703,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- int ret;
/*
* 1 for the possible orphan item
@@ -3714,27 +3712,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- if (PTR_ERR(trans) == -ENOSPC) {
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes, 5);
- if (ret) {
- btrfs_end_transaction(trans, root);
- return ERR_PTR(ret);
- }
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- }
- return trans;
+ return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3912,6 +3890,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
+ u64 last_size = (u64)-1;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -4009,6 +3988,11 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (del_item)
+ last_size = found_key.offset;
+ else
+ last_size = new_size;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -4126,6 +4110,8 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
+ if (last_size != (u64)-1)
+ btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
return err;
}
@@ -4265,6 +4251,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
u64 hole_size;
int err = 0;
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
if (size <= hole_start)
return 0;
@@ -4439,8 +4434,26 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
ret = btrfs_truncate(inode);
- if (ret && inode->i_nlink)
- btrfs_orphan_del(NULL, inode);
+ if (ret && inode->i_nlink) {
+ int err;
+
+ /*
+ * failed to truncate, disk_i_size is only adjusted down
+ * as we remove extents, so it should represent the true
+ * size of the inode, so reset the in memory size and
+ * delete our orphan entry.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ return ret;
+ }
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ btrfs_abort_transaction(trans, root, err);
+ btrfs_end_transaction(trans, root);
+ }
}
return ret;
@@ -7316,8 +7329,18 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
offset, nr_segs))
return 0;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * call btrfs_wait_ordered_range to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ count = iov_length(iov, nr_segs);
+ ret = btrfs_wait_ordered_range(inode, offset, count);
+ if (ret)
+ return ret;
+
if (rw & WRITE) {
- count = iov_length(iov, nr_segs);
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
return ret;
@@ -7614,19 +7637,17 @@ static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- int ret;
+ int ret = 0;
int err = 0;
struct btrfs_trans_handle *trans;
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
if (ret)
return ret;
- btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-
/*
* Yes ladies and gentelment, this is indeed ugly. The fact is we have
* 3 things going on here
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ecd05159ee6c..17afd046cdce 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -57,6 +57,7 @@
#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
+#include "tree-log.h"
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -416,7 +417,7 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+ leaf = btrfs_alloc_free_block(trans, root, root->nodesize,
0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
@@ -443,7 +444,7 @@ static noinline int create_subvol(struct inode *dir,
inode_item->generation = cpu_to_le64(1);
inode_item->size = cpu_to_le64(3);
inode_item->nlink = cpu_to_le32(1);
- inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->nbytes = cpu_to_le64(root->nodesize);
inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
root_item.flags = 0;
@@ -829,7 +830,6 @@ static int find_new_extents(struct btrfs_root *root,
{
struct btrfs_path *path;
struct btrfs_key min_key;
- struct btrfs_key max_key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *extent;
int type;
@@ -844,15 +844,8 @@ static int find_new_extents(struct btrfs_root *root,
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- max_key.objectid = ino;
- max_key.type = (u8)-1;
- max_key.offset = (u64)-1;
-
- path->keep_locks = 1;
-
while(1) {
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, newer_than);
+ ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
if (min_key.objectid != ino)
@@ -1455,7 +1448,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
if (mod < 0) {
if (new_size > old_size) {
@@ -1915,7 +1908,6 @@ static noinline int search_ioctl(struct inode *inode,
{
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_key max_key;
struct btrfs_path *path;
struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
@@ -1947,15 +1939,8 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- max_key.objectid = sk->max_objectid;
- max_key.type = sk->max_type;
- max_key.offset = sk->max_offset;
-
- path->keep_locks = 1;
-
while(1) {
- ret = btrfs_search_forward(root, &key, &max_key, path,
- sk->min_transid);
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
ret = 0;
@@ -2242,6 +2227,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
+ btrfs_record_snapshot_destroy(trans, dir);
+
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
@@ -2453,10 +2440,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
if (!fi_args)
return -ENOMEM;
+ mutex_lock(&fs_devices->device_list_mutex);
fi_args->num_devices = fs_devices->num_devices;
memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
@@ -2499,8 +2486,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
}
di_args->devid = dev->devid;
- di_args->bytes_used = dev->bytes_used;
- di_args->total_bytes = dev->total_bytes;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
struct rcu_string *name;
@@ -2845,7 +2832,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 len = olen_aligned;
ret = -ENOMEM;
- buf = vmalloc(btrfs_level_size(root, 0));
+ buf = vmalloc(root->nodesize);
if (!buf)
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8ec7ff533551..b13a5c450357 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -414,27 +414,48 @@ out:
}
/* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
- int index = log->log_transid % 2;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- spin_lock(&log->log_extents_lock[index]);
- if (list_empty(&ordered->log_list)) {
- list_add_tail(&ordered->log_list, &log->logged_list[index]);
- atomic_inc(&ordered->refs);
- }
- spin_unlock(&log->log_extents_lock[index]);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
@@ -715,8 +736,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
+ int ret = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
@@ -732,8 +754,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
/*
* So with compression we will find and lock a dirty page and clear the
* first one as dirty, setup an async extent, and immediately return
@@ -749,10 +772,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* right and you are wrong.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ orig_end);
+ if (ret)
+ return ret;
+ }
+ ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
end = orig_end;
while (1) {
@@ -769,11 +797,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (end == 0 || end == start)
+ if (ret || end == 0 || end == start)
break;
end--;
}
+ return ret;
}
/*
@@ -844,6 +875,20 @@ out:
return entry;
}
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+ if (oe) {
+ btrfs_put_ordered_extent(oe);
+ return true;
+ }
+ return false;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 267ac99095f6..9a1876e4a143 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -188,12 +188,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
@@ -203,7 +206,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index c90cd227566d..6ab02eb7d945 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -335,8 +335,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
for (i = 0; i < nr; i++) {
struct extent_buffer *next = read_tree_block(root,
btrfs_node_blockptr(c, i),
- btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
+ if (IS_ERR(next)) {
+ continue;
+ } else if (!extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
+ continue;
+ }
if (btrfs_is_leaf(next) &&
level != 1)
BUG();
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3129643a25e5..76b92d0052cb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1532,8 +1532,10 @@ out:
}
/*
- * copy the acounting information between qgroups. This is necessary when a
- * snapshot or a subvolume is created
+ * Copy the acounting information between qgroups. This is necessary
+ * when a snapshot or a subvolume is created. Throwing an error will
+ * cause a transaction abort so we take extra care here to only error
+ * when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
@@ -1563,10 +1565,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
- if (!srcgroup) {
- ret = -EINVAL;
- goto out;
- }
+
+ /*
+ * Zero out invalid groups so we can ignore
+ * them later.
+ */
+ if (!srcgroup ||
+ ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
+ *i_qgroups = 0ULL;
+
++i_qgroups;
}
}
@@ -1592,7 +1599,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
- int srcroot_level;
srckey.objectid = srcid;
srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -1604,8 +1610,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
}
rcu_read_lock();
- srcroot_level = btrfs_header_level(srcroot->node);
- level_size = btrfs_level_size(srcroot, srcroot_level);
+ level_size = srcroot->nodesize;
rcu_read_unlock();
}
@@ -1614,17 +1619,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ if (*i_qgroups == 0)
+ continue;
ret = add_qgroup_relation_item(trans, quota_root,
objectid, *i_qgroups);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, quota_root,
*i_qgroups, objectid);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
- ++i_qgroups;
}
+ ret = 0;
}
@@ -1653,17 +1660,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
- ret = add_relation_rb(quota_root->fs_info, objectid,
- *i_qgroups);
- if (ret)
- goto unlock;
+ if (*i_qgroups) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ }
++i_qgroups;
}
- for (i = 0; i < inherit->num_ref_copies; ++i) {
+ for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -1674,12 +1686,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
- i_qgroups += 2;
}
- for (i = 0; i < inherit->num_excl_copies; ++i) {
+ for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -1690,7 +1704,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
- i_qgroups += 2;
}
unlock:
@@ -1914,7 +1927,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
- num_bytes = fs_info->extent_root->leafsize;
+ num_bytes = fs_info->extent_root->nodesize;
else
num_bytes = found.offset;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 8143fcadb0da..5d17508bede2 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -346,7 +346,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (!re)
return NULL;
- blocksize = btrfs_level_size(root, level);
+ blocksize = root->nodesize;
re->logical = logical;
re->blocksize = blocksize;
re->top = *top;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 00ce50daff6f..82656f0d9998 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1758,7 +1758,7 @@ again:
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
- blocksize = btrfs_level_size(dest, level - 1);
+ blocksize = dest->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
@@ -1785,10 +1785,11 @@ again:
break;
}
- eb = read_tree_block(dest, old_bytenr, blocksize,
- old_ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
- ret = (!eb) ? -ENOMEM : -EIO;
+ eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
+ ret = -EIO;
free_extent_buffer(eb);
break;
}
@@ -1916,7 +1917,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
- u32 blocksize;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1942,9 +1942,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- blocksize = btrfs_level_size(root, i - 1);
- eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ eb = read_tree_block(root, bytenr, ptr_gen);
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2531,8 +2532,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
if (next->processed && (reserve || next != node))
break;
- num_bytes += btrfs_level_size(rc->extent_root,
- next->level);
+ num_bytes += rc->extent_root->nodesize;
if (list_empty(&next->upper))
break;
@@ -2659,10 +2659,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
- blocksize = btrfs_level_size(root, node->level);
+ blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(root, bytenr, blocksize, generation);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ eb = read_tree_block(root, bytenr, generation);
+ if (IS_ERR(eb)) {
+ err = PTR_ERR(eb);
+ goto next;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
goto next;
@@ -2769,7 +2772,7 @@ static void __mark_block_processed(struct reloc_control *rc,
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
- blocksize = btrfs_level_size(rc->extent_root, node->level);
+ blocksize = rc->extent_root->nodesize;
mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
@@ -2823,8 +2826,10 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid, block->key.offset);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ block->key.offset);
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2845,7 +2850,7 @@ static int reada_tree_block(struct reloc_control *rc,
if (block->key.type == BTRFS_METADATA_ITEM_KEY)
readahead_tree_block(rc->extent_root, block->bytenr,
block->key.objectid,
- rc->extent_root->leafsize);
+ rc->extent_root->nodesize);
else
readahead_tree_block(rc->extent_root, block->bytenr,
block->key.objectid, block->key.offset);
@@ -3298,7 +3303,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = rc->extent_root->leafsize;
+ block->key.objectid = rc->extent_root->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3638,7 +3643,7 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize = btrfs_level_size(rc->extent_root, 0);
+ u32 blocksize = rc->extent_root->nodesize;
int ret = 0;
int err = 0;
@@ -3781,7 +3786,7 @@ next:
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
- key.objectid + rc->extent_root->leafsize <=
+ key.objectid + rc->extent_root->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
@@ -3799,7 +3804,7 @@ next:
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
- rc->extent_root->leafsize;
+ rc->extent_root->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -4204,14 +4209,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- if (!rc->block_group->ro) {
- ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
- if (ret) {
- err = ret;
- goto out;
- }
- rw = 1;
+ ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
}
+ rw = 1;
path = btrfs_alloc_path();
if (!path) {
@@ -4267,7 +4270,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
(unsigned long long)rc->extents_found);
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ (u64)-1);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
@@ -4284,7 +4292,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
if (err && rw)
- btrfs_set_block_group_rw(extent_root, rc->block_group);
+ btrfs_dec_block_group_ro(extent_root, rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
kfree(rc);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 212f4c6a1679..f577db004db8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -136,7 +136,6 @@ struct scrub_ctx {
int pages_per_rd_bio;
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
int is_dev_replace;
struct scrub_wr_ctx wr_ctx;
@@ -404,7 +403,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
}
sctx->first_free = 0;
sctx->nodesize = dev->dev_root->nodesize;
- sctx->leafsize = dev->dev_root->leafsize;
sctx->sectorsize = dev->dev_root->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
@@ -1754,7 +1752,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
BTRFS_UUID_SIZE))
++fail;
- WARN_ON(sctx->nodesize != sctx->leafsize);
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -2211,7 +2208,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- WARN_ON(sctx->nodesize != sctx->leafsize);
blocksize = sctx->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
@@ -2468,7 +2464,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
goto next;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->leafsize;
+ bytes = root->nodesize;
else
bytes = key.offset;
@@ -2550,7 +2546,9 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 length,
- u64 dev_offset, int is_dev_replace)
+ u64 dev_offset,
+ struct btrfs_block_group_cache *cache,
+ int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
&sctx->dev_root->fs_info->mapping_tree;
@@ -2563,8 +2561,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
- if (!em)
- return -EINVAL;
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed)
+ ret = -EINVAL;
+ spin_unlock(&cache->lock);
+
+ return ret;
+ }
map = (struct map_lookup *)em->bdev;
if (em->start != chunk_offset)
@@ -2679,7 +2687,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
chunk_offset, length, found_key.offset,
- is_dev_replace);
+ cache, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -2760,7 +2768,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
break;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2840,17 +2849,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (btrfs_fs_closing(fs_info))
return -EINVAL;
- /*
- * check some assumptions
- */
- if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
- printk(KERN_ERR
- "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
- fs_info->chunk_root->nodesize,
- fs_info->chunk_root->leafsize);
- return -EINVAL;
- }
-
if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 82c88e5757bb..229f037f3283 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1220,8 +1220,10 @@ static int find_extent_clone(struct send_ctx *sctx,
}
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
&found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
btrfs_release_path(tmp_path);
if (ret < 0)
@@ -4419,57 +4421,21 @@ out:
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
struct extent_buffer *eb;
int slot;
- u64 start_ctransid;
- u64 ctransid;
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- spin_lock(&send_root->root_item_lock);
- start_ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-join_trans:
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. Join a transaction to prevent
- * this.
- */
- trans = btrfs_join_transaction(send_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- /*
- * Make sure the tree has not changed after re-joining. We detect this
- * by comparing start_ctransid and ctransid. They should always match.
- */
- spin_lock(&send_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
- if (ctransid != start_ctransid) {
- WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
- "send was modified in between. This is "
- "probably a bug.\n");
- ret = -EIO;
- goto out;
- }
-
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -4477,19 +4443,6 @@ join_trans:
goto out_finish;
while (1) {
- /*
- * When someone want to commit while we iterate, end the
- * joined transaction and rejoin.
- */
- if (btrfs_should_end_transaction(trans, send_root)) {
- ret = btrfs_end_transaction(trans, send_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- btrfs_release_path(path);
- goto join_trans;
- }
-
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -4517,12 +4470,6 @@ out_finish:
out:
btrfs_free_path(path);
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, send_root);
- else
- btrfs_end_transaction(trans, send_root);
- }
return ret;
}
@@ -4592,12 +4539,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
* so check its commit root transid with our otransid and if they match
* commit the transaction to make sure everything is updated.
*/
- down_read(&send_root->fs_info->extent_commit_sem);
+ down_read(&send_root->fs_info->commit_root_sem);
if (btrfs_header_generation(send_root->commit_root) ==
btrfs_root_otransid(&send_root->root_item)) {
struct btrfs_trans_handle *trans;
- up_read(&send_root->fs_info->extent_commit_sem);
+ up_read(&send_root->fs_info->commit_root_sem);
trans = btrfs_attach_transaction_barrier(send_root);
if (IS_ERR(trans)) {
@@ -4612,7 +4559,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
} else {
- up_read(&send_root->fs_info->extent_commit_sem);
+ up_read(&send_root->fs_info->commit_root_sem);
}
arg = memdup_user(arg_, sizeof(*arg));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67c91c2ca545..55d2df3e9844 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -65,7 +65,7 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -1368,6 +1368,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
*/
sb->s_flags |= MS_RDONLY;
+ /*
+ * Setting MS_RDONLY will put the cleaner thread to
+ * sleep at the next loop if it's already active.
+ * If it's already asleep, we'll leave unused block
+ * groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
@@ -1628,7 +1637,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
__be32 *fsid = (__be32 *)fs_info->fsid;
int ret;
- /* holding chunk_muext to avoid allocating new chunks */
+ /*
+ * holding chunk_muext to avoid allocating new chunks, holding
+ * device_list_mutex to avoid the device being removed
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -1651,11 +1664,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ac57f882078a..0772a0a29278 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -64,6 +64,14 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(transaction->delayed_refs.root.rb_node);
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -94,10 +102,22 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
spin_unlock(&tree->lock);
}
-static noinline void switch_commit_root(struct btrfs_root *root)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
{
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ clear_btree_io_tree(&root->dirty_log_pages);
+ }
+ up_write(&fs_info->commit_root_sem);
}
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
@@ -203,6 +223,7 @@ loop:
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
+ cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.root = RB_ROOT;
@@ -232,6 +253,9 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->deleted_bgs);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -404,7 +428,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
if (num_items > 0 && root != root->fs_info->chunk_root) {
if (root->fs_info->quota_enabled &&
is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->leafsize;
+ qgroup_reserved = num_items * root->nodesize;
ret = btrfs_qgroup_reserve(root, qgroup_reserved);
if (ret)
return ERR_PTR(ret);
@@ -460,6 +484,7 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->bytes_reserved = 0;
+ h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
@@ -517,6 +542,38 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor)
+{
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ int ret;
+
+ trans = btrfs_start_transaction(root, num_items);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes,
+ min_factor);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
+
+ return trans;
+}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root, int num_items)
@@ -726,6 +783,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -974,7 +1033,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
static int update_cowonly_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- int ret = 0;
+ int ret;
u64 old_root_bytenr;
u64 old_root_used;
struct btrfs_root *tree_root = root->fs_info->tree_root;
@@ -993,20 +1052,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
&root->root_key,
&root->root_item);
if (ret)
- goto out;
+ return ret;
old_root_used = btrfs_root_used(&root->root_item);
ret = btrfs_write_dirty_block_groups(trans, root);
if (ret)
- goto out;
+ return ret;
}
- if (root != root->fs_info->extent_root)
- switch_commit_root(root);
- out:
- clear_btree_io_tree(&root->dirty_log_pages);
-
- return ret;
+ return 0;
}
/*
@@ -1058,15 +1112,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
}
- down_write(&fs_info->extent_commit_sem);
- switch_commit_root(fs_info->extent_root);
- up_write(&fs_info->extent_commit_sem);
-
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
return 0;
@@ -1123,11 +1178,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
smp_wmb();
if (root->commit_root != root->node) {
- mutex_lock(&root->fs_commit_mutex);
- switch_commit_root(root);
- btrfs_unpin_free_ino(root);
- mutex_unlock(&root->fs_commit_mutex);
-
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -1918,11 +1970,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- switch_commit_root(root->fs_info->tree_root);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- switch_commit_root(root->fs_info->chunk_root);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
update_super_roots(root);
@@ -1932,9 +1988,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
sizeof(*root->fs_info->super_copy));
+ btrfs_update_commit_device_size(root->fs_info);
+ btrfs_update_commit_device_bytes_used(root, cur_trans);
+
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ btrfs_trans_release_chunk_metadata(trans);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -1965,6 +2026,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
+ if (cur_trans->have_free_bgs)
+ btrfs_clear_space_info_full(root->fs_info);
+
root->fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
@@ -2002,6 +2066,7 @@ scrub_continue:
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6107035e5a60..78004d6efb9a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
atomic_t num_writers;
atomic_t use_count;
+ /*
+ * true if there is free bgs operations in this transaction
+ */
+ int have_free_bgs;
+
unsigned long num_joined;
/* Be protected by fs_info->trans_lock when we want to change it. */
@@ -58,6 +63,10 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head ordered_operations;
+ struct list_head pending_chunks;
+ struct list_head switch_commits;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
+ struct list_head deleted_bgs;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -82,6 +91,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
@@ -123,15 +133,21 @@ struct btrfs_pending_snapshot {
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ spin_unlock(&BTRFS_I(inode)->lock);
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root, int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 94e05c1f118a..cbaa73c871a9 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -85,7 +85,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
+ ret = btrfs_search_forward(root, &key, path, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 16d0822d4a3d..36b94c6aa990 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,11 @@
#define LOG_WALK_REPLAY_ALL 3
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -136,8 +139,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
int err = 0;
@@ -152,6 +157,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
@@ -171,6 +180,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->tree_log_mutex);
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ }
mutex_unlock(&root->log_mutex);
return err;
}
@@ -434,11 +447,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
+ path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
- if (ret == -EEXIST) {
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
@@ -469,8 +484,28 @@ insert:
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0)
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+ const u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0) {
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+ btrfs_set_token_inode_size(dst_eb, dst_item,
+ ino_size, &token);
+ }
goto no_copy;
+ }
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -825,7 +860,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- char *name, int namelen)
+ const char *name, int namelen)
{
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
@@ -1262,6 +1297,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1277,7 +1313,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
@@ -1364,9 +1400,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
nlink = ret;
ret = count_inode_extrefs(root, inode, path);
- if (ret == -ENOENT)
- ret = 0;
-
if (ret < 0)
goto out;
@@ -1524,6 +1557,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+ const char *name, const int name_len,
+ const u64 dirid, const u64 ino)
+{
+ struct btrfs_key search_key;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = dirid;
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ return false;
+}
+
+/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -1605,6 +1662,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
found_key.type == log_key.type &&
found_key.offset == log_key.offset &&
btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ update_size = false;
goto out;
}
@@ -1632,10 +1690,17 @@ out:
return ret;
insert:
+ if (name_in_log_ref(root->log_root, name, name_len,
+ key->objectid, log_key.objectid)) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
+ if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
ret = 0;
@@ -1870,6 +1935,104 @@ out:
return ret;
}
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const u64 ino)
+{
+ struct btrfs_key search_key;
+ struct btrfs_path *log_path;
+ int i;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+process_leaf:
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size_nr(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
+ btrfs_release_path(log_path);
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, ino,
+ name, name_len, -1);
+ kfree(name);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ search_key = key;
+ goto again;
+ }
+ kfree(name);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ }
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+out:
+ btrfs_free_path(log_path);
+ btrfs_release_path(path);
+ return ret;
+}
+
+
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
@@ -2023,6 +2186,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ ret = replay_xattr_deletes(wc->trans, root, log,
+ path, key.objectid);
+ if (ret)
+ break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
@@ -2128,7 +2295,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
+ blocksize = root->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
@@ -2342,8 +2509,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2358,36 +2525,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ } while (root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]));
- return 0;
}
static void wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
+
+ while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
finish_wait(&root->log_writer_wait, &wait);
mutex_lock(&root->log_mutex);
}
}
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
@@ -2401,7 +2595,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -2410,6 +2604,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
unsigned long log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
mutex_lock(&root->log_mutex);
log_transid = root->log_transid;
@@ -2417,13 +2612,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(trans, root, root->log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
wait_log_commit(trans, root, root->log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
@@ -2503,13 +2699,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
index2 = log_root_tree->log_transid % 2;
+
+ btrfs_init_log_ctx(&root_log_ctx);
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+
if (atomic_read(&log_root_tree->log_commit[index2])) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_wait_logged_extents(log, log_transid);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2576,12 +2776,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We needn't get log_mutex here because we are sure all
+ * the other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ /*
+ * It is dangerous if log_commit is changed before we set
+ * ->log_ret of log ctx. Because the readers may not get
+ * the return value.
+ */
+ smp_wmb();
+
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ /* See above. */
+ smp_wmb();
atomic_set(&root->log_commit[index1], 0);
+
smp_mb();
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
@@ -2855,10 +3074,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
+ struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
- struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
int err = 0;
@@ -2870,18 +3089,12 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
log = root->log_root;
- max_key.objectid = ino;
- max_key.offset = (u64)-1;
- max_key.type = key_type;
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
- path->keep_locks = 1;
-
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
@@ -2948,6 +3161,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
src = path->nodes[0];
nritems = btrfs_header_nritems(src);
for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+
btrfs_item_key_to_cpu(src, &min_key, i);
if (min_key.objectid != ino || min_key.type != key_type)
@@ -2958,6 +3173,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
err = ret;
goto done;
}
+
+ /*
+ * We must make sure that when we log a directory entry,
+ * the corresponding inode, after log replay, has a
+ * matching link count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our
+ * file inode would have a link count of 1, but we get
+ * two directory entries pointing to the same inode.
+ * After removing one of the names, it would not be
+ * possible to remove the other name, which resulted
+ * always in stale file handle errors, and would not
+ * be possible to rmdir the parent directory, since
+ * its i_size could never decrement to the value
+ * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+ */
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &tmp);
+ if (ctx &&
+ (btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ tmp.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
}
path->slots[0] = nritems;
@@ -3019,7 +3265,8 @@ done:
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
u64 min_key;
u64 max_key;
@@ -3031,7 +3278,7 @@ again:
max_key = 0;
while (1) {
ret = log_dir_items(trans, root, inode, path,
- dst_path, key_type, min_key,
+ dst_path, key_type, ctx, min_key,
&max_key);
if (ret)
return ret;
@@ -3107,7 +3354,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only)
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
{
struct btrfs_map_token token;
@@ -3120,7 +3368,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
} else {
btrfs_set_token_inode_generation(leaf, item,
BTRFS_I(inode)->generation,
@@ -3173,7 +3421,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
btrfs_release_path(path);
return 0;
}
@@ -3182,7 +3430,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
struct extent_buffer *src,
- int start_slot, int nr, int inode_only)
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
{
unsigned long src_offset;
unsigned long dst_offset;
@@ -3229,7 +3478,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS);
+ inode, inode_only == LOG_INODE_EXISTS,
+ logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
@@ -3318,7 +3568,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
@@ -3334,7 +3585,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
@@ -3417,17 +3667,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* First check and see if our csums are on our outstanding ordered
* extents.
*/
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
if (!mod_len)
break;
- if (ordered->inode != inode)
- continue;
-
if (ordered->file_offset + ordered->len <= mod_start ||
mod_start + mod_len <= ordered->file_offset)
continue;
@@ -3470,27 +3715,16 @@ again:
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
&ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
wait_event(ordered->wait, ordered->csum_bytes_left == 0);
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
+ if (ret)
goto unlocked;
- }
}
- btrfs_put_ordered_extent(ordered);
- goto again;
}
- spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
if (!mod_len || ret)
@@ -3523,7 +3757,8 @@ unlocked:
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3581,7 +3816,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3593,6 +3828,228 @@ process:
return ret;
}
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = 0;
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, leaf,
+ start_slot, ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path->nodes[0],
+ start_slot, ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct inode *inode)
+{
+ int ret;
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = 1;
+ search_path->skip_locking = 1;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+ di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+ search_path, parent,
+ name, this_name_len, 0);
+ if (di && !IS_ERR(di)) {
+ ret = 1;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(search_path);
+ kfree(name);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -3608,8 +4065,11 @@ process:
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
@@ -3617,6 +4077,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
int err = 0;
int ret;
int nritems;
@@ -3624,6 +4085,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 logged_isize = 0;
+ bool need_log_inode_item = true;
+ bool xattrs_logged = false;
log = root->log_root;
@@ -3653,20 +4118,27 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /* Only run delayed items if we are a dir or a new file */
+ /*
+ * Only run delayed items if we are a dir or a new file.
+ * Otherwise commit the delayed inode only, which is needed in
+ * order for the log replay code to mark inodes for link count
+ * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ */
if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
- }
+ else
+ ret = btrfs_commit_inode_delayed_inode(inode);
+
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ btrfs_get_logged_extents(inode, &logged_list);
/*
* a brute force approach to making sure we get the most uptodate
@@ -3679,14 +4151,42 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
+ if (inode_only == LOG_INODE_EXISTS) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ err = logged_inode_size(log, inode, path,
+ &logged_isize);
+ if (err)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags)) {
+ &BTRFS_I(inode)->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
@@ -3695,11 +4195,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
- ret = log_inode_item(trans, log, dst_path, inode);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
goto log_extents;
}
@@ -3708,11 +4203,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
err = ret;
goto out_unlock;
}
- path->keep_locks = 1;
while (1) {
ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key, &max_key,
+ ret = btrfs_search_forward(root, &min_key,
path, trans->transid);
if (ret != 0)
break;
@@ -3723,6 +4217,44 @@ again:
if (min_key.type > max_key.type)
break;
+ if (min_key.type == BTRFS_INODE_ITEM_KEY)
+ need_log_inode_item = false;
+
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+ BTRFS_I(inode)->generation == trans->transid) {
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0],
+ &min_key, inode);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ } else if (ret > 0) {
+ err = 1;
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = trans->transid;
+ goto out_unlock;
+ }
+ }
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path->nodes[0],
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ if (ret) {
+ btrfs_release_path(path);
+ continue;
+ }
+ goto next_slot;
+ }
+
src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
@@ -3734,7 +4266,8 @@ again:
}
ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only,
+ logged_isize);
if (ret) {
err = ret;
goto out_unlock;
@@ -3753,7 +4286,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, src,
ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only, logged_isize);
if (ret) {
err = ret;
goto out_unlock;
@@ -3773,7 +4306,7 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only, logged_isize);
if (ret) {
err = ret;
goto out_unlock;
@@ -3781,37 +4314,82 @@ next_slot:
ins_nr = 0;
}
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ if (err)
+ goto out_unlock;
+ xattrs_logged = true;
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ err = log_inode_item(trans, log, dst_path, inode);
+ if (!err && !xattrs_logged) {
+ err = btrfs_log_all_xattrs(trans, root, inode, path,
+ dst_path);
+ btrfs_release_path(path);
+ }
+ if (err)
+ goto out_unlock;
+ }
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
if (ret) {
err = ret;
goto out_unlock;
}
- } else {
- struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ } else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
- write_lock(&tree->lock);
- list_for_each_entry_safe(em, n, &tree->modified_extents, list)
- list_del_init(&em->list);
- write_unlock(&tree->lock);
+ write_lock(&em_tree->lock);
+ /*
+ * We can't just remove every em if we're called for a ranged
+ * fsync - that is, one that doesn't cover the whole possible
+ * file range (0 to LLONG_MAX). This is because we can have
+ * em's that fall outside the range we're logging and therefore
+ * their ordered operations haven't completed yet
+ * (btrfs_finish_ordered_io() not invoked yet). This means we
+ * didn't get their respective file extent item in the fs/subvol
+ * tree yet, and need to let the next fast fsync (one which
+ * consults the list of modified extent maps) find the em so
+ * that it logs a matching file extent item and waits for the
+ * respective ordered operation to complete (if it's still
+ * running).
+ *
+ * Removing every em outside the range we're logging would make
+ * the next fast fsync not log their matching file extent items,
+ * therefore making us lose data after a log replay.
+ */
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+ list) {
+ const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+ if (em->mod_start >= start && mod_end <= end)
+ list_del_init(&em->list);
+ }
+ write_unlock(&em_tree->lock);
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path);
+ ret = log_directory_changes(trans, root, inode, path, dst_path,
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+ spin_unlock(&BTRFS_I(inode)->lock);
out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -3820,6 +4398,42 @@ out_unlock:
}
/*
+ * Check if we must fallback to a transaction commit when logging an inode.
+ * This must be called after logging the inode and is used only in the context
+ * when fsyncing an inode requires the need to log some other inode - in which
+ * case we can't lock the i_mutex of each other inode we need to log as that
+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
+ * log inodes up or down in the hierarchy) or rename operations for example. So
+ * we take the log_mutex of the inode after we have logged it and then check for
+ * its last_unlink_trans value - this is safe because any task setting
+ * last_unlink_trans must take the log_mutex and it must do this before it does
+ * the actual unlink operation, so if we do this check before a concurrent task
+ * sets last_unlink_trans it means we've logged a consistent version/state of
+ * all the inode items, otherwise we are not sure and must do a transaction
+ * commit (the concurrent task migth have only updated last_unlink_trans before
+ * we logged the inode or it might have also done the unlink).
+ */
+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ bool ret = false;
+
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
+ /*
+ * Make sure any commits to the log are forced to be full
+ * commits.
+ */
+ fs_info->last_trans_log_full_commit = trans->transid;
+ ret = true;
+ }
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ return ret;
+}
+
+/*
* follow the dentry parent pointers up the chain and see if any
* of the directories in it require a full commit before they can
* be logged. Returns zero if nothing special needs to be done or 1 if
@@ -3832,7 +4446,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
u64 last_committed)
{
int ret = 0;
- struct btrfs_root *root;
struct dentry *old_parent = NULL;
struct inode *orig_inode = inode;
@@ -3864,15 +4477,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->logged_trans = trans->transid;
smp_mb();
- if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
- root = BTRFS_I(inode)->root;
-
- /*
- * make sure any commits to the log are forced
- * to be full commits
- */
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
+ if (btrfs_must_commit_transaction(trans, inode)) {
ret = 1;
break;
}
@@ -3894,6 +4499,275 @@ out:
return ret;
}
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names. This does not result in a problem because if a dir_item key is
+ * logged but its matching dir_index key is not logged, at log replay time we
+ * don't use it to replay the respective name (see replay_one_name()). On the
+ * other hand if only the dir_index key ends up being logged, the respective
+ * name is added to the fs/subvol tree with both the dir_item and dir_index
+ * keys created (see replay_one_name()).
+ * The directory's inode item with a wrong i_size is not a problem as well,
+ * since we don't use it at log replay time to set the i_size in the inode
+ * item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ dir_elem->ino = btrfs_ino(start_inode);
+ list_add_tail(&dir_elem->list, &dir_list);
+
+ while (!list_empty(&dir_list)) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ int nritems;
+ int i;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+ list);
+ if (ret)
+ goto next_dir_inode;
+
+ min_key.objectid = dir_elem->ino;
+ min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+
+process_leaf:
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ struct btrfs_dir_list *new_dir_elem;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != dir_elem->ino ||
+ min_key.type != BTRFS_DIR_ITEM_KEY)
+ goto next_dir_inode;
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid &&
+ type != BTRFS_FT_DIR)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto next_dir_inode;
+ }
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, di_inode))
+ ret = 1;
+ iput(di_inode);
+ if (ret)
+ goto next_dir_inode;
+ if (ctx->log_new_dentries) {
+ new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+ GFP_NOFS);
+ if (!new_dir_elem) {
+ ret = -ENOMEM;
+ goto next_dir_inode;
+ }
+ new_dir_elem->ino = di_key.objectid;
+ list_add_tail(&new_dir_elem->list, &dir_list);
+ }
+ break;
+ }
+ if (i == nritems) {
+ ret = btrfs_next_leaf(log, path);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+ goto process_leaf;
+ }
+ if (min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+next_dir_inode:
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+ struct inode *dir_inode;
+
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ inode_key.objectid = btrfs_inode_extref_parent(
+ leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ inode_key.objectid = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+ root, NULL);
+ /* If parent inode was deleted, skip it. */
+ if (IS_ERR(dir_inode))
+ continue;
+
+ ret = btrfs_log_inode(trans, root, dir_inode,
+ LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, dir_inode))
+ ret = 1;
+ iput(dir_inode);
+ if (ret)
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -3902,13 +4776,19 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct dentry *parent,
+ const loff_t start,
+ const loff_t end,
+ int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
+ bool log_dentries = false;
+ struct inode *orig_inode = inode;
sb = inode->i_sb;
@@ -3939,11 +4819,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
goto end_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
if (ret)
goto end_trans;
@@ -3960,7 +4840,56 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- inode_only = LOG_INODE_EXISTS;
+ if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+ log_dentries = true;
+
+ /*
+ * On unlink we must make sure all our current and old parent directores
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar as the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the inode file with name foo3
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ ret = btrfs_log_all_parents(trans, inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
while (1) {
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
@@ -3969,9 +4898,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
- if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode, inode_only);
+ if (BTRFS_I(inode)->generation > last_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
}
@@ -3982,13 +4912,19 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
dput(old_parent);
old_parent = parent;
}
- ret = 0;
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ else
+ ret = 0;
end_trans:
dput(old_parent);
if (ret < 0) {
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -4001,12 +4937,16 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ start, end, 0, ctx);
dput(parent);
return ret;
@@ -4164,6 +5104,9 @@ error:
* They revolve around files there were unlinked from the directory, and
* this function updates the parent directory so that a full commit is
* properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
@@ -4179,8 +5122,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ }
/*
* if this directory was already logged any new
@@ -4211,7 +5157,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
return;
record:
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+}
+
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and tree of tree roots trees, etc) are done.
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct inode *dir)
+{
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
}
/*
@@ -4231,7 +5199,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* this will force the logging code to walk the dentry chain
* up for the file
*/
- if (S_ISREG(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode))
BTRFS_I(inode)->last_unlink_trans = trans->transid;
/*
@@ -4244,6 +5212,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+ LLONG_MAX, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..ff16b75fae68 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,30 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ bool log_new_dentries;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_new_dentries = false;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -43,6 +59,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct inode *dir);
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *old_dir,
struct dentry *parent);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 280a6ad05db6..525fae984a52 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -49,14 +49,47 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-static void lock_chunks(struct btrfs_root *root)
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
- mutex_lock(&root->fs_info->chunk_mutex);
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+ if (!fs_devs)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&fs_devs->device_list_mutex);
+
+ INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->resized_devices);
+ INIT_LIST_HEAD(&fs_devs->alloc_list);
+ INIT_LIST_HEAD(&fs_devs->list);
+
+ return fs_devs;
}
-static void unlock_chunks(struct btrfs_root *root)
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid: a pointer to UUID for this FS. If NULL a new UUID is
+ * generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error. Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
- mutex_unlock(&root->fs_info->chunk_mutex);
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = __alloc_fs_devices();
+ if (IS_ERR(fs_devs))
+ return fs_devs;
+
+ if (fsid)
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ else
+ generate_random_uuid(fs_devs->fsid);
+
+ return fs_devs;
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -98,6 +131,29 @@ void btrfs_cleanup_fs_uuids(void)
}
}
+static struct btrfs_device *__alloc_device(void)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_NOFS);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->resized_list);
+
+ spin_lock_init(&dev->io_lock);
+
+ spin_lock_init(&dev->reada_lock);
+ atomic_set(&dev->reada_in_flight, 0);
+ btrfs_device_data_ordered_init(dev);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+ return dev;
+}
+
static noinline struct btrfs_device *__find_device(struct list_head *head,
u64 devid, u8 *uuid)
{
@@ -401,16 +457,14 @@ static noinline int device_list_add(const char *path,
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!fs_devices)
- return -ENOMEM;
- INIT_LIST_HEAD(&fs_devices->devices);
- INIT_LIST_HEAD(&fs_devices->alloc_list);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
+
list_add(&fs_devices->list, &fs_uuids);
- memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
- mutex_init(&fs_devices->device_list_mutex);
+
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
@@ -420,17 +474,12 @@ static noinline int device_list_add(const char *path,
if (fs_devices->opened)
return -EBUSY;
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device) {
+ device = btrfs_alloc_device(NULL, &devid,
+ disk_super->dev_item.uuid);
+ if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- return -ENOMEM;
+ return PTR_ERR(device);
}
- device->devid = devid;
- device->dev_stats_valid = 0;
- device->work.func = pending_bios_fn;
- memcpy(device->uuid, disk_super->dev_item.uuid,
- BTRFS_UUID_SIZE);
- spin_lock_init(&device->io_lock);
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
@@ -438,23 +487,14 @@ static noinline int device_list_add(const char *path,
return -ENOMEM;
}
rcu_assign_pointer(device->name, name);
- INIT_LIST_HEAD(&device->dev_alloc_list);
-
- /* init readahead state */
- spin_lock_init(&device->reada_lock);
- device->reada_curr_zone = NULL;
- atomic_set(&device->reada_in_flight, 0);
- device->reada_next = 0;
- INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
ret = 1;
device->fs_devices = fs_devices;
- fs_devices->num_devices++;
} else if (!device->name || strcmp(device->name->str, path)) {
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
@@ -482,25 +522,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *device;
struct btrfs_device *orig_dev;
- fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!fs_devices)
- return ERR_PTR(-ENOMEM);
+ fs_devices = alloc_fs_devices(orig->fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
- INIT_LIST_HEAD(&fs_devices->devices);
- INIT_LIST_HEAD(&fs_devices->alloc_list);
- INIT_LIST_HEAD(&fs_devices->list);
- mutex_init(&fs_devices->device_list_mutex);
fs_devices->latest_devid = orig->latest_devid;
fs_devices->latest_trans = orig->latest_trans;
fs_devices->total_devices = orig->total_devices;
- memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device)
+ device = btrfs_alloc_device(NULL, &orig_dev->devid,
+ orig_dev->uuid);
+ if (IS_ERR(device))
goto error;
/*
@@ -514,13 +550,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
}
rcu_assign_pointer(device->name, name);
- device->devid = orig_dev->devid;
- device->work.func = pending_bios_fn;
- memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
- spin_lock_init(&device->io_lock);
- INIT_LIST_HEAD(&device->dev_list);
- INIT_LIST_HEAD(&device->dev_alloc_list);
-
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
@@ -993,13 +1022,69 @@ out:
return ret;
}
+static int contains_pending_extent(struct btrfs_transaction *transaction,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct extent_map *em;
+ struct list_head *search_list = &fs_info->pinned_chunks;
+ int ret = 0;
+ u64 physical_start = *start;
+
+ if (transaction)
+ search_list = &transaction->pending_chunks;
+again:
+ list_for_each_entry(em, search_list, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ u64 end;
+
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= physical_start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ physical_start)
+ continue;
+ /*
+ * Make sure that while processing the pinned list we do
+ * not override our *start with a lower value, because
+ * we can have pinned chunks that fall within this
+ * device hole and that have lower physical addresses
+ * than the pending chunks we processed before. If we
+ * do not take this special care we can end up getting
+ * 2 pending chunks that start at the same physical
+ * device offsets because the end offset of a pinned
+ * chunk can be equal to the start offset of some
+ * pending chunk.
+ */
+ end = map->stripes[i].physical + em->orig_block_len;
+ if (end > *start) {
+ *start = end;
+ ret = 1;
+ }
+ }
+ }
+ if (search_list != &fs_info->pinned_chunks) {
+ search_list = &fs_info->pinned_chunks;
+ goto again;
+ }
+
+ return ret;
+}
+
+
/*
- * find_free_dev_extent - find free space in the specified device
- * @device: the device which we search the free space in
- * @num_bytes: the size of the free space that we need
- * @start: store the start of the free space.
- * @len: the size of the free space. that we find, or the size of the max
- * free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size
+ * of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
@@ -1013,8 +1098,9 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1024,34 +1110,36 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
- u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
- /* FIXME use last free of some kind */
-
- /* we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
*/
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
max_hole_start = search_start;
max_hole_size = 0;
hole_size = 0;
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
- goto error;
+ goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto error;
- }
path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
@@ -1092,6 +1180,21 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
if (key.offset > search_start) {
hole_size = key.offset - search_start;
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(transaction, device,
+ &search_start,
+ hole_size)) {
+ if (key.offset >= search_start) {
+ hole_size = key.offset - search_start;
+ } else {
+ WARN_ON_ONCE(1);
+ hole_size = 0;
+ }
+ }
+
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
@@ -1135,6 +1238,11 @@ next:
max_hole_size = hole_size;
}
+ if (contains_pending_extent(transaction, device, &search_start, hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
@@ -1143,16 +1251,24 @@ next:
out:
btrfs_free_path(path);
-error:
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ /* FIXME use last free of some kind */
+ return find_free_dev_extent_start(trans->transaction, device,
+ num_bytes, 0, start, len);
+}
+
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
- u64 start)
+ u64 start, u64 *dev_extent_len)
{
int ret;
struct btrfs_path *path;
@@ -1194,17 +1310,14 @@ again:
goto out;
}
- if (device->bytes_used > 0) {
- u64 len = btrfs_dev_extent_length(leaf, extent);
- device->bytes_used -= len;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += len;
- spin_unlock(&root->fs_info->free_chunk_lock);
- }
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
+ } else {
+ trans->transaction->have_free_bgs = 1;
}
out:
btrfs_free_path(path);
@@ -1255,47 +1368,22 @@ out:
return ret;
}
-static noinline int find_next_chunk(struct btrfs_root *root,
- u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- struct btrfs_chunk *chunk;
- struct btrfs_key found_key;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = objectid;
- key.offset = (u64)-1;
- key.type = BTRFS_CHUNK_ITEM_KEY;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
-
- BUG_ON(ret == 0); /* Corruption */
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
- ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
- *offset = 0;
- } else {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
- if (found_key.objectid != objectid)
- *offset = 0;
- else {
- chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_chunk);
- *offset = found_key.offset +
- btrfs_chunk_length(path->nodes[0], chunk);
- }
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
}
- ret = 0;
-error:
- btrfs_free_path(path);
+ read_unlock(&em_tree->lock);
+
return ret;
}
@@ -1376,8 +1464,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1417,7 +1507,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
- lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -1433,7 +1522,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
goto out;
out:
btrfs_free_path(path);
- unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
@@ -1610,15 +1698,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == cur_devices)
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
break;
+ }
fs_devices = fs_devices->seed;
}
- fs_devices->seed = cur_devices->seed;
cur_devices->seed = NULL;
- lock_chunks(root);
__btrfs_close_devices(cur_devices);
- unlock_chunks(root);
free_fs_devices(cur_devices);
}
@@ -1784,9 +1871,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
if (!fs_devices->seeding)
return -EINVAL;
- seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
- if (!seed_devices)
- return -ENOMEM;
+ seed_devices = __alloc_fs_devices();
+ if (IS_ERR(seed_devices))
+ return PTR_ERR(seed_devices);
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
@@ -1805,12 +1892,12 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+ lock_chunks(root);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- list_for_each_entry(device, &seed_devices->devices, dev_list) {
- device->fs_devices = seed_devices;
- }
+ unlock_chunks(root);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
@@ -1821,6 +1908,8 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
generate_random_uuid(fs_devices->fsid);
memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
@@ -1947,10 +2036,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device) {
+ device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+ if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
- ret = -ENOMEM;
+ ret = PTR_ERR(device);
goto error;
}
@@ -1962,13 +2051,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
rcu_assign_pointer(device->name, name);
- ret = find_next_devid(root, &device->devid);
- if (ret) {
- rcu_string_free(device->name);
- kfree(device);
- goto error;
- }
-
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
rcu_string_free(device->name);
@@ -1977,21 +2059,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- lock_chunks(root);
-
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
device->writeable = 1;
- device->work.func = pending_bios_fn;
- generate_random_uuid(device->uuid);
- spin_lock_init(&device->io_lock);
device->generation = trans->transid;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2007,11 +2085,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->fs_devices = root->fs_info->fs_devices;
- /*
- * we don't want write_supers to jump in here with our device
- * half setup
- */
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ lock_chunks(root);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
@@ -2037,34 +2112,39 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ lock_chunks(root);
ret = init_first_rw_device(trans, root, device);
+ unlock_chunks(root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+ }
+
+ ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ if (seeding_dev) {
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
- } else {
- ret = btrfs_add_device(trans, root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto error_trans;
- }
}
- /*
- * we've got more storage, clear any full flags on the space
- * infos
- */
- btrfs_clear_space_info_full(root->fs_info);
-
- unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
@@ -2094,7 +2174,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return ret;
error_trans:
- unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
kfree(device);
@@ -2108,6 +2187,7 @@ error:
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
@@ -2116,30 +2196,45 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *devices;
struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding)
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
+ }
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
- if (IS_ERR(bdev))
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
+ }
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
+ btrfs_err(fs_info, "target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device) {
- ret = -ENOMEM;
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
goto error;
}
@@ -2156,16 +2251,16 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
device->can_discard = 1;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
device->writeable = 1;
- device->work.func = pending_bios_fn;
- generate_random_uuid(device->uuid);
- device->devid = BTRFS_DEV_REPLACE_DEVID;
- spin_lock_init(&device->io_lock);
device->generation = 0;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
- device->total_bytes = i_size_read(bdev->bd_inode);
- device->disk_total_bytes = device->total_bytes;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ ASSERT(list_empty(&srcdev->resized_list));
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
@@ -2236,8 +2331,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
- btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
@@ -2245,40 +2342,44 @@ out:
return ret;
}
-static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
- u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 diff = new_size - device->total_bytes;
+ struct btrfs_fs_devices *fs_devices;
+ u64 old_total;
+ u64 diff;
if (!device->writeable)
return -EACCES;
+
+ lock_chunks(device->dev_root);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = new_size - device->total_bytes;
+
if (new_size <= device->total_bytes ||
- device->is_tgtdev_for_dev_replace)
+ device->is_tgtdev_for_dev_replace) {
+ unlock_chunks(device->dev_root);
return -EINVAL;
+ }
+
+ fs_devices = device->dev_root->fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
- device->total_bytes = new_size;
- device->disk_total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->dev_root->fs_info);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &fs_devices->resized_devices);
+ unlock_chunks(device->dev_root);
return btrfs_update_device(trans, device);
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 new_size)
-{
- int ret;
- lock_chunks(device->dev_root);
- ret = __btrfs_grow_device(trans, device, new_size);
- unlock_chunks(device->dev_root);
- return ret;
-}
-
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
@@ -2330,6 +2431,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
u32 cur;
struct btrfs_key key;
+ lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2359,94 +2461,149 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
cur += len;
}
}
+ unlock_chunks(root);
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
- struct btrfs_root *extent_root;
- struct btrfs_trans_handle *trans;
struct extent_map *em;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
- int ret;
- int i;
+ u64 dev_extent_len = 0;
+ u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ u64 chunk_tree = root->fs_info->chunk_root->objectid;
+ int i, ret = 0;
+ /* Just in case */
root = root->fs_info->chunk_root;
- extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
- ret = btrfs_can_relocate(extent_root, chunk_offset);
- if (ret)
- return -ENOSPC;
-
- /* step one, relocate all the extents inside this chunk */
- ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
- return ret;
- }
-
- lock_chunks(root);
-
- /*
- * step two, delete the device extents and the
- * chunk tree entries
- */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
- BUG_ON(!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset);
+ if (!em || em->start > chunk_offset ||
+ em->start + em->len < chunk_offset) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doens't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ if (em)
+ free_extent_map(em);
+ return -EINVAL;
+ }
map = (struct map_lookup *)em->bdev;
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, extent_root, map->type, false);
+ unlock_chunks(root->fs_info->chunk_root);
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
- map->stripes[i].physical);
- BUG_ON(ret);
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ lock_chunks(root);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(root->fs_info);
+ unlock_chunks(root);
+ }
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
-
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
}
- ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
- BUG_ON(ret);
-
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- kfree(map);
- em->bdev = NULL;
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
- /* once for the tree */
- free_extent_map(em);
+out:
/* once for us */
free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct btrfs_root *extent_root;
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+
+ /*
+ * Prevent races with automatic removal of unused block groups.
+ * After we relocate and before we remove the chunk with offset
+ * chunk_offset, automatic removal of the block group can kick in,
+ * resulting in a failure when calling btrfs_remove_chunk() below.
+ *
+ * Make sure to acquire this mutex before doing a tree search (dev
+ * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+ * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+ * we release the path used to search the chunk/dev tree and before
+ * the current task acquires this mutex and calls us.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ if (ret)
+ return ret;
+
+ /*
+ * step two, flag the chunk as removed and let
+ * btrfs_delete_unused_bgs() remove it.
+ */
+ block_group = btrfs_lookup_block_group(root->fs_info, chunk_offset);
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
return 0;
}
@@ -2474,13 +2631,18 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto error;
+ }
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
@@ -2503,6 +2665,7 @@ again:
else if (ret)
BUG();
}
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
@@ -2892,11 +3055,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
- old_size = device->total_bytes;
+ old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
- device->total_bytes - device->bytes_used > size_to_free ||
+ btrfs_device_get_total_bytes(device) -
+ btrfs_device_get_bytes_used(device) > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
@@ -2937,9 +3101,12 @@ again:
goto error;
}
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ }
/*
* this shouldn't happen, it means the last relocate
@@ -2951,6 +3118,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ret = 0;
break;
}
@@ -2959,12 +3127,10 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (found_key.objectid != key.objectid)
- break;
-
- /* chunk zero is special */
- if (found_key.offset == 0)
+ if (found_key.objectid != key.objectid) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
break;
+ }
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -2977,10 +3143,13 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
- if (!ret)
+ if (!ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
+ }
if (counting) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -2991,6 +3160,7 @@ again:
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
@@ -3001,6 +3171,8 @@ again:
spin_unlock(&fs_info->balance_lock);
}
loop:
+ if (found_key.offset == 0)
+ break;
key.offset = found_key.offset - 1;
}
@@ -3440,12 +3612,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
int slot;
int failed = 0;
bool retried = false;
+ bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
- u64 old_size = device->total_bytes;
- u64 diff = device->total_bytes - new_size;
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
@@ -3458,7 +3631,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
lock_chunks(root);
- device->total_bytes = new_size;
+ btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
@@ -3473,11 +3646,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto done;
+ }
ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
@@ -3491,6 +3669,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -3499,6 +3678,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -3510,6 +3690,7 @@ again:
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -3522,15 +3703,6 @@ again:
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
- lock_chunks(root);
-
- device->total_bytes = old_size;
- if (device->writeable)
- device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
goto done;
}
@@ -3543,20 +3715,59 @@ again:
lock_chunks(root);
- device->disk_total_bytes = new_size;
- /* Now btrfs_update_device() will change the on-disk size. */
- ret = btrfs_update_device(trans, device);
- if (ret) {
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
- goto done;
+ /*
+ * We checked in the above loop all device extents that were already in
+ * the device tree. However before we have updated the device's
+ * total_bytes to the new size, we might have had chunk allocations that
+ * have not complete yet (new block groups attached to transaction
+ * handles), and therefore their device extents were not yet in the
+ * device tree and we missed them in the loop above. So if we have any
+ * pending chunk using a device extent that overlaps the device range
+ * that we can not use anymore, commit the current transaction and
+ * repeat the search on the device tree - this way we guarantee we will
+ * not have chunks using device extents that end beyond 'new_size'.
+ */
+ if (!checked_pending_chunks) {
+ u64 start = new_size;
+ u64 len = old_size - new_size;
+
+ if (contains_pending_extent(trans->transaction, device,
+ &start, len)) {
+ unlock_chunks(root);
+ checked_pending_chunks = true;
+ failed = 0;
+ retried = false;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto done;
+ goto again;
+ }
}
+
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &root->fs_info->fs_devices->resized_devices);
+
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
+
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
+ if (ret) {
+ lock_chunks(root);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ }
return ret;
}
@@ -3646,10 +3857,8 @@ static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
};
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes_out, u64 *stripe_size_out,
- u64 start, u64 type)
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3753,7 +3962,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(device,
+ ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -3833,73 +4042,58 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->type = type;
map->sub_stripes = sub_stripes;
- *map_ret = map;
num_bytes = stripe_size * (num_stripes / ncopies);
- *stripe_size_out = stripe_size;
- *num_bytes_out = num_bytes;
-
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
+ em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
write_unlock(&em_tree->lock);
if (ret) {
free_extent_map(em);
goto error;
}
- for (i = 0; i < map->num_stripes; ++i) {
- struct btrfs_device *device;
- u64 dev_offset;
-
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
-
- ret = btrfs_alloc_dev_extent(trans, device,
- info->chunk_root->root_key.objectid,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, stripe_size);
- if (ret)
- goto error_dev_extent;
- }
-
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
- if (ret) {
- i = map->num_stripes - 1;
- goto error_dev_extent;
+ if (ret)
+ goto error_del_extent;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
}
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
free_extent_map(em);
kfree(devices_info);
return 0;
-error_dev_extent:
- for (; i >= 0; i--) {
- struct btrfs_device *device;
- int err;
-
- device = map->stripes[i].dev;
- err = btrfs_free_dev_extent(trans, device, start);
- if (err) {
- btrfs_abort_transaction(trans, extent_root, err);
- break;
- }
- }
+error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
@@ -3908,57 +4102,85 @@ error_dev_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
+ /* One for the pending_chunks list reference */
+ free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
- struct map_lookup *map, u64 chunk_offset,
- u64 chunk_size, u64 stripe_size)
+ u64 chunk_offset, u64 chunk_size)
{
- u64 dev_offset;
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- size_t item_size = btrfs_chunk_item_size(map->num_stripes);
- int index = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
int ret;
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk)
- return -ENOMEM;
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- device->bytes_used += stripe_size;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out_free;
- index++;
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
}
- spin_lock(&extent_root->fs_info->free_chunk_lock);
- extent_root->fs_info->free_chunk_space -= (stripe_size *
- map->num_stripes);
- spin_unlock(&extent_root->fs_info->free_chunk_lock);
-
- index = 0;
stripe = &chunk->stripe;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
- index++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -3976,7 +4198,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
@@ -3986,8 +4207,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
}
-out_free:
+out:
kfree(chunk);
+ free_extent_map(em);
return ret;
}
@@ -4002,27 +4224,10 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
- u64 chunk_size;
- u64 stripe_size;
- struct map_lookup *map;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
- int ret;
-
- ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &chunk_offset);
- if (ret)
- return ret;
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, type);
- if (ret)
- return ret;
-
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret)
- return ret;
- return 0;
+ ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4031,66 +4236,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
u64 sys_chunk_offset;
- u64 chunk_size;
- u64 sys_chunk_size;
- u64 stripe_size;
- u64 sys_stripe_size;
u64 alloc_profile;
- struct map_lookup *map;
- struct map_lookup *sys_map;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
- ret = find_next_chunk(fs_info->chunk_root,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
- if (ret)
- return ret;
-
+ chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
- &stripe_size, chunk_offset, alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = chunk_offset + chunk_size;
-
+ sys_chunk_offset = find_next_chunk(root->fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
- &sys_chunk_size, &sys_stripe_size,
- sys_chunk_offset, alloc_profile);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- ret = btrfs_add_device(trans, fs_info->chunk_root, device);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- /*
- * Modifying chunk tree needs allocating new blocks from both
- * system block group and metadata block group. So we only can
- * do operations require modifying the chunk tree after both
- * block groups were created.
- */
- ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
- chunk_size, stripe_size);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto out;
- }
-
- ret = __finish_chunk_alloc(trans, extent_root, sys_map,
- sys_chunk_offset, sys_chunk_size,
- sys_stripe_size);
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
-
-out:
-
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
return ret;
}
@@ -4141,7 +4302,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -5106,24 +5266,73 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
- device = kzalloc(sizeof(*device), GFP_NOFS);
- if (!device)
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ if (IS_ERR(device))
return NULL;
- list_add(&device->dev_list,
- &fs_devices->devices);
- device->dev_root = root->fs_info->dev_root;
- device->devid = devid;
- device->work.func = pending_bios_fn;
+
+ list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
- device->missing = 1;
fs_devices->num_devices++;
+
+ device->missing = 1;
+ device->dev_root = root->fs_info->dev_root;
fs_devices->missing_devices++;
- spin_lock_init(&device->io_lock);
- INIT_LIST_HEAD(&device->dev_alloc_list);
- memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+
return device;
}
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info: used only for generating a new devid, can be NULL if
+ * devid is provided (i.e. @devid != NULL).
+ * @devid: a pointer to devid for this device. If NULL a new devid
+ * is generated.
+ * @uuid: a pointer to UUID for this device. If NULL a new UUID
+ * is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error. Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid)
+{
+ struct btrfs_device *dev;
+ u64 tmp;
+
+ if (!devid && !fs_info) {
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ dev = __alloc_device();
+ if (IS_ERR(dev))
+ return dev;
+
+ if (devid)
+ tmp = *devid;
+ else {
+ int ret;
+
+ ret = find_next_devid(fs_info->chunk_root, &tmp);
+ if (ret) {
+ kfree(dev);
+ return ERR_PTR(ret);
+ }
+ }
+ dev->devid = tmp;
+
+ if (uuid)
+ memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+ else
+ generate_random_uuid(dev->uuid);
+
+ dev->work.func = pending_bios_fn;
+
+ return dev;
+}
+
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
@@ -5164,6 +5373,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5188,7 +5398,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5196,7 +5405,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5222,7 +5430,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5765,3 +5975,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
return 0;
}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *curr, *next;
+
+ if (list_empty(&fs_devices->resized_devices))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ lock_chunks(fs_info->dev_root);
+ list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+ resized_list) {
+ list_del_init(&curr->resized_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ }
+ unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ int i;
+
+ if (list_empty(&transaction->pending_chunks))
+ return;
+
+ /* In order to kick the device replace finish process */
+ lock_chunks(root);
+ list_for_each_entry(em, &transaction->pending_chunks, list) {
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
+ }
+ }
+ unlock_chunks(root);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ed312ffd8bb8..3b5fbd073180 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -32,6 +32,19 @@ struct btrfs_pending_bios {
struct bio *tail;
};
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
@@ -52,6 +65,10 @@ struct btrfs_device {
int can_discard;
int is_tgtdev_for_dev_replace;
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
+
spinlock_t io_lock;
struct block_device *bdev;
@@ -88,6 +105,23 @@ struct btrfs_device {
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by device_list_mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+ /*
+ * used to manage the device which is resized
+ *
+ * It is protected by chunk_lock.
+ */
+ struct list_head resized_list;
+
/* per-device scrub information */
struct scrub_ctx *scrub_device;
@@ -115,6 +149,73 @@ struct btrfs_device {
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
@@ -137,6 +238,7 @@ struct btrfs_fs_devices {
struct mutex device_list_mutex;
struct list_head devices;
+ struct list_head resized_devices;
/* devices not currently being allocated */
struct list_head alloc_list;
struct list_head list;
@@ -278,6 +380,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
@@ -288,6 +393,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
@@ -296,7 +402,11 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *max_avail);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_root *root,
@@ -312,6 +422,12 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
int btrfs_scratch_superblock(struct btrfs_device *device);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset);
+
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
{
@@ -347,4 +463,20 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
{
btrfs_dev_stat_set(dev, index, 0);
}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
#endif