author     Takashi Iwai <tiwai@suse.de>  2017-05-02 16:56:31 +0200
committer  Takashi Iwai <tiwai@suse.de>  2017-05-02 16:56:31 +0200
commit     a9c8266de914cd9051f97184e05d49a16a5ee121 (patch)
tree       f55e3f03699ed3d071596de82af6c0fcb22b3cc9
parent     d0a2065e5a43f5d40921a71e96c24694bf8cc31b (diff)
Revert "- md/raid5: sort bios..." (bsc#1037027)rpm-4.4.63-2
We revert the recent MD changes due to regressions for RAID0. suse-commit: 27e2849cbe8faa13c9131647d343d87e46f24970
-rw-r--r--  Documentation/device-mapper/cache.txt  |  4
-rw-r--r--  block/blk-mq.c  |  32
-rw-r--r--  drivers/md/bcache/util.h  |  1
-rw-r--r--  drivers/md/bitmap.c  |  2
-rw-r--r--  drivers/md/dm-cache-metadata.c  |  355
-rw-r--r--  drivers/md/dm-cache-metadata.h  |  11
-rw-r--r--  drivers/md/dm-cache-target.c  |  38
-rw-r--r--  drivers/md/dm-raid.c  |  8
-rw-r--r--  drivers/md/dm-round-robin.c  |  67
-rw-r--r--  drivers/md/dm-rq.c  |  1
-rw-r--r--  drivers/md/dm-verity-fec.c  |  18
-rw-r--r--  drivers/md/dm-verity-fec.h  |  4
-rw-r--r--  drivers/md/faulty.c  |  2
-rw-r--r--  drivers/md/linear.c  |  76
-rw-r--r--  drivers/md/md-cluster.c  |  4
-rw-r--r--  drivers/md/md.c  |  224
-rw-r--r--  drivers/md/md.h  |  67
-rw-r--r--  drivers/md/persistent-data/dm-array.c  |  21
-rw-r--r--  drivers/md/persistent-data/dm-array.h  |  1
-rw-r--r--  drivers/md/persistent-data/dm-bitset.c  |  146
-rw-r--r--  drivers/md/persistent-data/dm-bitset.h  |  39
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c  |  8
-rw-r--r--  drivers/md/persistent-data/dm-btree.c  |  18
-rw-r--r--  drivers/md/persistent-data/dm-btree.h  |  1
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c  |  16
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c  |  4
-rw-r--r--  drivers/md/raid0.c  |  73
-rw-r--r--  drivers/md/raid1.c  |  645
-rw-r--r--  drivers/md/raid1.h  |  12
-rw-r--r--  drivers/md/raid10.c  |  718
-rw-r--r--  drivers/md/raid10.h  |  1
-rw-r--r--  drivers/md/raid5-cache.c  |  359
-rw-r--r--  drivers/md/raid5-log.h  |  31
-rw-r--r--  drivers/md/raid5-ppl.c  |  114
-rw-r--r--  drivers/md/raid5.c  |  570
-rw-r--r--  drivers/md/raid5.h  |  77
-rw-r--r--  include/linux/blk-mq.h  |  2
-rw-r--r--  include/linux/percpu-refcount.h  |  1
-rw-r--r--  lib/percpu-refcount.c  |  17

39 files changed, 1533 insertions(+), 2255 deletions(-)
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index f228604ddbcd..785eab87aa71 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -207,10 +207,6 @@ Optional feature arguments are:
block, then the cache block is invalidated.
To enable passthrough mode the cache must be clean.
- metadata2 : use version 2 of the metadata. This stores the dirty bits
- in a separate btree, which improves speed of shutting
- down the cache.
-
A policy called 'default' is always registered. This is an alias for
the policy we currently think is giving best all round performance.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9796a8792c42..9f8f83f1cdc6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -981,8 +981,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
return hctx->next_cpu;
}
-static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
- unsigned long msecs)
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
if (unlikely(blk_mq_hctx_stopped(hctx) ||
!blk_mq_hw_queue_mapped(hctx)))
@@ -999,24 +998,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
put_cpu();
}
- if (msecs == 0)
- kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work);
- else
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->delayed_run_work,
- msecs_to_jiffies(msecs));
-}
-
-void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
-{
- __blk_mq_delay_run_hw_queue(hctx, true, msecs);
-}
-EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
-
-void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
-{
- __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
}
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
@@ -1120,15 +1102,6 @@ static void blk_mq_run_work_fn(struct work_struct *work)
__blk_mq_run_hw_queue(hctx);
}
-static void blk_mq_delayed_run_work_fn(struct work_struct *work)
-{
- struct blk_mq_hw_ctx *hctx;
-
- hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work);
-
- __blk_mq_run_hw_queue(hctx);
-}
-
static void blk_mq_delay_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
@@ -1812,7 +1785,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
node = hctx->numa_node = set->numa_node;
INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 03af094d0202..cf2cbc211d83 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/llist.h>
#include <linux/ratelimit.h>
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e8f1c8d0b96d..01f6833d6420 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -697,7 +697,7 @@ re_read:
out:
kunmap_atomic(sb);
- /* Assigning chunksize is required for "re_read" */
+ /* Assiging chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 6735c8d6a445..624fe4319b24 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -25,7 +25,7 @@
* defines a range of metadata versions that this module can handle.
*/
#define MIN_CACHE_VERSION 1
-#define MAX_CACHE_VERSION 2
+#define MAX_CACHE_VERSION 1
#define CACHE_METADATA_CACHE_SIZE 64
@@ -55,7 +55,6 @@ enum mapping_bits {
/*
* The data on the cache is different from that on the origin.
- * This flag is only used by metadata format 1.
*/
M_DIRTY = 2
};
@@ -94,18 +93,12 @@ struct cache_disk_superblock {
__le32 write_misses;
__le32 policy_version[CACHE_POLICY_VERSION_SIZE];
-
- /*
- * Metadata format 2 fields.
- */
- __le64 dirty_root;
} __packed;
struct dm_cache_metadata {
atomic_t ref_count;
struct list_head list;
- unsigned version;
struct block_device *bdev;
struct dm_block_manager *bm;
struct dm_space_map *metadata_sm;
@@ -149,18 +142,11 @@ struct dm_cache_metadata {
bool fail_io:1;
/*
- * Metadata format 2 fields.
- */
- dm_block_t dirty_root;
- struct dm_disk_bitset dirty_info;
-
- /*
* These structures are used when loading metadata. They're too
* big to put on the stack.
*/
struct dm_array_cursor mapping_cursor;
struct dm_array_cursor hint_cursor;
- struct dm_bitset_cursor dirty_cursor;
};
/*-------------------------------------------------------------------
@@ -184,7 +170,6 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
static int check_metadata_version(struct cache_disk_superblock *disk_super)
{
uint32_t metadata_version = le32_to_cpu(disk_super->version);
-
if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
@@ -325,11 +310,6 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd,
sizeof(cmd->metadata_space_map_root));
}
-static bool separate_dirty_bits(struct dm_cache_metadata *cmd)
-{
- return cmd->version >= 2;
-}
-
static int __write_initial_superblock(struct dm_cache_metadata *cmd)
{
int r;
@@ -361,7 +341,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->flags = 0;
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
- disk_super->version = cpu_to_le32(cmd->version);
+ disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
@@ -382,9 +362,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->write_hits = cpu_to_le32(0);
disk_super->write_misses = cpu_to_le32(0);
- if (separate_dirty_bits(cmd))
- disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
-
return dm_tm_commit(cmd->tm, sblock);
}
@@ -405,13 +382,6 @@ static int __format_metadata(struct dm_cache_metadata *cmd)
if (r < 0)
goto bad;
- if (separate_dirty_bits(cmd)) {
- dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
- r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root);
- if (r < 0)
- goto bad;
- }
-
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
if (r < 0)
@@ -437,10 +407,9 @@ bad:
static int __check_incompat_features(struct cache_disk_superblock *disk_super,
struct dm_cache_metadata *cmd)
{
- uint32_t incompat_flags, features;
+ uint32_t features;
- incompat_flags = le32_to_cpu(disk_super->incompat_flags);
- features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+ features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
if (features) {
DMERR("could not access metadata due to unsupported optional features (%lx).",
(unsigned long)features);
@@ -501,7 +470,6 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
}
__setup_mapping_info(cmd);
- dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
sb_flags = le32_to_cpu(disk_super->flags);
cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
@@ -580,7 +548,6 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
static void read_superblock_fields(struct dm_cache_metadata *cmd,
struct cache_disk_superblock *disk_super)
{
- cmd->version = le32_to_cpu(disk_super->version);
cmd->flags = le32_to_cpu(disk_super->flags);
cmd->root = le64_to_cpu(disk_super->mapping_root);
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
@@ -600,9 +567,6 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
- if (separate_dirty_bits(cmd))
- cmd->dirty_root = le64_to_cpu(disk_super->dirty_root);
-
cmd->changed = false;
}
@@ -661,13 +625,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
*/
BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
- if (separate_dirty_bits(cmd)) {
- r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root,
- &cmd->dirty_root);
- if (r)
- return r;
- }
-
r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
&cmd->discard_root);
if (r)
@@ -692,8 +649,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
update_flags(disk_super, mutator);
disk_super->mapping_root = cpu_to_le64(cmd->root);
- if (separate_dirty_bits(cmd))
- disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
@@ -743,8 +698,7 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size,
- unsigned metadata_version)
+ size_t policy_hint_size)
{
int r;
struct dm_cache_metadata *cmd;
@@ -755,7 +709,6 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
return ERR_PTR(-ENOMEM);
}
- cmd->version = metadata_version;
atomic_set(&cmd->ref_count, 1);
init_rwsem(&cmd->root_lock);
cmd->bdev = bdev;
@@ -804,8 +757,7 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size,
- unsigned metadata_version)
+ size_t policy_hint_size)
{
struct dm_cache_metadata *cmd, *cmd2;
@@ -816,8 +768,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
if (cmd)
return cmd;
- cmd = metadata_open(bdev, data_block_size, may_format_device,
- policy_hint_size, metadata_version);
+ cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
if (!IS_ERR(cmd)) {
mutex_lock(&table_lock);
cmd2 = lookup(bdev);
@@ -849,11 +800,10 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size,
- unsigned metadata_version)
+ size_t policy_hint_size)
{
- struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device,
- policy_hint_size, metadata_version);
+ struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
+ may_format_device, policy_hint_size);
if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
dm_cache_metadata_close(cmd);
@@ -879,8 +829,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
/*
* Checks that the given cache block is either unmapped or clean.
*/
-static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b,
- bool *result)
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
{
int r;
__le64 value;
@@ -888,8 +838,10 @@ static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t
unsigned flags;
r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
- if (r)
+ if (r) {
+ DMERR("block_unmapped_or_clean failed");
return r;
+ }
unpack_value(value, &ob, &flags);
*result = !((flags & M_VALID) && (flags & M_DIRTY));
@@ -897,19 +849,17 @@ static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t
return 0;
}
-static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd,
- dm_cblock_t begin, dm_cblock_t end,
- bool *result)
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
{
int r;
*result = true;
while (begin != end) {
- r = block_clean_combined_dirty(cmd, begin, result);
- if (r) {
- DMERR("block_clean_combined_dirty failed");
+ r = block_unmapped_or_clean(cmd, begin, result);
+ if (r)
return r;
- }
if (!*result) {
DMERR("cache block %llu is dirty",
@@ -923,69 +873,6 @@ static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd,
return 0;
}
-static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
- dm_cblock_t begin, dm_cblock_t end,
- bool *result)
-{
- int r;
- bool dirty_flag;
- *result = true;
-
- r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
- from_cblock(cmd->cache_blocks), &cmd->dirty_cursor);
- if (r) {
- DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__);
- return r;
- }
-
- r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin));
- if (r) {
- DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__);
- dm_bitset_cursor_end(&cmd->dirty_cursor);
- return r;
- }
-
- while (begin != end) {
- /*
- * We assume that unmapped blocks have their dirty bit
- * cleared.
- */
- dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor);
- if (dirty_flag) {
- DMERR("%s: cache block %llu is dirty", __func__,
- (unsigned long long) from_cblock(begin));
- dm_bitset_cursor_end(&cmd->dirty_cursor);
- *result = false;
- return 0;
- }
-
- begin = to_cblock(from_cblock(begin) + 1);
- if (begin == end)
- break;
-
- r = dm_bitset_cursor_next(&cmd->dirty_cursor);
- if (r) {
- DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__);
- dm_bitset_cursor_end(&cmd->dirty_cursor);
- return r;
- }
- }
-
- dm_bitset_cursor_end(&cmd->dirty_cursor);
-
- return 0;
-}
-
-static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
- dm_cblock_t begin, dm_cblock_t end,
- bool *result)
-{
- if (separate_dirty_bits(cmd))
- return blocks_are_clean_separate_dirty(cmd, begin, end, result);
- else
- return blocks_are_clean_combined_dirty(cmd, begin, end, result);
-}
-
static bool cmd_write_lock(struct dm_cache_metadata *cmd)
{
down_write(&cmd->root_lock);
@@ -1063,18 +950,8 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
from_cblock(new_cache_size),
&null_mapping, &cmd->root);
- if (r)
- goto out;
-
- if (separate_dirty_bits(cmd)) {
- r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root,
- from_cblock(cmd->cache_blocks), from_cblock(new_cache_size),
- false, &cmd->dirty_root);
- if (r)
- goto out;
- }
-
- cmd->cache_blocks = new_cache_size;
+ if (!r)
+ cmd->cache_blocks = new_cache_size;
cmd->changed = true;
out:
@@ -1118,6 +995,14 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
from_dblock(b), &cmd->discard_root);
}
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
+ bool *is_discarded)
+{
+ return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root,
+ is_discarded);
+}
+
static int __discard(struct dm_cache_metadata *cmd,
dm_dblock_t dblock, bool discard)
{
@@ -1147,38 +1032,22 @@ static int __load_discards(struct dm_cache_metadata *cmd,
load_discard_fn fn, void *context)
{
int r = 0;
- uint32_t b;
- struct dm_bitset_cursor c;
-
- if (from_dblock(cmd->discard_nr_blocks) == 0)
- /* nothing to do */
- return 0;
+ dm_block_t b;
+ bool discard;
- if (cmd->clean_when_opened) {
- r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root);
- if (r)
- return r;
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ dm_dblock_t dblock = to_dblock(b);
- r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root,
- from_dblock(cmd->discard_nr_blocks), &c);
- if (r)
- return r;
-
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
- r = fn(context, cmd->discard_block_size, to_dblock(b),
- dm_bitset_cursor_get_value(&c));
- if (r)
- break;
- }
-
- dm_bitset_cursor_end(&c);
-
- } else {
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
- r = fn(context, cmd->discard_block_size, to_dblock(b), false);
+ if (cmd->clean_when_opened) {
+ r = __is_discarded(cmd, dblock, &discard);
if (r)
return r;
- }
+ } else
+ discard = false;
+
+ r = fn(context, cmd->discard_block_size, dblock, discard);
+ if (r)
+ break;
}
return r;
@@ -1308,11 +1177,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
hints_array_initialized(cmd);
}
-static int __load_mapping_v1(struct dm_cache_metadata *cmd,
- uint64_t cb, bool hints_valid,
- struct dm_array_cursor *mapping_cursor,
- struct dm_array_cursor *hint_cursor,
- load_mapping_fn fn, void *context)
+static int __load_mapping(struct dm_cache_metadata *cmd,
+ uint64_t cb, bool hints_valid,
+ struct dm_array_cursor *mapping_cursor,
+ struct dm_array_cursor *hint_cursor,
+ load_mapping_fn fn, void *context)
{
int r = 0;
@@ -1337,51 +1206,8 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd,
r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
le32_to_cpu(hint), hints_valid);
- if (r) {
- DMERR("policy couldn't load cache block %llu",
- (unsigned long long) from_cblock(to_cblock(cb)));
- }
- }
-
- return r;
-}
-
-static int __load_mapping_v2(struct dm_cache_metadata *cmd,
- uint64_t cb, bool hints_valid,
- struct dm_array_cursor *mapping_cursor,
- struct dm_array_cursor *hint_cursor,
- struct dm_bitset_cursor *dirty_cursor,
- load_mapping_fn fn, void *context)
-{
- int r = 0;
-
- __le64 mapping;
- __le32 hint = 0;
-
- __le64 *mapping_value_le;
- __le32 *hint_value_le;
-
- dm_oblock_t oblock;
- unsigned flags;
- bool dirty;
-
- dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
- memcpy(&mapping, mapping_value_le, sizeof(mapping));
- unpack_value(mapping, &oblock, &flags);
-
- if (flags & M_VALID) {
- if (hints_valid) {
- dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
- memcpy(&hint, hint_value_le, sizeof(hint));
- }
-
- dirty = dm_bitset_cursor_get_value(dirty_cursor);
- r = fn(context, oblock, to_cblock(cb), dirty,
- le32_to_cpu(hint), hints_valid);
- if (r) {
- DMERR("policy couldn't load cache block %llu",
- (unsigned long long) from_cblock(to_cblock(cb)));
- }
+ if (r)
+ DMERR("policy couldn't load cblock");
}
return r;
@@ -1412,28 +1238,10 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
}
}
- if (separate_dirty_bits(cmd)) {
- r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
- from_cblock(cmd->cache_blocks),
- &cmd->dirty_cursor);
- if (r) {
- dm_array_cursor_end(&cmd->hint_cursor);
- dm_array_cursor_end(&cmd->mapping_cursor);
- return r;
- }
- }
-
for (cb = 0; ; cb++) {
- if (separate_dirty_bits(cmd))
- r = __load_mapping_v2(cmd, cb, hints_valid,
- &cmd->mapping_cursor,
- &cmd->hint_cursor,
- &cmd->dirty_cursor,
- fn, context);
- else
- r = __load_mapping_v1(cmd, cb, hints_valid,
- &cmd->mapping_cursor, &cmd->hint_cursor,
- fn, context);
+ r = __load_mapping(cmd, cb, hints_valid,
+ &cmd->mapping_cursor, &cmd->hint_cursor,
+ fn, context);
if (r)
goto out;
@@ -1456,23 +1264,12 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
goto out;
}
}
-
- if (separate_dirty_bits(cmd)) {
- r = dm_bitset_cursor_next(&cmd->dirty_cursor);
- if (r) {
- DMERR("dm_bitset_cursor_next for dirty failed");
- goto out;
- }
- }
}
out:
dm_array_cursor_end(&cmd->mapping_cursor);
if (hints_valid)
dm_array_cursor_end(&cmd->hint_cursor);
- if (separate_dirty_bits(cmd))
- dm_bitset_cursor_end(&cmd->dirty_cursor);
-
return r;
}
@@ -1555,55 +1352,13 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
}
-static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
-{
- int r;
- unsigned i;
- for (i = 0; i < nr_bits; i++) {
- r = __dirty(cmd, to_cblock(i), test_bit(i, bits));
- if (r)
- return r;
- }
-
- return 0;
-}
-
-static int is_dirty_callback(uint32_t index, bool *value, void *context)
-{
- unsigned long *bits = context;
- *value = test_bit(index, bits);
- return 0;
-}
-
-static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
-{
- int r = 0;
-
- /* nr_bits is really just a sanity check */
- if (nr_bits != from_cblock(cmd->cache_blocks)) {
- DMERR("dirty bitset is wrong size");
- return -EINVAL;
- }
-
- r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root);
- if (r)
- return r;
-
- cmd->changed = true;
- return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits);
-}
-
-int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
- unsigned nr_bits,
- unsigned long *bits)
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, bool dirty)
{
int r;
WRITE_LOCK(cmd);
- if (separate_dirty_bits(cmd))
- r = __set_dirty_bits_v2(cmd, nr_bits, bits);
- else
- r = __set_dirty_bits_v1(cmd, nr_bits, bits);
+ r = __dirty(cmd, cblock, dirty);
WRITE_UNLOCK(cmd);
return r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4f07c08cf107..8528744195e5 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -45,20 +45,18 @@
* As these various flags are defined they should be added to the
* following masks.
*/
-
#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
/*
- * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
- * failure. If reopening then features must match.
+ * Reopens or creates a new, empty metadata volume.
+ * Returns an ERR_PTR on failure.
*/
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size,
- unsigned metadata_version);
+ size_t policy_hint_size);
void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
@@ -93,8 +91,7 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
load_mapping_fn fn,
void *context);
-int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
- unsigned nr_bits, unsigned long *bits);
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
struct dm_cache_statistics {
uint32_t read_hits;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e7..74f1cd1fe4b8 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -179,7 +179,6 @@ enum cache_io_mode {
struct cache_features {
enum cache_metadata_mode mode;
enum cache_io_mode io_mode;
- unsigned metadata_version;
};
struct cache_stats {
@@ -2535,14 +2534,13 @@ static void init_features(struct cache_features *cf)
{
cf->mode = CM_WRITE;
cf->io_mode = CM_IO_WRITEBACK;
- cf->metadata_version = 1;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
static struct dm_arg _args[] = {
- {0, 2, "Invalid number of cache feature arguments"},
+ {0, 1, "Invalid number of cache feature arguments"},
};
int r;
@@ -2568,9 +2566,6 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
else if (!strcasecmp(arg, "passthrough"))
cf->io_mode = CM_IO_PASSTHROUGH;
- else if (!strcasecmp(arg, "metadata2"))
- cf->metadata_version = 2;
-
else {
*error = "Unrecognised cache feature requested";
return -EINVAL;
@@ -2825,8 +2820,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
ca->block_size, may_format,
- dm_cache_policy_get_hint_size(cache->policy),
- ca->features.metadata_version);
+ dm_cache_policy_get_hint_size(cache->policy));
if (IS_ERR(cmd)) {
*error = "Error creating metadata object";
r = PTR_ERR(cmd);
@@ -3171,16 +3165,21 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
static int write_dirty_bitset(struct cache *cache)
{
- int r;
+ unsigned i, r;
if (get_cache_mode(cache) >= CM_READ_ONLY)
return -EINVAL;
- r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
- if (r)
- metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
+ for (i = 0; i < from_cblock(cache->cache_size); i++) {
+ r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
+ is_dirty(cache, to_cblock(i)));
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_set_dirty", r);
+ return r;
+ }
+ }
- return r;
+ return 0;
}
static int write_discard_bitset(struct cache *cache)
@@ -3556,19 +3555,14 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long) atomic_read(&cache->nr_dirty));
- if (cache->features.metadata_version == 2)
- DMEMIT("2 metadata2 ");
- else
- DMEMIT("1 ");
-
if (writethrough_mode(&cache->features))
- DMEMIT("writethrough ");
+ DMEMIT("1 writethrough ");
else if (passthrough_mode(&cache->features))
- DMEMIT("passthrough ");
+ DMEMIT("1 passthrough ");
else if (writeback_mode(&cache->features))
- DMEMIT("writeback ");
+ DMEMIT("1 writeback ");
else {
DMERR("%s: internal error: unknown io mode: %d",
@@ -3816,7 +3810,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 10, 0},
+ .version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1e217ba84d09..350527f60834 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -101,8 +101,6 @@ struct raid_dev {
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
-#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
-
/*
* Definitions of various constructor flags to
* be used in checks of valid / invalid flags
@@ -3726,7 +3724,7 @@ static int raid_preresume(struct dm_target *ti)
return r;
/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
- if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
+ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
to_bytes(rs->requested_bitmap_chunk_sectors), 0);
@@ -3758,6 +3756,8 @@ static int raid_preresume(struct dm_target *ti)
return r;
}
+#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
+
static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
@@ -3791,7 +3791,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 10, 1},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index bdbb7e6e8212..6c25213ab38c 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,8 +17,8 @@
#include <linux/module.h>
#define DM_MSG_PREFIX "multipath round-robin"
-#define RR_MIN_IO 1
-#define RR_VERSION "1.2.0"
+#define RR_MIN_IO 1000
+#define RR_VERSION "1.1.0"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
@@ -47,19 +47,44 @@ struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
spinlock_t lock;
+ struct dm_path * __percpu *current_path;
+ struct percpu_counter repeat_count;
};
+static void set_percpu_current_path(struct selector *s, struct dm_path *path)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(s->current_path, cpu) = path;
+}
+
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s) {
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->invalid_paths);
- spin_lock_init(&s->lock);
- }
+ if (!s)
+ return NULL;
+
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ spin_lock_init(&s->lock);
+
+ s->current_path = alloc_percpu(struct dm_path *);
+ if (!s->current_path)
+ goto out_current_path;
+ set_percpu_current_path(s, NULL);
+
+ if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
+ goto out_repeat_count;
return s;
+
+out_repeat_count:
+ free_percpu(s->current_path);
+out_current_path:
+ kfree(s);
+ return NULL;;
}
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -80,6 +105,8 @@ static void rr_destroy(struct path_selector *ps)
free_paths(&s->valid_paths);
free_paths(&s->invalid_paths);
+ free_percpu(s->current_path);
+ percpu_counter_destroy(&s->repeat_count);
kfree(s);
ps->context = NULL;
}
@@ -130,11 +157,6 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
- if (repeat_count > 1) {
- DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
- repeat_count = 1;
- }
-
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
@@ -161,6 +183,9 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
struct path_info *pi = p->pscontext;
spin_lock_irqsave(&s->lock, flags);
+ if (p == *this_cpu_ptr(s->current_path))
+ set_percpu_current_path(s, NULL);
+
list_move(&pi->list, &s->invalid_paths);
spin_unlock_irqrestore(&s->lock, flags);
}
@@ -183,15 +208,29 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
unsigned long flags;
struct selector *s = ps->context;
struct path_info *pi = NULL;
+ struct dm_path *current_path = NULL;
+
+ local_irq_save(flags);
+ current_path = *this_cpu_ptr(s->current_path);
+ if (current_path) {
+ percpu_counter_dec(&s->repeat_count);
+ if (percpu_counter_read_positive(&s->repeat_count) > 0) {
+ local_irq_restore(flags);
+ return current_path;
+ }
+ }
- spin_lock_irqsave(&s->lock, flags);
+ spin_lock(&s->lock);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
+ percpu_counter_set(&s->repeat_count, pi->repeat_count);
+ set_percpu_current_path(s, pi->path);
+ current_path = pi->path;
}
spin_unlock_irqrestore(&s->lock, flags);
- return pi ? pi->path : NULL;
+ return current_path;
}
static struct path_selector_type rr_ps = {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 50e6f5c4f751..05bc224b94f8 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -892,7 +892,6 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Undo dm_start_request() before requeuing */
rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
- blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
return BLK_MQ_RQ_QUEUE_BUSY;
}
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 78f36012eaca..0f0eb8a3d922 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -146,6 +146,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
block = fec_buffer_rs_block(v, fio, n, i);
res = fec_decode_rs8(v, fio, block, &par[offset], neras);
if (res < 0) {
+ dm_bufio_release(buf);
+
r = res;
goto error;
}
@@ -170,8 +172,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
done:
r = corrected;
error:
- dm_bufio_release(buf);
-
if (r < 0 && neras)
DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
v->data_dev->name, (unsigned long long)rsb, r);
@@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
&is_zero) == 0) {
/* skip known zero blocks entirely */
if (is_zero)
- goto done;
+ continue;
/*
* skip if we have already found the theoretical
@@ -439,13 +439,6 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (!verity_fec_is_enabled(v))
return -EOPNOTSUPP;
- if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) {
- DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name);
- return -EIO;
- }
-
- fio->level++;
-
if (type == DM_VERITY_BLOCK_TYPE_METADATA)
block += v->data_blocks;
@@ -477,7 +470,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
if (r < 0) {
r = fec_decode_rsb(v, io, fio, rsb, offset, true);
if (r < 0)
- goto done;
+ return r;
}
if (dest)
@@ -487,8 +480,6 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
r = verity_for_bv_block(v, io, iter, fec_bv_copy);
}
-done:
- fio->level--;
return r;
}
@@ -529,7 +520,6 @@ void verity_fec_init_io(struct dm_verity_io *io)
memset(fio->bufs, 0, sizeof(fio->bufs));
fio->nbufs = 0;
fio->output = NULL;
- fio->level = 0;
}
/*
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce87a933..7fa0298b995e 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -27,9 +27,6 @@
#define DM_VERITY_FEC_BUF_MAX \
(1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
-/* maximum recursion level for verity_fec_decode */
-#define DM_VERITY_FEC_MAX_RECURSION 4
-
#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device"
#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks"
#define DM_VERITY_OPT_FEC_START "fec_start"
@@ -61,7 +58,6 @@ struct dm_verity_fec_io {
unsigned nbufs; /* number of buffers allocated */
u8 *output; /* buffer for corrected output */
size_t output_pos;
- unsigned level; /* recursion level */
};
#ifdef CONFIG_DM_VERITY_FEC
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index b0536cfd8e17..685aa2d77e25 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
}
}
if (failit) {
- struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
b->bi_bdev = conf->rdev->bdev;
b->bi_private = bio;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index f16316fbf658..a6a50be86c27 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -224,8 +224,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
* oldconf until no one uses it anymore.
*/
mddev_suspend(mddev);
- oldconf = rcu_dereference_protected(mddev->private,
- lockdep_is_held(&mddev->reconfig_mutex));
+ oldconf = rcu_dereference(mddev->private);
mddev->raid_disks++;
WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
"copied raid_disks doesn't match mddev->raid_disks");
@@ -249,48 +248,53 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
+ struct bio *split;
sector_t start_sector, end_sector, data_offset;
- sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to split it */
- struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, mddev->bio_set);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- }
+ do {
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to
+ * split it.
+ */
+ split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
- bio->bi_bdev = tmp_dev->rdev->bdev;
- bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(bio);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, bio);
- generic_make_request(bio);
- }
+ split->bi_iter.bi_sector = split->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(split);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+ split, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, split);
+ generic_make_request(split);
+ }
+ } while (split != bio);
return;
out_of_bounds:
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d426c7f0e46b..69065d3451c8 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1127,8 +1127,8 @@ int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
- bitmap_free(bitmap);
- return -1;
+ lockres_free(bm_lockres);
+ return -ENOMEM;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cf29a3093014..7eb4ab190274 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -64,8 +64,6 @@
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
-#include <linux/percpu-refcount.h>
-
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
@@ -173,16 +171,6 @@ static const struct block_device_operations md_fops;
static int start_readonly;
-/*
- * The original mechanism for creating an md device is to create
- * a device node in /dev and to open it. This causes races with device-close.
- * The preferred method is to write to the "new_array" module parameter.
- * This can avoid races.
- * Setting create_on_open to false disables the original mechanism
- * so all the races disappear.
- */
-static bool create_on_open = true;
-
/* bio_clone_mddev
* like bio_clone, but with a local bio set
*/
@@ -202,6 +190,16 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ struct mddev *mddev)
+{
+ if (!mddev || !mddev->bio_set)
+ return bio_clone(bio, gfp_mask);
+
+ return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -451,6 +449,14 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
}
EXPORT_SYMBOL(md_flush_request);
+void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct mddev *mddev = cb->data;
+ md_wakeup_thread(mddev->thread);
+ kfree(cb);
+}
+EXPORT_SYMBOL(md_unplug);
+
static inline struct mddev *mddev_get(struct mddev *mddev)
{
atomic_inc(&mddev->active);
@@ -1909,7 +1915,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
}
sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
- sb->super_offset = cpu_to_le64(rdev->sb_start);
+ sb->super_offset = rdev->sb_start;
sb->sb_csum = calc_sb_1_csum(sb);
do {
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -2267,33 +2273,6 @@ static void export_array(struct mddev *mddev)
mddev->major_version = 0;
}
-static bool set_in_sync(struct mddev *mddev)
-{
- WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
- if (!mddev->in_sync) {
- mddev->sync_checkers++;
- spin_unlock(&mddev->lock);
- percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
- spin_lock(&mddev->lock);
- if (!mddev->in_sync &&
- percpu_ref_is_zero(&mddev->writes_pending)) {
- mddev->in_sync = 1;
- /*
- * Ensure ->in_sync is visible before we clear
- * ->sync_checkers.
- */
- smp_mb();
- set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- }
- if (--mddev->sync_checkers == 0)
- percpu_ref_switch_to_percpu(&mddev->writes_pending);
- }
- if (mddev->safemode == 1)
- mddev->safemode = 0;
- return mddev->in_sync;
-}
-
static void sync_sbs(struct mddev *mddev, int nospares)
{
/* Update each superblock (in-memory image), but
@@ -2348,7 +2327,7 @@ static bool does_sb_need_changing(struct mddev *mddev)
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
- (mddev->layout != le32_to_cpu(sb->layout)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
@@ -4038,7 +4017,6 @@ array_state_show(struct mddev *mddev, char *page)
st = read_auto;
break;
case 0:
- spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
else if (mddev->in_sync)
@@ -4047,7 +4025,6 @@ array_state_show(struct mddev *mddev, char *page)
st = active_idle;
else
st = active;
- spin_unlock(&mddev->lock);
}
else {
if (list_empty(&mddev->disks) &&
@@ -4068,7 +4045,7 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err = 0;
+ int err;
enum array_state st = match_word(buf, array_states);
if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
@@ -4081,9 +4058,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
+ err = 0;
} else /* st == clean */ {
restart_array(mddev);
- if (!set_in_sync(mddev))
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ }
+ err = 0;
+ } else
err = -EBUSY;
}
if (!err)
@@ -4141,7 +4127,15 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
break;
spin_lock(&mddev->lock);
- if (!set_in_sync(mddev))
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ }
+ err = 0;
+ } else
err = -EBUSY;
spin_unlock(&mddev->lock);
} else
@@ -4963,10 +4957,8 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
return err;
/* cluster raid doesn't support change array_sectors */
- if (mddev_is_clustered(mddev)) {
- mddev_unlock(mddev);
+ if (mddev_is_clustered(mddev))
return -EINVAL;
- }
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -5145,7 +5137,6 @@ static void md_free(struct kobject *ko)
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
- percpu_ref_exit(&mddev->writes_pending);
kfree(mddev);
}
@@ -5171,19 +5162,8 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
-static void no_op(struct percpu_ref *r) {}
-
static int md_alloc(dev_t dev, char *name)
{
- /*
- * If dev is zero, name is the name of a device to allocate with
- * an arbitrary minor number. It will be "md_???"
- * If dev is non-zero it must be a device number with a MAJOR of
- * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
- * the device is being created by opening a node in /dev.
- * If "name" is not NULL, the device is being created by
- * writing to /sys/module/md_mod/parameters/new_array.
- */
static DEFINE_MUTEX(disks_mutex);
struct mddev *mddev = mddev_find(dev);
struct gendisk *disk;
@@ -5209,7 +5189,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->gendisk)
goto abort;
- if (name && !dev) {
+ if (name) {
/* Need to ensure that 'name' is not a duplicate.
*/
struct mddev *mddev2;
@@ -5223,11 +5203,6 @@ static int md_alloc(dev_t dev, char *name)
}
spin_unlock(&all_mddevs_lock);
}
- if (name && dev)
- /*
- * Creating /dev/mdNNN via "newarray", so adjust hold_active.
- */
- mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
mddev->queue = blk_alloc_queue(GFP_KERNEL);
@@ -5238,10 +5213,6 @@ static int md_alloc(dev_t dev, char *name)
blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
- if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
- goto abort;
- /* We want to start with the refcount at zero */
- percpu_ref_put(&mddev->writes_pending);
disk = alloc_disk(1 << shift);
if (!disk) {
blk_cleanup_queue(mddev->queue);
@@ -5298,48 +5269,38 @@ static int md_alloc(dev_t dev, char *name)
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
- if (create_on_open)
- md_alloc(dev, NULL);
+ md_alloc(dev, NULL);
return NULL;
}
static int add_named_array(const char *val, struct kernel_param *kp)
{
- /*
- * val must be "md_*" or "mdNNN".
- * For "md_*" we allocate an array with a large free minor number, and
+ /* val must be "md_*" where * is not all digits.
+ * We allocate an array with a large free minor number, and
* set the name to val. val must not already be an active name.
- * For "mdNNN" we allocate an array with the minor number NNN
- * which must not already be in use.
*/
int len = strlen(val);
char buf[DISK_NAME_LEN];
- unsigned long devnum;
while (len && val[len-1] == '\n')
len--;
if (len >= DISK_NAME_LEN)
return -E2BIG;
strlcpy(buf, val, len+1);
- if (strncmp(buf, "md_", 3) == 0)
- return md_alloc(0, buf);
- if (strncmp(buf, "md", 2) == 0 &&
- isdigit(buf[2]) &&
- kstrtoul(buf+2, 10, &devnum) == 0 &&
- devnum <= MINORMASK)
- return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
-
- return -EINVAL;
+ if (strncmp(buf, "md_", 3) != 0)
+ return -EINVAL;
+ return md_alloc(0, buf);
}
static void md_safemode_timeout(unsigned long data)
{
struct mddev *mddev = (struct mddev *) data;
- mddev->safemode = 1;
- if (mddev->external)
- sysfs_notify_dirent_safe(mddev->sysfs_state);
-
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev->safemode = 1;
+ if (mddev->external)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
md_wakeup_thread(mddev->thread);
}
@@ -5551,6 +5512,7 @@ int md_run(struct mddev *mddev)
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
+ atomic_set(&mddev->writes_pending,0);
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
@@ -6647,10 +6609,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->layout = info->layout;
mddev->chunk_sectors = info->chunk_size >> 9;
+ mddev->max_disks = MD_SB_DISKS;
+
if (mddev->persistent) {
- mddev->max_disks = MD_SB_DISKS;
- mddev->flags = 0;
- mddev->sb_flags = 0;
+ mddev->flags = 0;
+ mddev->sb_flags = 0;
}
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
@@ -6972,7 +6935,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
int ro;
- bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
@@ -7062,9 +7024,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = -EBUSY;
goto out;
}
- WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
set_bit(MD_CLOSING, &mddev->flags);
- did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
@@ -7257,8 +7217,6 @@ unlock:
mddev->hold_active = 0;
mddev_unlock(mddev);
out:
- if(did_set_md_closing)
- clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
#ifdef CONFIG_COMPAT
@@ -7409,8 +7367,8 @@ void md_wakeup_thread(struct md_thread *thread)
{
if (thread) {
pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
- if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
- wake_up(&thread->wqueue);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
}
}
EXPORT_SYMBOL(md_wakeup_thread);
@@ -7957,13 +7915,10 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->sync_thread);
did_change = 1;
}
- rcu_read_lock();
- percpu_ref_get(&mddev->writes_pending);
- smp_mb(); /* Match smp_mb in set_in_sync() */
+ atomic_inc(&mddev->writes_pending);
if (mddev->safemode == 1)
mddev->safemode = 0;
- /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
- if (mddev->in_sync || !mddev->sync_checkers) {
+ if (mddev->in_sync) {
spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
@@ -7974,7 +7929,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
}
spin_unlock(&mddev->lock);
}
- rcu_read_unlock();
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
wait_event(mddev->sb_wait,
@@ -7982,38 +7936,15 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
}
EXPORT_SYMBOL(md_write_start);
-/* md_write_inc can only be called when md_write_start() has
- * already been called at least once of the current request.
- * It increments the counter and is useful when a single request
- * is split into several parts. Each part causes an increment and
- * so needs a matching md_write_end().
- * Unlike md_write_start(), it is safe to call md_write_inc() inside
- * a spinlocked region.
- */
-void md_write_inc(struct mddev *mddev, struct bio *bi)
-{
- if (bio_data_dir(bi) != WRITE)
- return;
- WARN_ON_ONCE(mddev->in_sync || mddev->ro);
- percpu_ref_get(&mddev->writes_pending);
-}
-EXPORT_SYMBOL(md_write_inc);
-
void md_write_end(struct mddev *mddev)
{
- percpu_ref_put(&mddev->writes_pending);
-
- if (mddev->safemode == 2)
- md_wakeup_thread(mddev->thread);
- else if (mddev->safemode_delay)
- /* The roundup() ensures this only performs locking once
- * every ->safemode_delay jiffies
- */
- mod_timer(&mddev->safemode_timer,
- roundup(jiffies, mddev->safemode_delay) +
- mddev->safemode_delay);
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else if (mddev->safemode_delay)
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
}
-
EXPORT_SYMBOL(md_write_end);
/* md_allow_write(mddev)
@@ -8614,7 +8545,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->external == 0 && mddev->safemode == 1) ||
- (mddev->safemode == 2
+ (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
))
return;
@@ -8663,10 +8594,22 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (!mddev->external) {
+ int did_change = 0;
spin_lock(&mddev->lock);
- set_in_sync(mddev);
+ if (mddev->safemode &&
+ !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync &&
+ mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ did_change = 1;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
spin_unlock(&mddev->lock);
+ if (did_change)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
}
if (mddev->sb_flags)
@@ -9220,7 +9163,6 @@ static int set_ro(const char *val, struct kernel_param *kp)
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
-module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 168136c15e40..2e84c8dd3e2f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -414,8 +414,7 @@ struct mddev {
*/
unsigned int safemode_delay;
struct timer_list safemode_timer;
- struct percpu_ref writes_pending;
- int sync_checkers; /* # of threads checking writes_pending */
+ atomic_t writes_pending;
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
@@ -654,7 +653,6 @@ extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
-extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
@@ -687,13 +685,21 @@ extern void md_rdev_clear(struct md_rdev *rdev);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);
+extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ struct mddev *mddev);
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
+extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
+static inline int mddev_check_plugged(struct mddev *mddev)
+{
+ return !!blk_check_plugged(md_unplug, mddev,
+ sizeof(struct blk_plug_cb));
+}
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
@@ -723,59 +729,4 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
{
mddev->flags &= ~unsupported_flags;
}
-
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-
-/* for managing resync I/O pages */
-struct resync_pages {
- unsigned idx; /* for get/put page from the pool */
- void *raid_bio;
- struct page *pages[RESYNC_PAGES];
-};
-
-static inline int resync_alloc_pages(struct resync_pages *rp,
- gfp_t gfp_flags)
-{
- int i;
-
- for (i = 0; i < RESYNC_PAGES; i++) {
- rp->pages[i] = alloc_page(gfp_flags);
- if (!rp->pages[i])
- goto out_free;
- }
-
- return 0;
-
-out_free:
- while (--i >= 0)
- put_page(rp->pages[i]);
- return -ENOMEM;
-}
-
-static inline void resync_free_pages(struct resync_pages *rp)
-{
- int i;
-
- for (i = 0; i < RESYNC_PAGES; i++)
- put_page(rp->pages[i]);
-}
-
-static inline void resync_get_all_pages(struct resync_pages *rp)
-{
- int i;
-
- for (i = 0; i < RESYNC_PAGES; i++)
- get_page(rp->pages[i]);
-}
-
-static inline struct page *resync_fetch_page(struct resync_pages *rp,
- unsigned idx)
-{
- if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
- return NULL;
- return rp->pages[idx];
-}
-
#endif /* _MD_MD_H */
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 185dc60360b5..7938cd21fa4c 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -976,27 +976,6 @@ int dm_array_cursor_next(struct dm_array_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_array_cursor_next);
-int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
-{
- int r;
-
- do {
- uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index;
-
- if (count < remaining) {
- c->index += count;
- return 0;
- }
-
- count -= remaining;
- r = dm_array_cursor_next(c);
-
- } while (!r);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(dm_array_cursor_skip);
-
void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
{
*value_le = element_at(c->info, c->ab, c->index);
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index d7d2d579c662..27ee49a55473 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -207,7 +207,6 @@ void dm_array_cursor_end(struct dm_array_cursor *c);
uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
int dm_array_cursor_next(struct dm_array_cursor *c);
-int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count);
/*
* value_le is only valid while the cursor points at the current value.
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
index b7208d82e748..36f7cc2c7109 100644
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -39,48 +39,6 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
}
EXPORT_SYMBOL_GPL(dm_bitset_empty);
-struct packer_context {
- bit_value_fn fn;
- unsigned nr_bits;
- void *context;
-};
-
-static int pack_bits(uint32_t index, void *value, void *context)
-{
- int r;
- struct packer_context *p = context;
- unsigned bit, nr = min(64u, p->nr_bits - (index * 64));
- uint64_t word = 0;
- bool bv;
-
- for (bit = 0; bit < nr; bit++) {
- r = p->fn(index * 64 + bit, &bv, p->context);
- if (r)
- return r;
-
- if (bv)
- set_bit(bit, (unsigned long *) &word);
- else
- clear_bit(bit, (unsigned long *) &word);
- }
-
- *((__le64 *) value) = cpu_to_le64(word);
-
- return 0;
-}
-
-int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
- uint32_t size, bit_value_fn fn, void *context)
-{
- struct packer_context p;
- p.fn = fn;
- p.nr_bits = size;
- p.context = context;
-
- return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p);
-}
-EXPORT_SYMBOL_GPL(dm_bitset_new);
-
int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
uint32_t old_nr_entries, uint32_t new_nr_entries,
bool default_value, dm_block_t *new_root)
@@ -210,108 +168,4 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
-static int cursor_next_array_entry(struct dm_bitset_cursor *c)
-{
- int r;
- __le64 *value;
-
- r = dm_array_cursor_next(&c->cursor);
- if (r)
- return r;
-
- dm_array_cursor_get_value(&c->cursor, (void **) &value);
- c->array_index++;
- c->bit_index = 0;
- c->current_bits = le64_to_cpu(*value);
- return 0;
-}
-
-int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
- dm_block_t root, uint32_t nr_entries,
- struct dm_bitset_cursor *c)
-{
- int r;
- __le64 *value;
-
- if (!nr_entries)
- return -ENODATA;
-
- c->info = info;
- c->entries_remaining = nr_entries;
-
- r = dm_array_cursor_begin(&info->array_info, root, &c->cursor);
- if (r)
- return r;
-
- dm_array_cursor_get_value(&c->cursor, (void **) &value);
- c->array_index = 0;
- c->bit_index = 0;
- c->current_bits = le64_to_cpu(*value);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin);
-
-void dm_bitset_cursor_end(struct dm_bitset_cursor *c)
-{
- return dm_array_cursor_end(&c->cursor);
-}
-EXPORT_SYMBOL_GPL(dm_bitset_cursor_end);
-
-int dm_bitset_cursor_next(struct dm_bitset_cursor *c)
-{
- int r = 0;
-
- if (!c->entries_remaining)
- return -ENODATA;
-
- c->entries_remaining--;
- if (++c->bit_index > 63)
- r = cursor_next_array_entry(c);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(dm_bitset_cursor_next);
-
-int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count)
-{
- int r;
- __le64 *value;
- uint32_t nr_array_skip;
- uint32_t remaining_in_word = 64 - c->bit_index;
-
- if (c->entries_remaining < count)
- return -ENODATA;
-
- if (count < remaining_in_word) {
- c->bit_index += count;
- c->entries_remaining -= count;
- return 0;
-
- } else {
- c->entries_remaining -= remaining_in_word;
- count -= remaining_in_word;
- }
-
- nr_array_skip = (count / 64) + 1;
- r = dm_array_cursor_skip(&c->cursor, nr_array_skip);
- if (r)
- return r;
-
- dm_array_cursor_get_value(&c->cursor, (void **) &value);
- c->entries_remaining -= count;
- c->array_index += nr_array_skip;
- c->bit_index = count & 63;
- c->current_bits = le64_to_cpu(*value);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip);
-
-bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c)
-{
- return test_bit(c->bit_index, (unsigned long *) &c->current_bits);
-}
-EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value);
-
/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
index df888da04ee1..c2287d672ef5 100644
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -93,22 +93,6 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm,
int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
/*
- * Creates a new bitset populated with values provided by a callback
- * function. This is more efficient than creating an empty bitset,
- * resizing, and then setting values since that process incurs a lot of
- * copying.
- *
- * info - describes the array
- * root - the root block of the array on disk
- * size - the number of entries in the array
- * fn - the callback
- * context - passed to the callback
- */
-typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context);
-int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
- uint32_t size, bit_value_fn fn, void *context);
-
-/*
* Resize the bitset.
*
* info - describes the bitset
@@ -177,29 +161,6 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
dm_block_t *new_root);
-struct dm_bitset_cursor {
- struct dm_disk_bitset *info;
- struct dm_array_cursor cursor;
-
- uint32_t entries_remaining;
- uint32_t array_index;
- uint32_t bit_index;
- uint64_t current_bits;
-};
-
-/*
- * Make sure you've flush any dm_disk_bitset and updated the root before
- * using this.
- */
-int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
- dm_block_t root, uint32_t nr_entries,
- struct dm_bitset_cursor *c);
-void dm_bitset_cursor_end(struct dm_bitset_cursor *c);
-
-int dm_bitset_cursor_next(struct dm_bitset_cursor *c);
-int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count);
-bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c);
-
/*----------------------------------------------------------------*/
#endif /* _LINUX_DM_BITSET_H */
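The interfaces removed above (dm_bitset_new() in this header and pack_bits() in dm-bitset.c) build the on-disk bitset by packing 64 callback-supplied booleans into each array word, avoiding the copying that an empty-create, resize, then set-bits sequence would cost. Below is a minimal user-space sketch of just the packing idea, with hypothetical names and no on-disk array; only the bit layout is modelled.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical callback type mirroring the removed bit_value_fn */
typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context);

/* pack nr_bits callback-supplied bits into 64-bit words, LSB first */
static int pack_bits(uint64_t *words, uint32_t nr_bits,
		     bit_value_fn fn, void *context)
{
	uint32_t i;
	bool bv;

	for (i = 0; i < nr_bits; i++) {
		int r = fn(i, &bv, context);
		if (r)
			return r;
		if (bv)
			words[i / 64] |= (uint64_t)1 << (i % 64);
		else
			words[i / 64] &= ~((uint64_t)1 << (i % 64));
	}
	return 0;
}

/* example callback: mark every third bit */
static int every_third(uint32_t index, bool *value, void *context)
{
	(void)context;
	*value = (index % 3) == 0;
	return 0;
}

int main(void)
{
	uint64_t words[2] = { 0, 0 };   /* room for 128 bits */

	pack_bits(words, 100, every_third, NULL);
	printf("bit 99 is %s\n",
	       (words[99 / 64] >> (99 % 64)) & 1 ? "set" : "clear");
	return 0;
}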
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 8212f14214f1..a6dde7cab458 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
@@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
- if (unlikely(IS_ERR(p)))
+ if (IS_ERR(p))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
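The hunks above only drop the unlikely() branch-prediction hint around IS_ERR(); the underlying pattern is that dm_bufio_read() and friends return either a valid buffer pointer or an error code encoded in the pointer value itself. The sketch below is a user-space re-creation of that ERR_PTR/IS_ERR/PTR_ERR convention (the kernel's own macros live in <linux/err.h>; read_block() and the constants here are purely illustrative).

#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* small negative error codes are stored in the topmost page of the
 * pointer range, so a pointer can carry either a buffer or an errno */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* a read that either returns a buffer or an encoded error */
static void *read_block(int fail)
{
	if (fail)
		return ERR_PTR(-5);	/* -EIO */
	return malloc(4096);
}

int main(void)
{
	void *p = read_block(1);

	if (IS_ERR(p)) {
		printf("read failed: %ld\n", PTR_ERR(p));
		return 1;
	}
	free(p);
	return 0;
}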
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 02e2ee0d8a00..20a40329d84a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -272,12 +272,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
- /*
- * dm_btree_del() is called via an ioctl, as such should be
- * considered an FS op. We can't recurse back into the FS, so we
- * allocate GFP_NOFS.
- */
- s = kmalloc(sizeof(*s), GFP_NOFS);
+ s = kmalloc(sizeof(*s), GFP_NOIO);
if (!s)
return -ENOMEM;
s->info = info;
@@ -1144,17 +1139,6 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
-int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
-{
- int r = 0;
-
- while (count-- && !r)
- r = dm_btree_cursor_next(c);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
-
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
if (c->depth) {
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 3dc5bb1a4748..db9bd26adf31 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -209,7 +209,6 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
bool prefetch_leaves, struct dm_btree_cursor *c);
void dm_btree_cursor_end(struct dm_btree_cursor *c);
int dm_btree_cursor_next(struct dm_btree_cursor *c);
-int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count);
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
#endif /* _LINUX_DM_BTREE_H */
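The dm_btree_cursor_skip() removed above was a thin wrapper that advances the cursor one entry at a time until the requested count is consumed or an error is returned (the dm-array variant removed earlier can additionally jump whole blocks when the skip stays within one). A minimal sketch of that loop over a hypothetical in-memory cursor:

#include <stdint.h>
#include <stdio.h>

/* hypothetical cursor over a plain array, standing in for a btree cursor */
struct cursor {
	const uint64_t *values;
	uint32_t nr;
	uint32_t index;
};

/* 0 on success, -1 when the cursor runs off the end (like -ENODATA) */
static int cursor_next(struct cursor *c)
{
	if (c->index + 1 >= c->nr)
		return -1;
	c->index++;
	return 0;
}

/* skip(count) as a loop of next(), mirroring the removed btree helper */
static int cursor_skip(struct cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = cursor_next(c);
	return r;
}

int main(void)
{
	uint64_t v[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
	struct cursor c = { v, 8, 0 };

	if (!cursor_skip(&c, 5))
		printf("cursor now at value %llu\n",
		       (unsigned long long)v[c.index]);
	return 0;
}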
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 829b4ce057d8..4c28608a0c94 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -626,19 +626,13 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
void *root_le, size_t len)
{
int r;
- struct disk_sm_root smr;
+ struct disk_sm_root *smr = root_le;
if (len < sizeof(struct disk_sm_root)) {
DMERR("sm_metadata root too small");
return -ENOMEM;
}
- /*
- * We don't know the alignment of the root_le buffer, so need to
- * copy into a new structure.
- */
- memcpy(&smr, root_le, sizeof(smr));
-
r = sm_ll_init(ll, tm);
if (r < 0)
return r;
@@ -650,10 +644,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
ll->max_entries = metadata_ll_max_entries;
ll->commit = metadata_ll_commit;
- ll->nr_blocks = le64_to_cpu(smr.nr_blocks);
- ll->nr_allocated = le64_to_cpu(smr.nr_allocated);
- ll->bitmap_root = le64_to_cpu(smr.bitmap_root);
- ll->ref_count_root = le64_to_cpu(smr.ref_count_root);
+ ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
+ ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
+ ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
+ ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
return ll->open_index(ll);
}
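The comment removed above explains why sm_ll_open_metadata() used to memcpy the root into a stack copy before reading it: the caller's buffer has unknown alignment, while the reverted code goes back to casting the buffer pointer directly. Below is a small user-space sketch of the copy-before-read idiom; endianness conversion (le64_to_cpu) is omitted, and struct disk_root is a hypothetical stand-in for struct disk_sm_root.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical on-disk root layout, analogous to struct disk_sm_root */
struct disk_root {
	uint64_t nr_blocks;
	uint64_t nr_allocated;
};

/* Safe form: copy the raw bytes into a properly aligned local struct
 * before reading the fields.  Casting the buffer pointer directly, as
 * the reverted code does, is only safe if the buffer is suitably aligned. */
static int parse_root(const void *root_le, size_t len, struct disk_root *out)
{
	if (len < sizeof(*out))
		return -1;	/* root too small */
	memcpy(out, root_le, sizeof(*out));
	return 0;
}

int main(void)
{
	unsigned char raw[1 + sizeof(struct disk_root)];
	struct disk_root on_disk = { 128, 7 }, parsed;

	/* build a deliberately misaligned copy of the root at raw + 1 */
	memcpy(raw + 1, &on_disk, sizeof(on_disk));
	if (parse_root(raw + 1, sizeof(on_disk), &parsed))
		return 1;
	printf("%llu blocks, %llu allocated\n",
	       (unsigned long long)parsed.nr_blocks,
	       (unsigned long long)parsed.nr_allocated);
	return 0;
}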
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 4aed69d9dd17..20557e2c60c6 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
-static const struct dm_space_map ops = {
+static struct dm_space_map ops = {
.destroy = sm_metadata_destroy,
.extend = sm_metadata_extend,
.get_nr_blocks = sm_metadata_get_nr_blocks,
@@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
return -EINVAL;
}
-static const struct dm_space_map bootstrap_ops = {
+static struct dm_space_map bootstrap_ops = {
.destroy = sm_bootstrap_destroy,
.extend = sm_bootstrap_extend,
.get_nr_blocks = sm_bootstrap_get_nr_blocks,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 47357d9b9c09..93347ca7c7a6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -461,53 +461,52 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
struct md_rdev *tmp_dev;
- sector_t bio_sector;
- sector_t sector;
- unsigned chunk_sects;
- unsigned sectors;
+ struct bio *split;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- bio_sector = bio->bi_iter.bi_sector;
- sector = bio_sector;
- chunk_sects = mddev->chunk_sectors;
+ do {
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+ sector_t sector = bio_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
- sectors = chunk_sects -
- (likely(is_power_of_2(chunk_sects))
- ? (sector & (chunk_sects-1))
- : sector_div(sector, chunk_sects));
+ unsigned sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
- /* Restore due to sector_div */
- sector = bio_sector;
+ /* Restore due to sector_div */
+ sector = bio_sector;
- if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- }
+ if (sectors < bio_sectors(bio)) {
+ split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
- zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
- bio->bi_bdev = tmp_dev->bdev;
- bio->bi_iter.bi_sector = sector + zone->dev_start +
- tmp_dev->data_offset;
-
- if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(bio);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, bio);
- generic_make_request(bio);
- }
+ zone = find_zone(mddev->private, &sector);
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ split->bi_bdev = tmp_dev->bdev;
+ split->bi_iter.bi_sector = sector + zone->dev_start +
+ tmp_dev->data_offset;
+
+ if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(split);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+ split, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, split);
+ generic_make_request(split);
+ }
+ } while (split != bio);
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
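The do/while loop restored above splits each bio at RAID0 chunk boundaries, using a mask when the chunk size is a power of two and sector_div() otherwise. The following user-space sketch isolates just that boundary arithmetic; the sector numbers and chunk sizes are illustrative, and a plain modulo stands in for sector_div().

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

static int is_power_of_2(unsigned n)
{
	return n && !(n & (n - 1));
}

/* Sectors left before the I/O crosses the next chunk boundary; this is
 * the split size computed at the top of the loop above. */
static unsigned sectors_to_chunk_end(sector_t sector, unsigned chunk_sects)
{
	if (is_power_of_2(chunk_sects))
		return chunk_sects - (unsigned)(sector & (chunk_sects - 1));
	return chunk_sects - (unsigned)(sector % chunk_sects);
}

int main(void)
{
	/* 128-sector chunks (64 KiB with 512-byte sectors) */
	printf("%u\n", sectors_to_chunk_end(1000, 128)); /* 1000 % 128 = 104 -> 24 */
	/* a non-power-of-2 chunk size takes the modulo path */
	printf("%u\n", sectors_to_chunk_end(1000, 96));  /* 1000 % 96 = 40 -> 56 */
	return 0;
}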
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 447c0c8fe93d..1c320302196d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -77,24 +77,6 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
-/*
- * 'strct resync_pages' stores actual pages used for doing the resync
- * IO, and it is per-bio, so make .bi_private points to it.
- */
-static inline struct resync_pages *get_resync_pages(struct bio *bio)
-{
- return bio->bi_private;
-}
-
-/*
- * for resync bio, r1bio pointer can be retrieved from the per-bio
- * 'struct resync_pages'.
- */
-static inline struct r1bio *get_resync_r1bio(struct bio *bio)
-{
- return get_resync_pages(bio)->raid_bio;
-}
-
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
@@ -109,8 +91,10 @@ static void r1bio_pool_free(void *r1_bio, void *data)
kfree(r1_bio);
}
+#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
@@ -122,18 +106,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct r1bio *r1_bio;
struct bio *bio;
int need_pages;
- int j;
- struct resync_pages *rps;
+ int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio)
return NULL;
- rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
- gfp_flags);
- if (!rps)
- goto out_free_r1bio;
-
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
@@ -153,22 +131,19 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
need_pages = pi->raid_disks;
else
need_pages = 1;
- for (j = 0; j < pi->raid_disks; j++) {
- struct resync_pages *rp = &rps[j];
-
+ for (j = 0; j < need_pages; j++) {
bio = r1_bio->bios[j];
+ bio->bi_vcnt = RESYNC_PAGES;
- if (j < need_pages) {
- if (resync_alloc_pages(rp, gfp_flags))
- goto out_free_pages;
- } else {
- memcpy(rp, &rps[0], sizeof(*rp));
- resync_get_all_pages(rp);
- }
-
- rp->idx = 0;
- rp->raid_bio = r1_bio;
- bio->bi_private = rp;
+ if (bio_alloc_pages(bio, gfp_flags))
+ goto out_free_pages;
+ }
+	/* If not user-requested, copy the page pointers to all bios */
+ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+ for (i=0; i<RESYNC_PAGES ; i++)
+ for (j=1; j<pi->raid_disks; j++)
+ r1_bio->bios[j]->bi_io_vec[i].bv_page =
+ r1_bio->bios[0]->bi_io_vec[i].bv_page;
}
r1_bio->master_bio = NULL;
@@ -177,14 +152,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j]);
+ bio_free_pages(r1_bio->bios[j]);
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
- kfree(rps);
-
-out_free_r1bio:
r1bio_pool_free(r1_bio, data);
return NULL;
}
@@ -192,18 +164,18 @@ out_free_r1bio:
static void r1buf_pool_free(void *__r1_bio, void *data)
{
struct pool_info *pi = data;
- int i;
+ int i,j;
struct r1bio *r1bio = __r1_bio;
- struct resync_pages *rp = NULL;
- for (i = pi->raid_disks; i--; ) {
- rp = get_resync_pages(r1bio->bios[i]);
- resync_free_pages(rp);
+ for (i = 0; i < RESYNC_PAGES; i++)
+ for (j = pi->raid_disks; j-- ;) {
+ if (j == 0 ||
+ r1bio->bios[j]->bi_io_vec[i].bv_page !=
+ r1bio->bios[0]->bi_io_vec[i].bv_page)
+ safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+ }
+ for (i=0 ; i < pi->raid_disks; i++)
bio_put(r1bio->bios[i]);
- }
-
- /* resync pages array stored in the 1st bio's .bi_private */
- kfree(rp);
r1bio_pool_free(r1bio, data);
}
@@ -270,17 +242,35 @@ static void reschedule_retry(struct r1bio *r1_bio)
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
+ int done;
struct r1conf *conf = r1_bio->mddev->private;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * make_request() might be waiting for
+ * bi_phys_segments to decrease
+ */
+ wake_up(&conf->wait_barrier);
+ } else
+ done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_error = -EIO;
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, r1_bio->sector);
+ if (done) {
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf, bi_sector);
+ }
}
static void raid_end_bio_io(struct r1bio *r1_bio)
@@ -384,9 +374,12 @@ static void close_write(struct r1bio *r1_bio)
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- bio_free_pages(r1_bio->behind_master_bio);
- bio_put(r1_bio->behind_master_bio);
- r1_bio->behind_master_bio = NULL;
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+ kfree(r1_bio->behind_bvecs);
+ r1_bio->behind_bvecs = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -488,10 +481,6 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
- /* we release behind master bio when all write are done */
- if (r1_bio->behind_master_bio == bio)
- to_put = NULL;
-
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -783,30 +772,6 @@ static int raid1_congested(struct mddev *mddev, int bits)
return ret;
}
-static void flush_bio_list(struct r1conf *conf, struct bio *bio)
-{
- /* flush any pending bitmap writes to disk before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
-}
-
static void flush_pending_writes(struct r1conf *conf)
{
/* Any writes that have been queued but are awaiting
@@ -819,7 +784,27 @@ static void flush_pending_writes(struct r1conf *conf)
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
- flush_bio_list(conf, bio);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
+ bio->bi_next = NULL;
+ bio->bi_bdev = rdev->bdev;
+ if (test_bit(Faulty, &rdev->flags)) {
+ bio->bi_error = -EIO;
+ bio_endio(bio);
+ } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -1039,7 +1024,7 @@ static int get_unqueued_pending(struct r1conf *conf)
static void freeze_array(struct r1conf *conf, int extra)
{
/* Stop sync I/O and normal I/O and wait for everything to
- * go quiet.
+	 * go quiet.
* This is called in two situations:
* 1) management command handlers (reshape, remove disk, quiesce).
* 2) one normal I/O request failed.
@@ -1080,49 +1065,39 @@ static void unfreeze_array(struct r1conf *conf)
wake_up(&conf->wait_barrier);
}
-static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio,
- struct bio *bio)
+/* duplicate the data pages for behind I/O
+ */
+static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
{
- int size = bio->bi_iter.bi_size;
- unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- int i = 0;
- struct bio *behind_bio = NULL;
-
- behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
- if (!behind_bio)
- goto fail;
-
- /* discard op, we don't support writezero/writesame yet */
- if (!bio_has_data(bio))
- goto skip_copy;
-
- while (i < vcnt && size) {
- struct page *page;
- int len = min_t(int, PAGE_SIZE, size);
-
- page = alloc_page(GFP_NOIO);
- if (unlikely(!page))
- goto free_pages;
-
- bio_add_page(behind_bio, page, len, 0);
-
- size -= len;
- i++;
- }
+ int i;
+ struct bio_vec *bvec;
+ struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+ GFP_NOIO);
+ if (unlikely(!bvecs))
+ return;
- bio_copy_data(behind_bio, bio);
-skip_copy:
- r1_bio->behind_master_bio = behind_bio;;
+ bio_for_each_segment_all(bvec, bio, i) {
+ bvecs[i] = *bvec;
+ bvecs[i].bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bvecs[i].bv_page))
+ goto do_sync_io;
+ memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(bvecs[i].bv_page);
+ kunmap(bvec->bv_page);
+ }
+ r1_bio->behind_bvecs = bvecs;
+ r1_bio->behind_page_count = bio->bi_vcnt;
set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
- return behind_bio;
-
-free_pages:
+do_sync_io:
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (bvecs[i].bv_page)
+ put_page(bvecs[i].bv_page);
+ kfree(bvecs);
pr_debug("%dB behind alloc failed, doing sync I/O\n",
bio->bi_iter.bi_size);
- bio_free_pages(behind_bio);
-fail:
- return behind_bio;
}
struct raid1_plug_cb {
@@ -1152,64 +1127,57 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- flush_bio_list(conf, bio);
- kfree(plug);
-}
+ bitmap_unplug(mddev->bitmap);
+ wake_up(&conf->wait_barrier);
-static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
-{
- r1_bio->master_bio = bio;
- r1_bio->sectors = bio_sectors(bio);
- r1_bio->state = 0;
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_iter.bi_sector;
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
+ bio->bi_next = NULL;
+ bio->bi_bdev = rdev->bdev;
+ if (test_bit(Faulty, &rdev->flags)) {
+ bio->bi_error = -EIO;
+ bio_endio(bio);
+ } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+ kfree(plug);
}
static inline struct r1bio *
-alloc_r1bio(struct mddev *mddev, struct bio *bio)
+alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
- /* Ensure no bio records IO_BLOCKED */
- memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
- init_r1bio(r1_bio, mddev, bio);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+
return r1_bio;
}
-static void raid1_read_request(struct mddev *mddev, struct bio *bio,
- int max_read_sectors, struct r1bio *r1_bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
+ struct r1bio *r1_bio;
struct bio *read_bio;
struct bitmap *bitmap = mddev->bitmap;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+ int sectors_handled;
int max_sectors;
int rdisk;
- bool print_msg = !!r1_bio;
- char b[BDEVNAME_SIZE];
-
- /*
- * If r1_bio is set, we are blocking the raid1d thread
- * so there is a tiny risk of deadlock. So ask for
- * emergency memory if needed.
- */
- gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
-
- if (print_msg) {
- /* Need to get the block device name carefully */
- struct md_rdev *rdev;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
- if (rdev)
- bdevname(rdev->bdev, b);
- else
- strcpy(b, "???");
- rcu_read_unlock();
- }
/*
* Still need barrier for READ in case that whole
@@ -1217,37 +1185,33 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_read_barrier(conf, bio->bi_iter.bi_sector);
- if (!r1_bio)
- r1_bio = alloc_r1bio(mddev, bio);
- else
- init_r1bio(r1_bio, mddev, bio);
- r1_bio->sectors = max_read_sectors;
+ r1_bio = alloc_r1bio(mddev, bio, 0);
+
+ /*
+ * We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r1_bio and no locking
+ * will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ bio_clear_flag(bio, BIO_SEG_VALID);
/*
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
*/
+read_again:
rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
- if (print_msg) {
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev),
- b,
- (unsigned long long)r1_bio->sector);
- }
raid_end_bio_io(r1_bio);
return;
}
mirror = conf->mirrors + rdisk;
- if (print_msg)
- pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
- mdname(mddev),
- (unsigned long long)r1_bio->sector,
- bdevname(mirror->rdev->bdev, b));
-
if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
/*
@@ -1258,20 +1222,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
-
- if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- r1_bio->master_bio = bio;
- r1_bio->sectors = max_sectors;
- }
-
r1_bio->read_disk = rdisk;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
r1_bio->bios[rdisk] = read_bio;
@@ -1290,11 +1245,35 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
- generic_make_request(read_bio);
+ if (max_sectors < r1_bio->sectors) {
+ /*
+ * could not read all from this device, so we will need another
+ * r1_bio.
+ */
+ sectors_handled = (r1_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+
+ /*
+ * Cannot call generic_make_request directly as that will be
+ * queued in __make_request and subsequent mempool_alloc might
+ * block waiting for it. So hand bio over to raid1d.
+ */
+ reschedule_retry(r1_bio);
+
+ r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
}
-static void raid1_write_request(struct mddev *mddev, struct bio *bio,
- int max_write_sectors)
+static void raid1_write_request(struct mddev *mddev, struct bio *bio)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
@@ -1305,6 +1284,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
int first_clone;
+ int sectors_handled;
int max_sectors;
/*
@@ -1343,8 +1323,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
}
wait_barrier(conf, bio->bi_iter.bi_sector);
- r1_bio = alloc_r1bio(mddev, bio);
- r1_bio->sectors = max_write_sectors;
+ r1_bio = alloc_r1bio(mddev, bio, 0);
+
+ /* We might need to issue multiple writes to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of writes in bio->bi_phys_segments.
+ * If this is 0, there is only one r1_bio and no locking
+ * will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
@@ -1443,26 +1432,32 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
goto retry_write;
}
- if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, conf->bio_split);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- r1_bio->master_bio = bio;
+ if (max_sectors < r1_bio->sectors) {
+ /* We are splitting this write into multiple parts, so
+ * we need to prepare for allocating another r1_bio.
+ */
r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
}
+ sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
first_clone = 1;
-
for (i = 0; i < disks; i++) {
- struct bio *mbio = NULL;
+ struct bio *mbio;
if (!r1_bio->bios[i])
continue;
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
if (first_clone) {
/* do behind I/O ?
@@ -1472,9 +1467,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait)) {
- mbio = alloc_behind_master_bio(r1_bio, bio);
- }
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(mbio, r1_bio);
bitmap_startwrite(bitmap, r1_bio->sector,
r1_bio->sectors,
@@ -1482,17 +1476,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
&r1_bio->state));
first_clone = 0;
}
+ if (r1_bio->behind_bvecs) {
+ struct bio_vec *bvec;
+ int j;
- if (!mbio) {
- if (r1_bio->behind_master_bio)
- mbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO,
- mddev->bio_set);
- else
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- }
-
- if (r1_bio->behind_master_bio) {
+ /*
+ * We trimmed the bio, so _all is legit
+ */
+ bio_for_each_segment_all(bvec, mbio, j)
+ bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -1536,6 +1528,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (!plug)
md_wakeup_thread(mddev->thread);
}
+ /* Mustn't call r1_bio_write_done before this next test,
+ * as it could result in the bio being freed.
+ */
+ if (sectors_handled < bio_sectors(bio)) {
+ r1_bio_write_done(r1_bio);
+ /* We need another r1_bio. It has already been counted
+ * in bio->bi_phys_segments
+ */
+ r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
+ goto retry_write;
+ }
r1_bio_write_done(r1_bio);
@@ -1545,6 +1548,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
static void raid1_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct bio *split;
sector_t sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
@@ -1552,20 +1556,22 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
return;
}
- /*
- * There is a limit to the maximum size, but
- * the read/write handler might find a lower limit
- * due to bad blocks. To avoid multiple splits,
- * we pass the maximum number of sectors down
- * and let the lower level perform the split.
- */
- sectors = align_to_barrier_unit_end(
- bio->bi_iter.bi_sector, bio_sectors(bio));
-
- if (bio_data_dir(bio) == READ)
- raid1_read_request(mddev, bio, sectors, NULL);
- else
- raid1_write_request(mddev, bio, sectors);
+ /* if bio exceeds barrier unit boundary, split it */
+ do {
+ sectors = align_to_barrier_unit_end(
+ bio->bi_iter.bi_sector, bio_sectors(bio));
+ if (sectors < bio_sectors(bio)) {
+ split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ if (bio_data_dir(split) == READ)
+ raid1_read_request(mddev, split);
+ else
+ raid1_write_request(mddev, split);
+ } while (split != bio);
}
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1840,7 +1846,7 @@ abort:
static void end_sync_read(struct bio *bio)
{
- struct r1bio *r1_bio = get_resync_r1bio(bio);
+ struct r1bio *r1_bio = bio->bi_private;
update_head_pos(r1_bio->read_disk, r1_bio);
@@ -1859,7 +1865,7 @@ static void end_sync_read(struct bio *bio)
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_error;
- struct r1bio *r1_bio = get_resync_r1bio(bio);
+ struct r1bio *r1_bio = bio->bi_private;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
sector_t first_bad;
@@ -1938,7 +1944,6 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct bio *bio = r1_bio->bios[r1_bio->read_disk];
- struct page **pages = get_resync_pages(bio)->pages;
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int idx = 0;
@@ -1972,7 +1977,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev, sect, s<<9,
- pages[idx],
+ bio->bi_io_vec[idx].bv_page,
REQ_OP_READ, 0, false)) {
success = 1;
break;
@@ -2027,7 +2032,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- pages[idx],
+ bio->bi_io_vec[idx].bv_page,
WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
@@ -2042,7 +2047,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- pages[idx],
+ bio->bi_io_vec[idx].bv_page,
READ) != 0)
atomic_add(s, &rdev->corrected_errors);
}
@@ -2076,9 +2081,7 @@ static void process_checks(struct r1bio *r1_bio)
int j;
int size;
int error;
- struct bio_vec *bi;
struct bio *b = r1_bio->bios[i];
- struct resync_pages *rp = get_resync_pages(b);
if (b->bi_end_io != end_sync_read)
continue;
/* fixup the bio for reuse, but preserve errno */
@@ -2091,11 +2094,12 @@ static void process_checks(struct r1bio *r1_bio)
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
b->bi_end_io = end_sync_read;
- rp->raid_bio = r1_bio;
- b->bi_private = rp;
+ b->bi_private = r1_bio;
size = b->bi_iter.bi_size;
- bio_for_each_segment_all(bi, b, j) {
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &b->bi_io_vec[j];
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
@@ -2117,24 +2121,20 @@ static void process_checks(struct r1bio *r1_bio)
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
int error = sbio->bi_error;
- struct page **ppages = get_resync_pages(pbio)->pages;
- struct page **spages = get_resync_pages(sbio)->pages;
- struct bio_vec *bi;
- int page_len[RESYNC_PAGES] = { 0 };
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_error = 0;
- bio_for_each_segment_all(bi, sbio, j)
- page_len[j] = bi->bv_len;
-
if (!error) {
for (j = vcnt; j-- ; ) {
- if (memcmp(page_address(ppages[j]),
- page_address(spages[j]),
- page_len[j]))
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ sbio->bi_io_vec[j].bv_len))
break;
}
} else
@@ -2181,8 +2181,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
(i == r1_bio->read_disk ||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
continue;
- if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
- continue;
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
@@ -2352,14 +2350,20 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
/* Write at 'sector' for 'sectors'*/
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- wbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO,
- mddev->bio_set);
- /* We really need a _all clone */
- wbio->bi_iter = (struct bvec_iter){ 0 };
+ unsigned vcnt = r1_bio->behind_page_count;
+ struct bio_vec *vec = r1_bio->behind_bvecs;
+
+ while (!vec->bv_page) {
+ vec++;
+ vcnt--;
+ }
+
+ wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+ memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+ wbio->bi_vcnt = vcnt;
} else {
- wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
- mddev->bio_set);
+ wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2455,8 +2459,11 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
+ int disk;
+ int max_sectors;
struct mddev *mddev = conf->mddev;
struct bio *bio;
+ char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
dev_t bio_dev;
sector_t bio_sector;
@@ -2472,6 +2479,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
*/
bio = r1_bio->bios[r1_bio->read_disk];
+ bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
bio_put(bio);
@@ -2489,12 +2497,61 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
}
rdev_dec_pending(rdev, conf->mddev);
- allow_barrier(conf, r1_bio->sector);
- bio = r1_bio->master_bio;
- /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
- r1_bio->state = 0;
- raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
+read_more:
+ disk = read_balance(conf, r1_bio, &max_sectors);
+ if (disk == -1) {
+ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b, (unsigned long long)r1_bio->sector);
+ raid_end_bio_io(r1_bio);
+ } else {
+ const unsigned long do_sync
+ = r1_bio->master_bio->bi_opf & REQ_SYNC;
+ r1_bio->read_disk = disk;
+ bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev, b));
+ bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_end_io = raid1_end_read_request;
+ bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
+ if (test_bit(FailFast, &rdev->flags) &&
+ test_bit(R1BIO_FailFast, &r1_bio->state))
+ bio->bi_opf |= MD_FAILFAST;
+ bio->bi_private = r1_bio;
+ if (max_sectors < r1_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r1_bio->master_bio;
+ int sectors_handled = (r1_bio->sector + max_sectors
+ - mbio->bi_iter.bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, bio_dev, bio_sector);
+ generic_make_request(bio);
+ bio = NULL;
+
+ r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
+ set_bit(R1BIO_ReadError, &r1_bio->state);
+
+ goto read_more;
+ } else {
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, bio_dev, bio_sector);
+ generic_make_request(bio);
+ }
+ }
}
static void raid1d(struct md_thread *thread)
@@ -2560,7 +2617,10 @@ static void raid1d(struct md_thread *thread)
else if (test_bit(R1BIO_ReadError, &r1_bio->state))
handle_read_error(conf, r1_bio);
else
- WARN_ON_ONCE(1);
+ /* just a partial read to be scheduled from separate
+ * context
+ */
+ generic_make_request(r1_bio->bios[r1_bio->read_disk]);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -2690,6 +2750,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
+ bio_reset(bio);
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
@@ -2745,6 +2806,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
+ bio->bi_private = r1_bio;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
}
@@ -2830,25 +2892,31 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
}
for (i = 0 ; i < conf->raid_disks * 2; i++) {
- struct resync_pages *rp;
-
bio = r1_bio->bios[i];
- rp = get_resync_pages(bio);
if (bio->bi_end_io) {
- page = resync_fetch_page(rp, rp->idx++);
-
- /*
- * won't fail because the vec table is big
- * enough to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0) == 0) {
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ while (i > 0) {
+ i--;
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io==NULL)
+ continue;
+ /* remove last page from this bio */
+ bio->bi_vcnt--;
+ bio->bi_iter.bi_size -= len;
+ bio_clear_flag(bio, BIO_SEG_VALID);
+ }
+ goto bio_full;
+ }
}
}
nr_sectors += len>>9;
sector_nr += len>>9;
sync_blocks -= (len>>9);
- } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES);
-
+ } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+ bio_full:
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
@@ -2948,15 +3016,12 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->r1bio_pool)
goto abort;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
- if (!conf->bio_split)
- goto abort;
-
conf->poolinfo->mddev = mddev;
err = -EINVAL;
spin_lock_init(&conf->device_lock);
rdev_for_each(rdev, mddev) {
+ struct request_queue *q;
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -2969,6 +3034,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev)
goto abort;
disk->rdev = rdev;
+ q = bdev_get_queue(rdev->bdev);
+
disk->head_position = 0;
disk->seq_start = MaxSector;
}
@@ -3030,8 +3097,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3137,8 +3202,6 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
kfree(conf);
}
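Much of the raid1.c revert above reinstates the bi_phys_segments scheme: when bad blocks force a read or write to be split into several r1_bios, the master bio carries a count of outstanding pieces and is only completed once that count drops to zero (see call_bio_endio() and the "bi_phys_segments = 2" sites). The sketch below is a user-space rendering of that completion-counting pattern, with a mutex standing in for conf->device_lock and hypothetical names throughout.

#include <pthread.h>
#include <stdio.h>

/* hypothetical master request split into sub-requests; "segments"
 * plays the role bio->bi_phys_segments plays in the restored code */
struct master_req {
	pthread_mutex_t lock;
	int segments;		/* 0 means "not split", else pieces left */
	int completed;
};

static void master_init(struct master_req *m)
{
	pthread_mutex_init(&m->lock, NULL);
	m->segments = 0;
	m->completed = 0;
}

/* first split sets the count to 2 (the piece just issued plus the
 * remainder); later splits just bump the count */
static void master_add_split(struct master_req *m)
{
	pthread_mutex_lock(&m->lock);
	if (m->segments == 0)
		m->segments = 2;
	else
		m->segments++;
	pthread_mutex_unlock(&m->lock);
}

/* completion path, mirroring call_bio_endio(): end the master request
 * only when the last outstanding piece finishes */
static void sub_req_done(struct master_req *m)
{
	int done;

	pthread_mutex_lock(&m->lock);
	if (m->segments) {
		m->segments--;
		done = (m->segments == 0);
	} else {
		done = 1;	/* request was never split */
	}
	pthread_mutex_unlock(&m->lock);

	if (done)
		m->completed = 1;	/* stands in for bio_endio() */
}

int main(void)
{
	struct master_req m;

	master_init(&m);
	master_add_split(&m);	/* split into 2 pieces */
	master_add_split(&m);	/* and again: 3 pieces total */
	sub_req_done(&m);
	sub_req_done(&m);
	sub_req_done(&m);
	printf("completed = %d\n", m.completed);
	return 0;
}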
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index b0ab0da6e39e..dd22a37d0d83 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -107,8 +107,6 @@ struct r1conf {
mempool_t *r1bio_pool;
mempool_t *r1buf_pool;
- struct bio_set *bio_split;
-
/* temporary buffer to synchronous IO when attempting to repair
* a read error.
*/
@@ -155,13 +153,9 @@ struct r1bio {
int read_disk;
struct list_head retry_list;
-
- /*
- * When R1BIO_BehindIO is set, we store pages for write behind
- * in behind_master_bio.
- */
- struct bio *behind_master_bio;
-
+ /* Next two are only valid when R1BIO_BehindIO is set */
+ struct bio_vec *behind_bvecs;
+ int behind_page_count;
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 83e502b8c138..629fd38b86fe 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,24 +110,6 @@ static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
-/*
- * 'strct resync_pages' stores actual pages used for doing the resync
- * IO, and it is per-bio, so make .bi_private points to it.
- */
-static inline struct resync_pages *get_resync_pages(struct bio *bio)
-{
- return bio->bi_private;
-}
-
-/*
- * for resync bio, r10bio pointer can be retrieved from the per-bio
- * 'struct resync_pages'.
- */
-static inline struct r10bio *get_resync_r10bio(struct bio *bio)
-{
- return get_resync_pages(bio)->raid_bio;
-}
-
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
@@ -143,6 +125,9 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
@@ -158,11 +143,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
+ struct page *page;
struct r10bio *r10_bio;
struct bio *bio;
- int j;
- int nalloc, nalloc_rp;
- struct resync_pages *rps;
+ int i, j;
+ int nalloc;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
if (!r10_bio)
@@ -174,15 +159,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
else
nalloc = 2; /* recovery */
- /* allocate once for all bios */
- if (!conf->have_replacement)
- nalloc_rp = nalloc;
- else
- nalloc_rp = nalloc * 2;
- rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
- if (!rps)
- goto out_free_r10bio;
-
/*
* Allocate bios.
*/
@@ -202,40 +178,36 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
- for (j = 0; j < nalloc; j++) {
+ for (j = 0 ; j < nalloc; j++) {
struct bio *rbio = r10_bio->devs[j].repl_bio;
- struct resync_pages *rp, *rp_repl;
-
- rp = &rps[j];
- if (rbio)
- rp_repl = &rps[nalloc + j];
-
bio = r10_bio->devs[j].bio;
-
- if (!j || test_bit(MD_RECOVERY_SYNC,
- &conf->mddev->recovery)) {
- if (resync_alloc_pages(rp, gfp_flags))
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ /* we can share bv_page's during recovery
+ * and reshape */
+ struct bio *rbio = r10_bio->devs[0].bio;
+ page = rbio->bi_io_vec[i].bv_page;
+ get_page(page);
+ } else
+ page = alloc_page(gfp_flags);
+ if (unlikely(!page))
goto out_free_pages;
- } else {
- memcpy(rp, &rps[0], sizeof(*rp));
- resync_get_all_pages(rp);
- }
- rp->idx = 0;
- rp->raid_bio = r10_bio;
- bio->bi_private = rp;
- if (rbio) {
- memcpy(rp_repl, rp, sizeof(*rp));
- rbio->bi_private = rp_repl;
+ bio->bi_io_vec[i].bv_page = page;
+ if (rbio)
+ rbio->bi_io_vec[i].bv_page = page;
}
}
return r10_bio;
out_free_pages:
- while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
-
+ for ( ; i > 0 ; i--)
+ safe_put_page(bio->bi_io_vec[i-1].bv_page);
+ while (j--)
+ for (i = 0; i < RESYNC_PAGES ; i++)
+ safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
j = 0;
out_free_bio:
for ( ; j < nalloc; j++) {
@@ -244,34 +216,30 @@ out_free_bio:
if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio);
}
- kfree(rps);
-out_free_r10bio:
r10bio_pool_free(r10_bio, conf);
return NULL;
}
static void r10buf_pool_free(void *__r10_bio, void *data)
{
+ int i;
struct r10conf *conf = data;
struct r10bio *r10bio = __r10_bio;
int j;
- struct resync_pages *rp = NULL;
- for (j = conf->copies; j--; ) {
+ for (j=0; j < conf->copies; j++) {
struct bio *bio = r10bio->devs[j].bio;
-
- rp = get_resync_pages(bio);
- resync_free_pages(rp);
- bio_put(bio);
-
+ if (bio) {
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ safe_put_page(bio->bi_io_vec[i].bv_page);
+ bio->bi_io_vec[i].bv_page = NULL;
+ }
+ bio_put(bio);
+ }
bio = r10bio->devs[j].repl_bio;
if (bio)
bio_put(bio);
}
-
- /* resync pages array stored in the 1st bio's .bi_private */
- kfree(rp);
-
r10bio_pool_free(r10bio, conf);
}
@@ -333,18 +301,27 @@ static void reschedule_retry(struct r10bio *r10_bio)
static void raid_end_bio_io(struct r10bio *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
+ int done;
struct r10conf *conf = r10_bio->mddev->private;
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_error = -EIO;
-
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
-
+ if (done) {
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+ }
free_r10bio(r10_bio);
}
@@ -1118,41 +1095,12 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+ int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
- char b[BDEVNAME_SIZE];
- int slot = r10_bio->read_slot;
- struct md_rdev *err_rdev = NULL;
- gfp_t gfp = GFP_NOIO;
-
- if (r10_bio->devs[slot].rdev) {
- /*
- * This is an error retry, but we cannot
- * safely dereference the rdev in the r10_bio,
- * we must use the one in conf.
- * If it has already been disconnected (unlikely)
- * we lose the device name in error messages.
- */
- int disk;
- /*
- * As we are blocking raid10, it is a little safer to
- * use __GFP_HIGH.
- */
- gfp = GFP_NOIO | __GFP_HIGH;
+ int slot;
- rcu_read_lock();
- disk = r10_bio->devs[slot].devnum;
- err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (err_rdev)
- bdevname(err_rdev->bdev, b);
- else {
- strcpy(b, "???");
- /* This never gets dereferenced */
- err_rdev = r10_bio->devs[slot].rdev;
- }
- rcu_read_unlock();
- }
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1160,7 +1108,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = r10_bio->sectors;
+ sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1177,33 +1125,17 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
wait_barrier(conf);
}
+read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
- if (err_rdev) {
- pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b,
- (unsigned long long)r10_bio->sector);
- }
raid_end_bio_io(r10_bio);
return;
}
- if (err_rdev)
- pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- (unsigned long long)r10_bio->sector);
- if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- r10_bio->master_bio = bio;
- r10_bio->sectors = max_sectors;
- }
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1222,77 +1154,39 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- generic_make_request(read_bio);
- return;
-}
-
-static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
- struct bio *bio, bool replacement,
- int n_copy)
-{
- const int op = bio_op(bio);
- const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
- unsigned long flags;
- struct blk_plug_cb *cb;
- struct raid10_plug_cb *plug = NULL;
- struct r10conf *conf = mddev->private;
- struct md_rdev *rdev;
- int devnum = r10_bio->devs[n_copy].devnum;
- struct bio *mbio;
-
- if (replacement) {
- rdev = conf->mirrors[devnum].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[devnum].rdev;
- }
- } else
- rdev = conf->mirrors[devnum].rdev;
-
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- if (replacement)
- r10_bio->devs[n_copy].repl_bio = mbio;
- else
- r10_bio->devs[n_copy].bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- if (!replacement && test_bit(FailFast,
- &conf->mirrors[devnum].rdev->flags)
- && enough(conf, devnum))
- mbio->bi_opf |= MD_FAILFAST;
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void *)rdev;
+ if (max_sectors < r10_bio->sectors) {
+ /*
+ * Could not read all from this device, so we will need another
+ * r10_bio.
+ */
+ sectors_handled = (r10_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /*
+ * Cannot call generic_make_request directly as that will be
+ * queued in __generic_make_request and subsequent
+ * mempool_alloc might block waiting for it, so hand the bio over
+ * to raid10d.
+ */
+ reschedule_retry(r10_bio);
- atomic_inc(&r10_bio->remaining);
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
- cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb, cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r10_bio->state = 0;
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
+ return;
}
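Editorial note, not part of the patch: the restored read path above issues as much of the request as the chosen mirror allows, records the number of outstanding r10_bios in bio->bi_phys_segments (0 means the request was never split) and loops back to read_again for the remainder. Below is a minimal userspace sketch of that accounting only; max_readable() is a hypothetical stand-in for read_balance() and the numbers are made up.

#include <stdio.h>

/* Hypothetical stand-in for read_balance(): how many sectors the
 * chosen mirror can serve starting at the given sector. */
static unsigned long max_readable(unsigned long long sector)
{
	(void)sector;
	return 128;	/* pretend every mirror serves at most 128 sectors */
}

int main(void)
{
	unsigned long long start = 1000;	/* request start sector */
	unsigned long total = 300;		/* request length in sectors */
	unsigned long handled = 0;
	unsigned int phys_segments = 0;		/* mirrors bio->bi_phys_segments */

	while (handled < total) {
		unsigned long chunk = max_readable(start + handled);

		if (chunk > total - handled)
			chunk = total - handled;
		/* a remainder is left over: account for one more r10_bio */
		if (handled + chunk < total)
			phys_segments = phys_segments ? phys_segments + 1 : 2;
		printf("issue read at %llu for %lu sectors (segments=%u)\n",
		       start + handled, chunk, phys_segments);
		handled += chunk;
	}
	return 0;
}

For a 300-sector read limited to 128 sectors per pass this prints three issue lines, matching the three r10_bios the driver would allocate.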
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
@@ -1300,8 +1194,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
{
struct r10conf *conf = mddev->private;
int i;
+ const int op = bio_op(bio);
+ const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+ const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
+ unsigned long flags;
struct md_rdev *blocked_rdev;
+ struct blk_plug_cb *cb;
+ struct raid10_plug_cb *plug = NULL;
sector_t sectors;
+ int sectors_handled;
int max_sectors;
md_write_start(mddev, bio);
@@ -1313,7 +1214,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = r10_bio->sectors;
+ sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1361,7 +1262,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* on which we have seen a write error, we want to avoid
* writing to those blocks. This potentially requires several
* writes to write around the bad blocks. Each set of writes
- * gets its own r10_bio with a set of bios attached.
+ * gets its own r10_bio with a set of bios attached. The number
+ * of r10_bios is recorded in bio->bi_phys_segments just as with
+ * the read case.
*/
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1481,31 +1384,132 @@ retry_write:
goto retry_write;
}
- if (max_sectors < r10_bio->sectors)
+ if (max_sectors < r10_bio->sectors) {
+ /* We are splitting this into multiple parts, so
+ * we need to prepare for allocating another r10_bio.
+ */
r10_bio->sectors = max_sectors;
-
- if (r10_bio->sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, conf->bio_split);
- bio_chain(split, bio);
- generic_make_request(bio);
- bio = split;
- r10_bio->master_bio = bio;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
}
+ sectors_handled = r10_bio->sector + max_sectors -
+ bio->bi_iter.bi_sector;
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
- if (r10_bio->devs[i].bio)
- raid10_write_one_disk(mddev, r10_bio, bio, false, i);
- if (r10_bio->devs[i].repl_bio)
- raid10_write_one_disk(mddev, r10_bio, bio, true, i);
+ struct bio *mbio;
+ int d = r10_bio->devs[i].devnum;
+ if (r10_bio->devs[i].bio) {
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r10_bio->devs[i].bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
+ choose_data_offset(r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
+ enough(conf, d))
+ mbio->bi_opf |= MD_FAILFAST;
+ mbio->bi_private = r10_bio;
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+ mbio, disk_devt(conf->mddev->gendisk),
+ r10_bio->sector);
+ /* flush_pending_writes() needs access to the rdev so...*/
+ mbio->bi_bdev = (void*)rdev;
+
+ atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev,
+ sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+ }
+
+ if (r10_bio->devs[i].repl_bio) {
+ struct md_rdev *rdev = conf->mirrors[d].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r10_bio->devs[i].repl_bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
+ choose_data_offset(r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ mbio->bi_private = r10_bio;
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+ mbio, disk_devt(conf->mddev->gendisk),
+ r10_bio->sector);
+ /* flush_pending_writes() needs access to the rdev so...*/
+ mbio->bi_bdev = (void*)rdev;
+
+ atomic_inc(&r10_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!mddev_check_plugged(mddev))
+ md_wakeup_thread(mddev->thread);
+ }
+ }
+
+ /* Don't remove the bias on 'remaining' (one_write_done) until
+ * after checking if we need to go around again.
+ */
+
+ if (sectors_handled < bio_sectors(bio)) {
+ one_write_done(r10_bio);
+ /* We need another r10_bio. It has already been counted
+ * in bio->bi_phys_segments.
+ */
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ r10_bio->state = 0;
+ goto retry_write;
}
one_write_done(r10_bio);
}
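Editorial note, not part of the patch: the comment above about not removing the bias on 'remaining' describes the usual submitter-held reference on a completion counter; r10_bio->remaining starts at 1 so that copies finishing while the loop is still queueing more copies (or deciding whether to go around again) cannot complete the master bio early. A rough sketch of the pattern under C11 atomics; write_done() is a hypothetical stand-in for one_write_done() and completions are simulated inline.

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical completion hook standing in for one_write_done(). */
static void write_done(atomic_int *remaining)
{
	if (atomic_fetch_sub(remaining, 1) == 1)
		printf("all copies finished, end the master bio\n");
}

int main(void)
{
	atomic_int remaining = 1;	/* bias: the submitter holds one reference */
	int copies = 3;

	for (int i = 0; i < copies; i++) {
		atomic_fetch_add(&remaining, 1);	/* one reference per mbio */
		write_done(&remaining);			/* simulate its completion */
	}
	/* only after every copy is queued and the go-around decision is made
	 * does the submitter drop its bias */
	write_done(&remaining);
	return 0;
}

Even though every simulated copy completes immediately, the final message is printed only when the submitter drops its own reference.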
-static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
+static void __make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
@@ -1513,12 +1517,21 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = sectors;
+ r10_bio->sectors = bio_sectors(bio);
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
- memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
+
+ /*
+ * We might need to issue multiple reads to different devices if there
+ * are bad blocks around, so we keep track of the number of reads in
+ * bio->bi_phys_segments. If this is 0, there is only one r10_bio and
+ * no locking will be needed when the request completes. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1531,26 +1544,54 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
- int sectors = bio_sectors(bio);
+
+ struct bio *split;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- /*
- * If this request crosses a chunk boundary, we need to split
- * it.
- */
- if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
- sectors > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies <
- conf->prev.raid_disks)))
- sectors = chunk_sects -
- (bio->bi_iter.bi_sector &
- (chunk_sects - 1));
- __make_request(mddev, bio, sectors);
+ do {
+
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ bio_sectors(bio) > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks))) {
+ split = bio_split(bio, chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1)),
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ /*
+ * If a bio is split, the first part of the bio will pass the
+ * barrier but the bio is queued in current->bio_list (see
+ * generic_make_request). If there is a raise_barrier() called
+ * here, the second part of the bio can't pass the barrier. But since
+ * the first part of the bio isn't dispatched to the underlying disks
+ * yet, the barrier is never released, hence raise_barrier will
+ * always wait. We have a deadlock.
+ * Note, this only happens in the read path. For the write path, the
+ * first part of the bio is dispatched in a schedule() call
+ * (because of blk plug) or offloaded to raid10d.
+ * Quitting from the function immediately can change the bio
+ * order queued in bio_list and avoid the deadlock.
+ */
+ __make_request(mddev, split);
+ if (split != bio && bio_data_dir(bio) == READ) {
+ generic_make_request(bio);
+ break;
+ }
+ } while (split != bio);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
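Editorial note, not part of the patch: when a request crosses a chunk boundary, the loop restored above splits off chunk_sects - (bi_sector & (chunk_sects - 1)) sectors and chains the remainder, repeating until nothing crosses a boundary. The sketch below only demonstrates that split-length arithmetic with made-up numbers; the real code also checks the near_copies/raid_disks condition before splitting.

#include <stdio.h>

int main(void)
{
	unsigned long chunk_sects = 1024;	/* e.g. 512KiB chunks */
	unsigned long chunk_mask = chunk_sects - 1;
	unsigned long long sector = 1000;	/* bio start sector */
	unsigned long sectors = 100;		/* bio length in sectors */

	if ((sector & chunk_mask) + sectors > chunk_sects) {
		unsigned long first = chunk_sects - (sector & chunk_mask);

		printf("split: first part %lu sectors, remainder %lu sectors\n",
		       first, sectors - first);
	} else {
		printf("no split needed\n");
	}
	return 0;
}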
@@ -1893,9 +1934,17 @@ abort:
return err;
}
-static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
+static void end_sync_read(struct bio *bio)
{
+ struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
+ int d;
+
+ if (bio == r10_bio->master_bio) {
+ /* this is a reshape read */
+ d = r10_bio->read_slot; /* really the read dev */
+ } else
+ d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (!bio->bi_error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1919,23 +1968,6 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
}
}
-static void end_sync_read(struct bio *bio)
-{
- struct r10bio *r10_bio = get_resync_r10bio(bio);
- struct r10conf *conf = r10_bio->mddev->private;
- int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
-
- __end_sync_read(r10_bio, bio, d);
-}
-
-static void end_reshape_read(struct bio *bio)
-{
- /* reshape read bio isn't allocated from r10buf_pool */
- struct r10bio *r10_bio = bio->bi_private;
-
- __end_sync_read(r10_bio, bio, r10_bio->read_slot);
-}
-
static void end_sync_request(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
@@ -1965,7 +1997,7 @@ static void end_sync_request(struct r10bio *r10_bio)
static void end_sync_write(struct bio *bio)
{
- struct r10bio *r10_bio = get_resync_r10bio(bio);
+ struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
@@ -2024,7 +2056,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int i, first;
struct bio *tbio, *fbio;
int vcnt;
- struct page **tpages, **fpages;
atomic_set(&r10_bio->remaining, 1);
@@ -2040,14 +2071,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
fbio = r10_bio->devs[i].bio;
fbio->bi_iter.bi_size = r10_bio->sectors << 9;
fbio->bi_iter.bi_idx = 0;
- fpages = get_resync_pages(fbio)->pages;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
for (i=0 ; i < conf->copies ; i++) {
int j, d;
struct md_rdev *rdev;
- struct resync_pages *rp;
tbio = r10_bio->devs[i].bio;
@@ -2055,8 +2084,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
if (i == first)
continue;
-
- tpages = get_resync_pages(tbio)->pages;
d = r10_bio->devs[i].devnum;
rdev = conf->mirrors[d].rdev;
if (!r10_bio->devs[i].bio->bi_error) {
@@ -2069,8 +2096,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int len = PAGE_SIZE;
if (sectors < (len / 512))
len = sectors * 512;
- if (memcmp(page_address(fpages[j]),
- page_address(tpages[j]),
+ if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+ page_address(tbio->bi_io_vec[j].bv_page),
len))
break;
sectors -= len/512;
@@ -2091,13 +2118,11 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
- rp = get_resync_pages(tbio);
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
- rp->raid_bio = r10_bio;
- tbio->bi_private = rp;
+ tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
tbio->bi_end_io = end_sync_write;
bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -2168,7 +2193,6 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
int idx = 0;
int dr = r10_bio->devs[0].devnum;
int dw = r10_bio->devs[1].devnum;
- struct page **pages = get_resync_pages(bio)->pages;
while (sectors) {
int s = sectors;
@@ -2184,7 +2208,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- pages[idx],
+ bio->bi_io_vec[idx].bv_page,
REQ_OP_READ, 0, false);
if (ok) {
rdev = conf->mirrors[dw].rdev;
@@ -2192,7 +2216,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- pages[idx],
+ bio->bi_io_vec[idx].bv_page,
REQ_OP_WRITE, 0, false);
if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
@@ -2564,7 +2588,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
- wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
@@ -2592,6 +2616,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
+ char b[BDEVNAME_SIZE];
+ unsigned long do_sync;
+ int max_sectors;
dev_t bio_dev;
sector_t bio_last_sector;
@@ -2604,6 +2631,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
+ bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
@@ -2619,9 +2647,70 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
- allow_barrier(conf);
- r10_bio->state = 0;
- raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+
+read_more:
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (rdev == NULL) {
+ pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ raid_end_bio_io(r10_bio);
+ return;
+ }
+
+ do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
+ slot = r10_bio->read_slot;
+ pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ bio = bio_clone_mddev(r10_bio->master_bio,
+ GFP_NOIO, mddev);
+ bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+ r10_bio->devs[slot].bio = bio;
+ r10_bio->devs[slot].rdev = rdev;
+ bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
+ + choose_data_offset(r10_bio, rdev);
+ bio->bi_bdev = rdev->bdev;
+ bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
+ if (test_bit(FailFast, &rdev->flags) &&
+ test_bit(R10BIO_FailFast, &r10_bio->state))
+ bio->bi_opf |= MD_FAILFAST;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = raid10_end_read_request;
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, bio_dev,
+ bio_last_sector - r10_bio->sectors);
+
+ if (max_sectors < r10_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r10_bio->master_bio;
+ int sectors_handled =
+ r10_bio->sector + max_sectors
+ - mbio->bi_iter.bi_sector;
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ generic_make_request(bio);
+
+ r10_bio = mempool_alloc(conf->r10bio_pool,
+ GFP_NOIO);
+ r10_bio->master_bio = mbio;
+ r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
+ r10_bio->state = 0;
+ set_bit(R10BIO_ReadError,
+ &r10_bio->state);
+ r10_bio->mddev = mddev;
+ r10_bio->sector = mbio->bi_iter.bi_sector
+ + sectors_handled;
+
+ goto read_more;
+ } else
+ generic_make_request(bio);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
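Editorial note, not part of the patch: the read_more path restored in handle_read_error() above asks read_balance() for another mirror and reissues the read; if that read fails as well, the whole sequence repeats via raid10d until no mirror is left, at which point the 'unrecoverable I/O read error' message is printed. A toy sketch of that overall retry behaviour; mirror_read() and the faulty[] array are invented for the demo and do not correspond to kernel APIs.

#include <stdio.h>

#define COPIES 3

/* Invented per-mirror read; mirror 0 always fails in this demo. */
static int mirror_read(int mirror, long long sector)
{
	(void)sector;
	return mirror != 0;	/* 0 means an I/O error */
}

int main(void)
{
	int faulty[COPIES] = {0};
	long long sector = 12345;

	for (;;) {
		int m, chosen = -1;

		/* crude read_balance(): first mirror not marked faulty */
		for (m = 0; m < COPIES; m++) {
			if (!faulty[m]) {
				chosen = m;
				break;
			}
		}
		if (chosen < 0) {
			printf("unrecoverable read error at %lld\n", sector);
			return 1;
		}
		if (mirror_read(chosen, sector)) {
			printf("sector %lld served by mirror %d\n", sector, chosen);
			return 0;
		}
		printf("redirecting sector %lld away from mirror %d\n",
		       sector, chosen);
		faulty[chosen] = 1;	/* md_error() would mark the rdev Faulty */
	}
}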
@@ -2708,11 +2797,6 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock);
- /*
- * In case freeze_array() is waiting for condition
- * nr_pending == nr_queued + extra to be true.
- */
- wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R10BIO_WriteError,
@@ -2787,8 +2871,13 @@ static void raid10d(struct md_thread *thread)
recovery_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_ReadError, &r10_bio->state))
handle_read_error(mddev, r10_bio);
- else
- WARN_ON_ONCE(1);
+ else {
+ /* just a partial read to be scheduled from a
+ * separate context
+ */
+ int slot = r10_bio->read_slot;
+ generic_make_request(r10_bio->devs[slot].bio);
+ }
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3102,8 +3191,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
bio = r10_bio->devs[0].bio;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
+ bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
@@ -3127,8 +3218,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!test_bit(In_sync, &mrdev->flags)) {
bio = r10_bio->devs[1].bio;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
+ bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr
@@ -3153,8 +3246,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace == NULL || bio == NULL ||
test_bit(Faulty, &mreplace->flags))
break;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
+ bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr +
@@ -3276,6 +3371,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
+ bio_reset(bio);
bio->bi_error = -EIO;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3300,6 +3396,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
biolist = bio;
+ bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3318,11 +3415,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
+ bio_reset(bio);
bio->bi_error = -EIO;
sector = r10_bio->devs[i].addr;
bio->bi_next = biolist;
biolist = bio;
+ bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3361,17 +3460,27 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
- struct resync_pages *rp = get_resync_pages(bio);
- page = resync_fetch_page(rp, rp->idx++);
- /*
- * won't fail because the vec table is big enough
- * to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ struct bio *bio2;
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ for (bio2 = biolist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_iter.bi_size -= len;
+ bio_clear_flag(bio2, BIO_SEG_VALID);
+ }
+ goto bio_full;
}
nr_sectors += len>>9;
sector_nr += len>>9;
- } while (get_resync_pages(biolist)->idx < RESYNC_PAGES);
+ } while (biolist->bi_vcnt < RESYNC_PAGES);
+ bio_full:
r10_bio->sectors = nr_sectors;
while (biolist) {
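Editorial note, not part of the patch: the page-filling loop restored above adds the same page to every bio in biolist; if one bio_add_page() call refuses the page, the page already added to the earlier bios is taken back out (bi_vcnt is decremented and bi_iter.bi_size reduced) so that all bios describe the same number of sectors, and the loop jumps to bio_full. A small userspace sketch of that roll-back pattern, using invented per-bio capacities instead of real bio limits.

#include <stdio.h>

#define NBIOS 3

int main(void)
{
	int cap[NBIOS] = {5, 4, 6};	/* invented per-bio page capacity */
	int vcnt[NBIOS] = {0};		/* pages currently held by each bio */

	for (;;) {
		int i, full = -1;

		for (i = 0; i < NBIOS; i++) {
			if (vcnt[i] < cap[i]) {	/* bio_add_page() succeeded */
				vcnt[i]++;
				continue;
			}
			full = i;		/* this bio rejected the page */
			break;
		}
		if (full < 0)
			continue;
		/* take the page back out of every earlier bio so that all
		 * bios end up covering the same number of sectors */
		for (i = 0; i < full; i++)
			vcnt[i]--;
		break;			/* the driver's "goto bio_full" */
	}
	printf("every bio holds %d pages\n", vcnt[0]);
	return 0;
}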
@@ -3379,7 +3488,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = biolist->bi_next;
bio->bi_next = NULL;
- r10_bio = get_resync_r10bio(bio);
+ r10_bio = bio->bi_private;
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
@@ -3561,10 +3670,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->r10bio_pool)
goto out;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
- if (!conf->bio_split)
- goto out;
-
calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
@@ -3602,8 +3707,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3648,6 +3751,7 @@ static int raid10_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
+ struct request_queue *q;
disk_idx = rdev->raid_disk;
if (disk_idx < 0)
@@ -3666,6 +3770,7 @@ static int raid10_run(struct mddev *mddev)
goto out_free_conf;
disk->rdev = rdev;
}
+ q = bdev_get_queue(rdev->bdev);
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
@@ -3682,7 +3787,6 @@ static int raid10_run(struct mddev *mddev)
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
- first = 0;
}
if (mddev->queue) {
@@ -3812,8 +3916,6 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
kfree(conf);
}
@@ -4088,7 +4190,6 @@ static int raid10_start_reshape(struct mddev *mddev)
if (first || diff < min_offset_diff)
min_offset_diff = diff;
}
- first = 0;
}
if (max(before_length, after_length) > min_offset_diff)
@@ -4278,7 +4379,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
struct bio *blist;
struct bio *bio, *read_bio;
int sectors_done = 0;
- struct page **pages;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -4399,7 +4499,7 @@ read_more:
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
- read_bio->bi_end_io = end_reshape_read;
+ read_bio->bi_end_io = end_sync_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_error = 0;
@@ -4429,9 +4529,11 @@ read_more:
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
continue;
+ bio_reset(b);
b->bi_bdev = rdev2->bdev;
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
+ b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
b->bi_next = blist;
@@ -4441,22 +4543,31 @@ read_more:
/* Now add as many pages as possible to all of these bios. */
nr_sectors = 0;
- pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
- struct page *page = pages[s / (PAGE_SIZE >> 9)];
+ struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
int len = (max_sectors - s) << 9;
if (len > PAGE_SIZE)
len = PAGE_SIZE;
for (bio = blist; bio ; bio = bio->bi_next) {
- /*
- * won't fail because the vec table is big enough
- * to hold all these pages
- */
- bio_add_page(bio, page, len, 0);
+ struct bio *bio2;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* Didn't fit, must stop */
+ for (bio2 = blist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* Remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_iter.bi_size -= len;
+ bio_clear_flag(bio2, BIO_SEG_VALID);
+ }
+ goto bio_full;
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
+bio_full:
rcu_read_unlock();
r10_bio->sectors = nr_sectors;
@@ -4570,10 +4681,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10b = &on_stack.r10_bio;
int slot = 0;
int idx = 0;
- struct page **pages;
-
- /* reshape IOs share pages from .devs[0].bio */
- pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
+ struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
r10b->sector = r10_bio->sector;
__raid10_find_phys(&conf->prev, r10b);
@@ -4602,7 +4710,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
success = sync_page_io(rdev,
addr,
s << 9,
- pages[idx],
+ bvec[idx].bv_page,
REQ_OP_READ, 0, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
@@ -4630,7 +4738,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
static void end_reshape_write(struct bio *bio)
{
- struct r10bio *r10_bio = get_resync_r10bio(bio);
+ struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 735ce1a3d260..3162615e57bd 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,7 +82,6 @@ struct r10conf {
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
- struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index fd6a6389973c..2a5a414ee54f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -23,6 +23,7 @@
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
+#include "raid5-log.h"
#include "bitmap.h"
/*
@@ -30,7 +31,6 @@
* underneath hardware sector size. only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
-#define BLOCK_SECTOR_SHIFT (3)
/*
* log->max_free_space is min(1/4 disk size, 10G reclaimable space).
@@ -44,7 +44,7 @@
/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
-#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
+#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
@@ -308,7 +308,8 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
}
static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
+ struct bio_list *return_bi)
{
struct bio *wbi, *wbi2;
@@ -317,21 +318,24 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- md_write_end(conf->mddev);
- bio_endio(wbi);
+ if (!raid5_dec_bi_active_stripes(wbi)) {
+ md_write_end(conf->mddev);
+ bio_list_add(return_bi, wbi);
+ }
wbi = wbi2;
}
}
void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
for (i = sh->disks; i--; ) {
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
- r5c_return_dev_pending_writes(conf, &sh->dev[i]);
+ r5c_return_dev_pending_writes(conf, &sh->dev[i],
+ return_bi);
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
@@ -340,8 +344,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
}
}
-void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-
/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
@@ -380,7 +382,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
* or a full stripe (chunk size / 4k stripes).
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
- min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
+ min(R5C_FULL_STRIPE_FLUSH_BATCH,
conf->chunk_sectors >> STRIPE_SHIFT))
r5l_wake_reclaim(conf->log, 0);
}
@@ -589,7 +591,7 @@ static void r5l_log_endio(struct bio *bio)
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
- if (log->need_cache_flush && !list_empty(&io->stripe_list))
+ if (log->need_cache_flush)
r5l_move_to_end_ios(log);
else
r5l_log_run_stripes(log);
@@ -617,11 +619,9 @@ static void r5l_log_endio(struct bio *bio)
bio_endio(bi);
atomic_dec(&io->pending_stripe);
}
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
}
-
- /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
- if (atomic_read(&io->pending_stripe) == 0)
- __r5l_stripe_write_finished(io);
}
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@ -843,41 +843,6 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
r5_reserve_log_entry(log, io);
}
-static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
-{
- struct mddev *mddev = log->rdev->mddev;
- struct r5conf *conf = mddev->private;
- struct r5l_io_unit *io;
- struct r5l_payload_flush *payload;
- int meta_size;
-
- /*
- * payload_flush requires extra writes to the journal.
- * To avoid handling the extra IO in quiesce, just skip
- * flush_payload
- */
- if (conf->quiesce)
- return;
-
- mutex_lock(&log->io_mutex);
- meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
-
- if (r5l_get_meta(log, meta_size)) {
- mutex_unlock(&log->io_mutex);
- return;
- }
-
- /* current implementation is one stripe per flush payload */
- io = log->current_io;
- payload = page_address(io->meta_page) + io->meta_offset;
- payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
- payload->header.flags = cpu_to_le16(0);
- payload->size = cpu_to_le32(sizeof(__le64));
- payload->flush_stripes[0] = cpu_to_le64(sect);
- io->meta_offset += meta_size;
- mutex_unlock(&log->io_mutex);
-}
-
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
@@ -1429,7 +1394,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
- R5C_FULL_STRIPE_FLUSH_BATCH(conf))
+ R5C_FULL_STRIPE_FLUSH_BATCH)
/*
* if stripe cache pressure is moderate, or if there are many full
* stripes, flush all full stripes
@@ -1588,8 +1553,6 @@ bool r5l_log_disk_error(struct r5conf *conf)
return ret;
}
-#define R5L_RECOVERY_PAGE_POOL_SIZE 256
-
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
@@ -1598,131 +1561,18 @@ struct r5l_recovery_ctx {
int data_parity_stripes; /* number of data_parity stripes */
int data_only_stripes; /* number of data_only stripes */
struct list_head cached_list;
-
- /*
- * read ahead page pool (ra_pool)
- * in recovery, log is read sequentially. It is not efficient to
- * read every page with sync_page_io(). The read ahead page pool
- * reads multiple pages with one IO, so further log read can
- * just copy data from the pool.
- */
- struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
- sector_t pool_offset; /* offset of first page in the pool */
- int total_pages; /* total allocated pages */
- int valid_pages; /* pages with valid data */
- struct bio *ra_bio; /* bio to do the read ahead */
};
-static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx)
-{
- struct page *page;
-
- ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
- if (!ctx->ra_bio)
- return -ENOMEM;
-
- ctx->valid_pages = 0;
- ctx->total_pages = 0;
- while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
- page = alloc_page(GFP_KERNEL);
-
- if (!page)
- break;
- ctx->ra_pool[ctx->total_pages] = page;
- ctx->total_pages += 1;
- }
-
- if (ctx->total_pages == 0) {
- bio_put(ctx->ra_bio);
- return -ENOMEM;
- }
-
- ctx->pool_offset = 0;
- return 0;
-}
-
-static void r5l_recovery_free_ra_pool(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx)
-{
- int i;
-
- for (i = 0; i < ctx->total_pages; ++i)
- put_page(ctx->ra_pool[i]);
- bio_put(ctx->ra_bio);
-}
-
-/*
- * fetch ctx->valid_pages pages from offset
- * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
- * However, if the offset is close to the end of the journal device,
- * ctx->valid_pages could be smaller than ctx->total_pages
- */
-static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx,
- sector_t offset)
-{
- bio_reset(ctx->ra_bio);
- ctx->ra_bio->bi_bdev = log->rdev->bdev;
- bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
- ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
-
- ctx->valid_pages = 0;
- ctx->pool_offset = offset;
-
- while (ctx->valid_pages < ctx->total_pages) {
- bio_add_page(ctx->ra_bio,
- ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
- ctx->valid_pages += 1;
-
- offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
-
- if (offset == 0) /* reached end of the device */
- break;
- }
-
- return submit_bio_wait(ctx->ra_bio);
-}
-
-/*
- * try read a page from the read ahead page pool, if the page is not in the
- * pool, call r5l_recovery_fetch_ra_pool
- */
-static int r5l_recovery_read_page(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx,
- struct page *page,
- sector_t offset)
-{
- int ret;
-
- if (offset < ctx->pool_offset ||
- offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
- ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
- if (ret)
- return ret;
- }
-
- BUG_ON(offset < ctx->pool_offset ||
- offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
-
- memcpy(page_address(page),
- page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
- BLOCK_SECTOR_SHIFT]),
- PAGE_SIZE);
- return 0;
-}
-
static int r5l_recovery_read_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct page *page = ctx->meta_page;
struct r5l_meta_block *mb;
u32 crc, stored_crc;
- int ret;
- ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
- if (ret != 0)
- return ret;
+ if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
+ false))
+ return -EIO;
mb = page_address(page);
stored_crc = le32_to_cpu(mb->checksum);
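Editorial note, not part of the patch: the helpers removed in the hunk above implemented a read-ahead page pool for journal recovery; because the log is scanned sequentially, one large read fills a pool of pages and later single-page lookups are served from the pool until the requested offset falls outside it. A rough userspace analogue of that idea, with backing_read() as a hypothetical stand-in for the pooled bio submission and tiny page sizes for the demo.

#include <stdio.h>
#include <string.h>

#define POOL_PAGES 4
#define PAGE_SZ 16	/* tiny pages, demo only */

/* Hypothetical backing read: fetch 'count' consecutive pages starting
 * at 'first' in a single call (stands in for one large bio). */
static void backing_read(char dst[][PAGE_SZ], long first, int count)
{
	for (int i = 0; i < count; i++)
		snprintf(dst[i], PAGE_SZ, "page-%ld", first + i);
}

static char pool[POOL_PAGES][PAGE_SZ];
static long pool_first = -1;	/* first page currently cached, -1 = empty */

/* Serve one page, refilling the pool only on a miss. */
static void read_page(char out[PAGE_SZ], long page)
{
	if (pool_first < 0 || page < pool_first ||
	    page >= pool_first + POOL_PAGES) {
		backing_read(pool, page, POOL_PAGES);	/* one big read */
		pool_first = page;
	}
	memcpy(out, pool[page - pool_first], PAGE_SZ);
}

int main(void)
{
	char buf[PAGE_SZ];

	/* a sequential scan of ten pages needs only three backing reads */
	for (long p = 100; p < 110; p++) {
		read_page(buf, p);
		printf("%s\n", buf);
	}
	return 0;
}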
@@ -1804,7 +1654,8 @@ static void r5l_recovery_load_data(struct r5l_log *log,
raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0,
&dd_idx, sh);
- r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
+ sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
sh->dev[dd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
ctx->meta_total_blocks += BLOCK_SECTORS;
@@ -1823,15 +1674,17 @@ static void r5l_recovery_load_parity(struct r5l_log *log,
struct r5conf *conf = mddev->private;
ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
- r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
+ sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
sh->dev[sh->pd_idx].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
if (sh->qd_idx >= 0) {
- r5l_recovery_read_page(
- log, ctx, sh->dev[sh->qd_idx].page,
- r5l_ring_add(log, log_offset, BLOCK_SECTORS));
+ sync_page_io(log->rdev,
+ r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+ PAGE_SIZE, sh->dev[sh->qd_idx].page,
+ REQ_OP_READ, 0, false);
sh->dev[sh->qd_idx].log_checksum =
le32_to_cpu(payload->checksum[1]);
set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
@@ -1962,15 +1815,14 @@ r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
/* if matches return 0; otherwise return -EINVAL */
static int
-r5l_recovery_verify_data_checksum(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx,
- struct page *page,
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
sector_t log_offset, __le32 log_checksum)
{
void *addr;
u32 checksum;
- r5l_recovery_read_page(log, ctx, page, log_offset);
+ sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ page, REQ_OP_READ, 0, false);
addr = kmap_atomic(page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
@@ -1992,7 +1844,6 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
struct page *page;
struct r5l_payload_data_parity *payload;
- struct r5l_payload_flush *payload_flush;
page = alloc_page(GFP_KERNEL);
if (!page)
@@ -2000,42 +1851,33 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
while (mb_offset < le32_to_cpu(mb->meta_size)) {
payload = (void *)mb + mb_offset;
- payload_flush = (void *)mb + mb_offset;
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+ if (payload->header.type == R5LOG_PAYLOAD_DATA) {
if (r5l_recovery_verify_data_checksum(
- log, ctx, page, log_offset,
+ log, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
- } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
+ } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
if (r5l_recovery_verify_data_checksum(
- log, ctx, page, log_offset,
+ log, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
if (conf->max_degraded == 2 && /* q for RAID 6 */
r5l_recovery_verify_data_checksum(
- log, ctx, page,
+ log, page,
r5l_ring_add(log, log_offset,
BLOCK_SECTORS),
payload->checksum[1]) < 0)
goto mismatch;
- } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
- /* nothing to do for R5LOG_PAYLOAD_FLUSH here */
- } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
+ } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
goto mismatch;
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
- mb_offset += sizeof(struct r5l_payload_flush) +
- le32_to_cpu(payload_flush->size);
- } else {
- /* DATA or PARITY payload */
- log_offset = r5l_ring_add(log, log_offset,
- le32_to_cpu(payload->size));
- mb_offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
- }
+ log_offset = r5l_ring_add(log, log_offset,
+ le32_to_cpu(payload->size));
+ mb_offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
put_page(page);
@@ -2063,7 +1905,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
struct r5conf *conf = mddev->private;
struct r5l_meta_block *mb;
struct r5l_payload_data_parity *payload;
- struct r5l_payload_flush *payload_flush;
int mb_offset;
sector_t log_offset;
sector_t stripe_sect;
@@ -2089,31 +1930,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
int dd;
payload = (void *)mb + mb_offset;
- payload_flush = (void *)mb + mb_offset;
-
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
- int i, count;
-
- count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
- for (i = 0; i < count; ++i) {
- stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
- sh = r5c_recovery_lookup_stripe(cached_stripe_list,
- stripe_sect);
- if (sh) {
- WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
- r5l_recovery_reset_stripe(sh);
- list_del_init(&sh->lru);
- raid5_release_stripe(sh);
- }
- }
-
- mb_offset += sizeof(struct r5l_payload_flush) +
- le32_to_cpu(payload_flush->size);
- continue;
- }
-
- /* DATA or PARITY payload */
- stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
+ stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
NULL)
@@ -2151,7 +1968,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
list_add_tail(&sh->lru, cached_stripe_list);
}
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+ if (payload->header.type == R5LOG_PAYLOAD_DATA) {
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
@@ -2159,7 +1976,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
r5l_recovery_load_data(log, sh, ctx, payload,
log_offset);
- } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
+ } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
r5l_recovery_load_parity(log, sh, ctx, payload,
log_offset);
else
@@ -2361,7 +2178,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload = (void *)mb + offset;
payload->header.type = cpu_to_le16(
R5LOG_PAYLOAD_DATA);
- payload->size = cpu_to_le32(BLOCK_SECTORS);
+ payload->size = BLOCK_SECTORS;
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
addr = kmap_atomic(dev->page);
@@ -2425,70 +2242,55 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
static int r5l_recovery_log(struct r5l_log *log)
{
struct mddev *mddev = log->rdev->mddev;
- struct r5l_recovery_ctx *ctx;
+ struct r5l_recovery_ctx ctx;
int ret;
sector_t pos;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->pos = log->last_checkpoint;
- ctx->seq = log->last_cp_seq;
- INIT_LIST_HEAD(&ctx->cached_list);
- ctx->meta_page = alloc_page(GFP_KERNEL);
+ ctx.pos = log->last_checkpoint;
+ ctx.seq = log->last_cp_seq;
+ ctx.meta_page = alloc_page(GFP_KERNEL);
+ ctx.data_only_stripes = 0;
+ ctx.data_parity_stripes = 0;
+ INIT_LIST_HEAD(&ctx.cached_list);
- if (!ctx->meta_page) {
- ret = -ENOMEM;
- goto meta_page;
- }
-
- if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
- ret = -ENOMEM;
- goto ra_pool;
- }
+ if (!ctx.meta_page)
+ return -ENOMEM;
- ret = r5c_recovery_flush_log(log, ctx);
+ ret = r5c_recovery_flush_log(log, &ctx);
+ __free_page(ctx.meta_page);
if (ret)
- goto error;
+ return ret;
+
+ pos = ctx.pos;
+ ctx.seq += 10000;
- pos = ctx->pos;
- ctx->seq += 10000;
- if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
+ if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
else
pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
- mdname(mddev), ctx->data_only_stripes,
- ctx->data_parity_stripes);
-
- if (ctx->data_only_stripes == 0) {
- log->next_checkpoint = ctx->pos;
- r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
- ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
- } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
+ mdname(mddev), ctx.data_only_stripes,
+ ctx.data_parity_stripes);
+
+ if (ctx.data_only_stripes == 0) {
+ log->next_checkpoint = ctx.pos;
+ r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+ ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+ } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
mdname(mddev));
- ret = -EIO;
- goto error;
+ return -EIO;
}
- log->log_start = ctx->pos;
- log->seq = ctx->seq;
+ log->log_start = ctx.pos;
+ log->seq = ctx.seq;
log->last_checkpoint = pos;
r5l_write_super(log, pos);
- r5c_recovery_flush_data_only_stripes(log, ctx);
- ret = 0;
-error:
- r5l_recovery_free_ra_pool(log, ctx);
-ra_pool:
- __free_page(ctx->meta_page);
-meta_page:
- kfree(ctx);
- return ret;
+ r5c_recovery_flush_data_only_stripes(log, &ctx);
+ return 0;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
@@ -2817,11 +2619,11 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
atomic_dec(&conf->r5c_flushing_full_stripes);
atomic_dec(&conf->r5c_cached_full_stripes);
}
-
- r5l_append_flush_payload(log, sh->sector);
}
-int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
+int
+r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+ struct stripe_head_state *s)
{
struct r5conf *conf = sh->raid_conf;
int pages = 0;
@@ -3090,13 +2892,8 @@ io_kc:
return -EINVAL;
}
-void r5l_exit_log(struct r5conf *conf)
+void r5l_exit_log(struct r5l_log *log)
{
- struct r5l_log *log = conf->log;
-
- conf->log = NULL;
- synchronize_rcu();
-
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 20f1f7c8c57b..724b90cac518 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -2,7 +2,7 @@
#define _RAID5_LOG_H
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5conf *conf);
+extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
@@ -21,8 +21,9 @@ extern void r5c_release_extra_page(struct stripe_head *sh);
extern void r5c_use_extra_page(struct stripe_head *sh);
extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+ struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+ struct stripe_head_state *s);
extern void r5c_make_stripe_write_out(struct stripe_head *sh);
extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
@@ -33,7 +34,7 @@ extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
extern struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx);
+ struct dma_async_tx_descriptor *tx);
extern int ppl_init_log(struct r5conf *conf);
extern void ppl_exit_log(struct r5conf *conf);
extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
@@ -50,19 +51,10 @@ static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log) {
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return 0;
- return r5l_write_stripe(conf->log, sh);
- } else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- /* caching phase */
- return r5c_cache_data(conf->log, sh);
- }
- } else if (raid5_has_ppl(conf)) {
+ if (conf->log)
+ return r5l_write_stripe(conf->log, sh);
+ else if (raid5_has_ppl(conf))
return ppl_write_stripe(conf, sh);
- }
return -EAGAIN;
}
@@ -88,17 +80,16 @@ static inline void log_write_stripe_run(struct r5conf *conf)
static inline void log_exit(struct r5conf *conf)
{
if (conf->log)
- r5l_exit_log(conf);
+ r5l_exit_log(conf->log);
else if (raid5_has_ppl(conf))
ppl_exit_log(conf);
}
-static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev,
- bool ppl)
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
{
if (journal_dev)
return r5l_init_log(conf, journal_dev);
- else if (ppl)
+ else if (raid5_has_ppl(conf))
return ppl_init_log(conf);
return 0;
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 5d25bebf3328..27bad3e2d7ce 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -102,14 +102,11 @@ struct ppl_conf {
struct kmem_cache *io_kc;
mempool_t *io_pool;
struct bio_set *bs;
+ mempool_t *meta_pool;
/* used only for recovery */
int recovered_entries;
int mismatch_count;
-
- /* stripes to retry if failed to allocate io_unit */
- struct list_head no_mem_stripes;
- spinlock_t no_mem_stripes_lock;
};
struct ppl_log {
@@ -122,6 +119,8 @@ struct ppl_log {
* always at the end of io_list */
spinlock_t io_list_lock;
struct list_head io_list; /* all io_units of this log */
+ struct list_head no_mem_stripes;/* stripes to retry if failed to
+ * allocate io_unit */
};
#define PPL_IO_INLINE_BVECS 32
@@ -152,7 +151,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
- struct page **srcs = flex_array_get(percpu->scribble, 0);
+ struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
@@ -165,18 +164,18 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
* differently.
*/
if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
- /*
- * rmw: xor old data and parity from updated disks
- * This is calculated earlier by ops_run_prexor5() so just copy
- * the parity dev page.
- */
- srcs[count++] = sh->dev[pd_idx].page;
+ /* rmw: xor old data and parity from updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+ xor_srcs[count++] = dev->page;
+ }
} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
/* rcw: xor data from all not updated disks */
for (i = disks; i--;) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_UPTODATE, &dev->flags))
- srcs[count++] = dev->page;
+ xor_srcs[count++] = dev->page;
}
} else {
return tx;
@@ -187,64 +186,34 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ sizeof(struct page *) * (sh->disks + 2));
if (count == 1)
- tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
+ tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
&submit);
else
- tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
+ tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
&submit);
return tx;
}
-static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
-{
- struct kmem_cache *kc = pool_data;
- struct ppl_io_unit *io;
-
- io = kmem_cache_alloc(kc, gfp_mask);
- if (!io)
- return NULL;
-
- io->header_page = alloc_page(gfp_mask);
- if (!io->header_page) {
- kmem_cache_free(kc, io);
- return NULL;
- }
-
- return io;
-}
-
-static void ppl_io_pool_free(void *element, void *pool_data)
-{
- struct kmem_cache *kc = pool_data;
- struct ppl_io_unit *io = element;
-
- __free_page(io->header_page);
- kmem_cache_free(kc, io);
-}
-
static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
struct stripe_head *sh)
{
struct ppl_conf *ppl_conf = log->ppl_conf;
struct ppl_io_unit *io;
struct ppl_header *pplhdr;
- struct page *header_page;
- io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
+ io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
if (!io)
return NULL;
- header_page = io->header_page;
memset(io, 0, sizeof(*io));
- io->header_page = header_page;
-
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
atomic_set(&io->pending_stripes, 0);
bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
+ io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
pplhdr = page_address(io->header_page);
clear_page(pplhdr);
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
@@ -357,7 +326,7 @@ int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
struct ppl_io_unit *io = sh->ppl_io;
struct ppl_log *log;
- if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
+ if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
!test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
!test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
@@ -378,9 +347,9 @@ int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
atomic_inc(&sh->count);
if (ppl_log_stripe(log, sh)) {
- spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
- list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
- spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&sh->log_list, &log->no_mem_stripes);
+ spin_unlock_irq(&log->io_list_lock);
}
mutex_unlock(&log->io_mutex);
@@ -400,6 +369,8 @@ static void ppl_log_endio(struct bio *bio)
if (bio->bi_error)
md_error(ppl_conf->mddev, log->rdev);
+ mempool_free(io->header_page, ppl_conf->meta_pool);
+
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
list_del_init(&sh->log_list);
@@ -521,32 +492,25 @@ void ppl_write_stripe_run(struct r5conf *conf)
static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
struct ppl_log *log = io->log;
- struct ppl_conf *ppl_conf = log->ppl_conf;
unsigned long flags;
pr_debug("%s: seq: %llu\n", __func__, io->seq);
- local_irq_save(flags);
+ spin_lock_irqsave(&log->io_list_lock, flags);
- spin_lock(&log->io_list_lock);
list_del(&io->log_sibling);
- spin_unlock(&log->io_list_lock);
-
- mempool_free(io, ppl_conf->io_pool);
+ mempool_free(io, log->ppl_conf->io_pool);
- spin_lock(&ppl_conf->no_mem_stripes_lock);
- if (!list_empty(&ppl_conf->no_mem_stripes)) {
- struct stripe_head *sh;
-
- sh = list_first_entry(&ppl_conf->no_mem_stripes,
- struct stripe_head, log_list);
+ if (!list_empty(&log->no_mem_stripes)) {
+ struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
+ struct stripe_head,
+ log_list);
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
- spin_unlock(&ppl_conf->no_mem_stripes_lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
}
void ppl_stripe_write_finished(struct stripe_head *sh)
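Editorial note, not part of the patch: in both the removed and the restored PPL code, a stripe whose io_unit cannot be allocated is parked on a no_mem_stripes list and one parked stripe is resubmitted whenever ppl_io_unit_finished() returns an io_unit to the pool; the revert only moves the list (and its locking) from ppl_conf back into ppl_log. A toy sketch of that defer-and-retry pattern, using invented counters in place of the real lists and mempools.

#include <stdio.h>

#define POOL_SIZE 2

static int free_units = POOL_SIZE;	/* io_units left in the pool */
static int deferred;			/* stripes parked on no_mem_stripes */

/* Try to start a stripe; park it when no io_unit is available. */
static void submit_stripe(int id)
{
	if (free_units == 0) {
		deferred++;
		printf("stripe %d deferred\n", id);
		return;
	}
	free_units--;
	printf("stripe %d submitted\n", id);
}

/* Called when an io_unit completes: free it and kick one waiter. */
static void io_unit_finished(void)
{
	free_units++;
	if (deferred) {
		deferred--;
		submit_stripe(-1);	/* re-handle one parked stripe */
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		submit_stripe(i);
	io_unit_finished();
	io_unit_finished();
	return 0;
}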
@@ -1034,6 +998,7 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
kfree(ppl_conf->child_logs);
+ mempool_destroy(ppl_conf->meta_pool);
if (ppl_conf->bs)
bioset_free(ppl_conf->bs);
mempool_destroy(ppl_conf->io_pool);
@@ -1105,7 +1070,7 @@ int ppl_init_log(struct r5conf *conf)
struct mddev *mddev = conf->mddev;
int ret = 0;
int i;
- bool need_cache_flush = false;
+ bool need_cache_flush;
pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
mdname(conf->mddev));
@@ -1139,20 +1104,25 @@ int ppl_init_log(struct r5conf *conf)
ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
if (!ppl_conf->io_kc) {
- ret = -ENOMEM;
+ ret = -EINVAL;
goto err;
}
- ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
- ppl_io_pool_free, ppl_conf->io_kc);
+ ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
if (!ppl_conf->io_pool) {
- ret = -ENOMEM;
+ ret = -EINVAL;
goto err;
}
ppl_conf->bs = bioset_create(conf->raid_disks, 0);
if (!ppl_conf->bs) {
- ret = -ENOMEM;
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+ if (!ppl_conf->meta_pool) {
+ ret = -EINVAL;
goto err;
}
@@ -1165,8 +1135,6 @@ int ppl_init_log(struct r5conf *conf)
}
atomic64_set(&ppl_conf->seq, 0);
- INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
- spin_lock_init(&ppl_conf->no_mem_stripes_lock);
if (!mddev->external) {
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
@@ -1182,6 +1150,7 @@ int ppl_init_log(struct r5conf *conf)
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
INIT_LIST_HEAD(&log->io_list);
+ INIT_LIST_HEAD(&log->no_mem_stripes);
log->ppl_conf = ppl_conf;
log->rdev = rdev;
@@ -1225,7 +1194,6 @@ int ppl_init_log(struct r5conf *conf)
}
conf->log_private = ppl_conf;
- set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
return 0;
err:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7dd8fb00dbc0..96a69ff0f756 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -56,7 +56,6 @@
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <trace/events/block.h>
-#include <linux/list_sort.h>
#include "md.h"
#include "raid5.h"
@@ -156,6 +155,17 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
+static void return_io(struct bio_list *return_bi)
+{
+ struct bio *bi;
+ while ((bi = bio_list_pop(return_bi)) != NULL) {
+ bi->bi_iter.bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
+ bio_endio(bi);
+ }
+}
+
static void print_raid5_conf (struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
@@ -165,13 +175,6 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
-static bool stripe_is_lowprio(struct stripe_head *sh)
-{
- return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
- test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
- !test_bit(STRIPE_R5C_CACHING, &sh->state);
-}
-
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
@@ -187,10 +190,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
if (list_empty(&sh->lru)) {
struct r5worker_group *group;
group = conf->worker_groups + cpu_to_group(cpu);
- if (stripe_is_lowprio(sh))
- list_add_tail(&sh->lru, &group->loprio_list);
- else
- list_add_tail(&sh->lru, &group->handle_list);
+ list_add_tail(&sh->lru, &group->handle_list);
group->stripes_cnt++;
sh->group = group;
}
@@ -253,12 +253,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
if (conf->worker_cnt_per_group == 0) {
- if (stripe_is_lowprio(sh))
- list_add_tail(&sh->lru,
- &conf->loprio_list);
- else
- list_add_tail(&sh->lru,
- &conf->handle_list);
+ list_add_tail(&sh->lru, &conf->handle_list);
} else {
raid5_wakeup_stripe_thread(sh);
return;
@@ -359,15 +354,17 @@ static void release_inactive_stripe_list(struct r5conf *conf,
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
{
- struct stripe_head *sh, *t;
+ struct stripe_head *sh;
int count = 0;
struct llist_node *head;
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
- llist_for_each_entry_safe(sh, t, head, release_list) {
+ while (head) {
int hash;
+ sh = llist_entry(head, struct stripe_head, release_list);
+ head = llist_next(head);
 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
smp_mb();
clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
@@ -469,6 +466,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+
+ if (sh->ppl_page) {
+ put_page(sh->ppl_page);
+ sh->ppl_page = NULL;
+ }
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -486,6 +488,12 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].orig_page = page;
}
+ if (raid5_has_ppl(sh->raid_conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page)
+ return 1;
+ }
+
return 0;
}
@@ -868,107 +876,41 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
return 1;
}
-static void dispatch_bio_list(struct bio_list *tmp)
-{
- struct bio *bio;
-
- while ((bio = bio_list_pop(tmp)))
- generic_make_request(bio);
-}
-
-static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
-{
- const struct r5pending_data *da = list_entry(a,
- struct r5pending_data, sibling);
- const struct r5pending_data *db = list_entry(b,
- struct r5pending_data, sibling);
- if (da->sector > db->sector)
- return 1;
- if (da->sector < db->sector)
- return -1;
- return 0;
-}
-
-static void dispatch_defer_bios(struct r5conf *conf, int target,
- struct bio_list *list)
-{
- struct r5pending_data *data;
- struct list_head *first, *next = NULL;
- int cnt = 0;
-
- if (conf->pending_data_cnt == 0)
- return;
-
- list_sort(NULL, &conf->pending_list, cmp_stripe);
-
- first = conf->pending_list.next;
-
- /* temporarily move the head */
- if (conf->next_pending_data)
- list_move_tail(&conf->pending_list,
- &conf->next_pending_data->sibling);
-
- while (!list_empty(&conf->pending_list)) {
- data = list_first_entry(&conf->pending_list,
- struct r5pending_data, sibling);
- if (&data->sibling == first)
- first = data->sibling.next;
- next = data->sibling.next;
-
- bio_list_merge(list, &data->bios);
- list_move(&data->sibling, &conf->free_list);
- cnt++;
- if (cnt >= target)
- break;
- }
- conf->pending_data_cnt -= cnt;
- BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
-
- if (next != &conf->pending_list)
- conf->next_pending_data = list_entry(next,
- struct r5pending_data, sibling);
- else
- conf->next_pending_data = NULL;
- /* list isn't empty */
- if (first != &conf->pending_list)
- list_move_tail(&conf->pending_list, first);
-}
-
static void flush_deferred_bios(struct r5conf *conf)
{
- struct bio_list tmp = BIO_EMPTY_LIST;
+ struct bio_list tmp;
+ struct bio *bio;
- if (conf->pending_data_cnt == 0)
+ if (!conf->batch_bio_dispatch || !conf->group_cnt)
return;
+ bio_list_init(&tmp);
spin_lock(&conf->pending_bios_lock);
- dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
- BUG_ON(conf->pending_data_cnt != 0);
+ bio_list_merge(&tmp, &conf->pending_bios);
+ bio_list_init(&conf->pending_bios);
spin_unlock(&conf->pending_bios_lock);
- dispatch_bio_list(&tmp);
+ while ((bio = bio_list_pop(&tmp)))
+ generic_make_request(bio);
}
-static void defer_issue_bios(struct r5conf *conf, sector_t sector,
- struct bio_list *bios)
+static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
{
- struct bio_list tmp = BIO_EMPTY_LIST;
- struct r5pending_data *ent;
-
+ /*
+ * Changing group_cnt drains all pending bios first, so this is safe.
+ *
+ * A read here generally means a read-modify-write, which usually
+ * implies a random write, so we don't delay it.
+ */
+ if (!conf->batch_bio_dispatch || !conf->group_cnt ||
+ bio_op(bio) == REQ_OP_READ) {
+ generic_make_request(bio);
+ return;
+ }
spin_lock(&conf->pending_bios_lock);
- ent = list_first_entry(&conf->free_list, struct r5pending_data,
- sibling);
- list_move_tail(&ent->sibling, &conf->pending_list);
- ent->sector = sector;
- bio_list_init(&ent->bios);
- bio_list_merge(&ent->bios, bios);
- conf->pending_data_cnt++;
- if (conf->pending_data_cnt >= PENDING_IO_MAX)
- dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
-
+ bio_list_add(&conf->pending_bios, bio);
spin_unlock(&conf->pending_bios_lock);
-
- dispatch_bio_list(&tmp);
+ md_wakeup_thread(conf->mddev->thread);
}
static void
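The two restored helpers above implement a simple batching scheme: writers append bios to the single conf->pending_bios list under pending_bios_lock and wake the md thread, which later splices the whole list out in one short critical section and submits the bios with the lock dropped. A minimal stand-alone sketch of that splice-then-dispatch idea follows; the struct req type, the defer_issue()/flush_deferred() names and the printf dispatch are illustrative stand-ins, not the kernel bio_list API.

/* Illustrative only: models the splice-then-dispatch pattern used by
 * flush_deferred_bios()/defer_bio_issue() above, with a plain FIFO list
 * and a pthread mutex standing in for bio_list and the spinlock. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
    int id;
    struct req *next;
};

static struct req *pending, *pending_tail;   /* models conf->pending_bios */
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

static void defer_issue(struct req *r)       /* models defer_bio_issue() */
{
    pthread_mutex_lock(&pending_lock);
    r->next = NULL;
    if (pending_tail)
        pending_tail->next = r;
    else
        pending = r;
    pending_tail = r;
    pthread_mutex_unlock(&pending_lock);
    /* a real caller would now wake the worker thread */
}

static void flush_deferred(void)             /* models flush_deferred_bios() */
{
    struct req *batch;

    /* take the whole list in one short critical section */
    pthread_mutex_lock(&pending_lock);
    batch = pending;
    pending = NULL;
    pending_tail = NULL;
    pthread_mutex_unlock(&pending_lock);

    /* submit outside the lock */
    while (batch) {
        struct req *next = batch->next;

        printf("dispatching request %d\n", batch->id);
        free(batch);
        batch = next;
    }
}

int main(void)
{
    for (int i = 0; i < 4; i++) {
        struct req *r = malloc(sizeof(*r));

        r->id = i;
        defer_issue(r);
    }
    flush_deferred();
    return 0;
}
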
@@ -981,15 +923,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
struct stripe_head *head_sh = sh;
- struct bio_list pending_bios = BIO_EMPTY_LIST;
- bool should_defer;
might_sleep();
- if (log_stripe(sh, s) == 0)
- return;
-
- should_defer = conf->batch_bio_dispatch && conf->group_cnt;
+ if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+ /* writing out phase */
+ if (s->waiting_extra_page)
+ return;
+ if (r5l_write_stripe(conf->log, sh) == 0)
+ return;
+ } else { /* caching phase */
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+ r5c_cache_data(conf->log, sh, s);
+ return;
+ }
+ }
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -1145,10 +1093,7 @@ again:
trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
bi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- if (should_defer && op_is_write(op))
- bio_list_add(&pending_bios, bi);
- else
- generic_make_request(bi);
+ defer_bio_issue(conf, bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
@@ -1193,10 +1138,7 @@ again:
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- if (should_defer && op_is_write(op))
- bio_list_add(&pending_bios, rbi);
- else
- generic_make_request(rbi);
+ defer_bio_issue(conf, rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1214,9 +1156,6 @@ again:
if (sh != head_sh)
goto again;
}
-
- if (should_defer && !bio_list_empty(&pending_bios))
- defer_issue_bios(conf, head_sh->sector, &pending_bios);
}
static struct dma_async_tx_descriptor *
@@ -1286,6 +1225,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
+ struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1309,13 +1249,16 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- bio_endio(rbi);
+ if (!raid5_dec_bi_active_stripes(rbi))
+ bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
+ return_io(&return_bi);
+
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
@@ -1471,8 +1414,7 @@ static int set_syndrome_sources(struct page **srcs,
(test_bit(R5_Wantdrain, &dev->flags) ||
test_bit(R5_InJournal, &dev->flags))) ||
(srctype == SYNDROME_SRC_WRITTEN &&
- (dev->written ||
- test_bit(R5_InJournal, &dev->flags)))) {
+ dev->written)) {
if (test_bit(R5_InJournal, &dev->flags))
srcs[slot] = sh->dev[i].orig_page;
else
@@ -2077,6 +2019,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
async_tx_ack(tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx);
@@ -2084,9 +2029,6 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
tx = ops_run_prexor6(sh, percpu, tx);
}
- if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
- tx = ops_run_partial_parity(sh, percpu, tx);
-
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
@@ -2119,15 +2061,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
-{
- if (sh->ppl_page)
- __free_page(sh->ppl_page);
- kmem_cache_free(sc, sh);
-}
-
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
- int disks, struct r5conf *conf)
+ int disks)
{
struct stripe_head *sh;
int i;
@@ -2141,7 +2076,6 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
INIT_LIST_HEAD(&sh->r5c);
INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
- sh->raid_conf = conf;
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -2149,14 +2083,6 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
bio_init(&dev->req, &dev->vec, 1);
bio_init(&dev->rreq, &dev->rvec, 1);
}
-
- if (raid5_has_ppl(conf)) {
- sh->ppl_page = alloc_page(gfp);
- if (!sh->ppl_page) {
- free_stripe(sc, sh);
- sh = NULL;
- }
- }
}
return sh;
}
@@ -2164,13 +2090,15 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
if (!sh)
return 0;
+ sh->raid_conf = conf;
+
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
- free_stripe(conf->slab_cache, sh);
+ kmem_cache_free(conf->slab_cache, sh);
return 0;
}
sh->hash_lock_index =
@@ -2297,7 +2225,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* pages have been transferred over, and the old kmem_cache is
* freed when all stripes are done.
 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
- * we simple return a failure status - no need to clean anything up.
+ * we simply return a failure status - no need to clean anything up.
* 4/ allocate new pages for the new slots in the new stripe_heads.
 * If this fails, we don't bother trying to shrink the
* stripe_heads down again, we just leave them as they are.
@@ -2315,6 +2243,9 @@ static int resize_stripes(struct r5conf *conf, int newsize)
int i;
int hash, cnt;
+ if (newsize <= conf->pool_size)
+ return 0; /* never bother to shrink */
+
err = md_allow_write(conf->mddev);
if (err)
return err;
@@ -2330,10 +2261,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
if (!nsh)
break;
+ nsh->raid_conf = conf;
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2341,7 +2273,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
while (!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del(&nsh->lru);
- free_stripe(sc, nsh);
+ kmem_cache_free(sc, nsh);
}
kmem_cache_destroy(sc);
mutex_unlock(&conf->cache_size_mutex);
@@ -2367,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh->dev[i].orig_page = osh->dev[i].page;
}
nsh->hash_lock_index = hash;
- free_stripe(conf->slab_cache, osh);
+ kmem_cache_free(conf->slab_cache, osh);
cnt++;
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2406,10 +2338,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
-
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
-
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2427,6 +2355,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -2444,7 +2374,7 @@ static int drop_one_stripe(struct r5conf *conf)
return 0;
BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh);
- free_stripe(conf->slab_cache, sh);
+ kmem_cache_free(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
return 1;
@@ -3167,7 +3097,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
- if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+ if (raid5_has_ppl(sh->raid_conf) &&
test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
!test_bit(STRIPE_FULL_WRITE, &sh->state) &&
test_bit(R5_Insync, &sh->dev[pd_idx].flags))
@@ -3194,6 +3124,14 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
+ /*
+ * If several bios share a stripe, bi_phys_segments acts as a
+ * reference count to avoid races. The reference count should already
+ * have been increased before this function is called (for example, in
+ * raid5_make_request()), so another bio sharing this stripe cannot
+ * free the stripe underneath us. If a stripe is used by only one bio,
+ * the stripe lock protects it.
+ */
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
@@ -3249,8 +3187,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- bio_inc_remaining(bi);
- md_write_inc(conf->mddev, bi);
+ raid5_inc_bi_active_stripes(bi);
if (forwrite) {
/* check if page is covered */
@@ -3327,7 +3264,8 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks)
+ struct stripe_head_state *s, int disks,
+ struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
@@ -3373,8 +3311,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- md_write_end(conf->mddev);
- bio_endio(bi);
+ if (!raid5_dec_bi_active_stripes(bi)) {
+ md_write_end(conf->mddev);
+ bio_list_add(return_bi, bi);
+ }
bi = nextbi;
}
if (bitmap_end)
@@ -3395,8 +3335,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- md_write_end(conf->mddev);
- bio_endio(bi);
+ if (!raid5_dec_bi_active_stripes(bi)) {
+ md_write_end(conf->mddev);
+ bio_list_add(return_bi, bi);
+ }
bi = bi2;
}
@@ -3421,7 +3363,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
- bio_endio(bi);
+ if (!raid5_dec_bi_active_stripes(bi))
+ bio_list_add(return_bi, bi);
bi = nextbi;
}
}
@@ -3557,7 +3500,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 /* Pre-reads are not permitted until after a short delay
* to gather multiple requests. However if this
- * device is no Insync, the block could only be computed
+ * device is no Insync, the block could only be computed
* and there is no need to delay that.
*/
return 0;
@@ -3576,7 +3519,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
- * because parity cannot be trusted and we are currently
+ * because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
@@ -3616,20 +3559,9 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
-
- /*
- * In the raid6 case if the only non-uptodate disk is P
- * then we already trusted P to compute the other failed
- * drives. It is safe to compute rather than re-read P.
- * In other cases we only compute blocks from failed
- * devices, otherwise check/repair might fail to detect
- * a real inconsistency.
- */
-
if ((s->uptodate == disks - 1) &&
- ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1])))) {
+ disk_idx == s->failed_num[1]))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -3731,7 +3663,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
@@ -3763,8 +3695,10 @@ returnbi:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- md_write_end(conf->mddev);
- bio_endio(wbi);
+ if (!raid5_dec_bi_active_stripes(wbi)) {
+ md_write_end(conf->mddev);
+ bio_list_add(return_bi, wbi);
+ }
wbi = wbi2;
}
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -4673,8 +4607,7 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
- if (s.handle_bad_blocks ||
- test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
+ if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -4707,7 +4640,7 @@ static void handle_stripe(struct stripe_head *sh)
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks);
+ handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -4773,12 +4706,14 @@ static void handle_stripe(struct stripe_head *sh)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks);
+ handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
- if (s.just_cached)
- r5c_handle_cached_data_endio(conf, sh, disks);
log_stripe_write_finished(sh);
+ if (s.just_cached)
+ r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+ r5l_stripe_write_finished(sh);
+
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
@@ -5004,6 +4939,16 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
+ if (!bio_list_empty(&s.return_bi)) {
+ if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->return_bi, &s.return_bi);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ return_io(&s.return_bi);
+ }
+
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
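The block added at the end of handle_stripe() defers bio completion while a superblock update is pending: finished bios are parked on conf->return_bi, and raid5d (see the hunk further down) ends them once MD_SB_CHANGE_PENDING clears, re-checking the flag under the lock before draining. A stand-alone sketch of that complete-now-or-park decision and the locked re-check; the flag, the array and the function names are illustrative, not kernel interfaces.

/* Illustrative only: models the "park completions while a flag is set,
 * re-check under the lock before draining" pattern restored above. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_PARKED 16

static bool change_pending;                 /* models MD_SB_CHANGE_PENDING */
static int parked[MAX_PARKED], nparked;     /* models conf->return_bi */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void complete(int id)
{
    printf("completing request %d\n", id);
}

/* handle_stripe() side: complete now, or park while metadata is in flux */
static void complete_or_park(int id)
{
    if (change_pending) {
        pthread_mutex_lock(&lock);
        parked[nparked++] = id;
        pthread_mutex_unlock(&lock);
    } else {
        complete(id);
    }
}

/* raid5d() side: cheap unlocked check first, then re-check under the
 * lock so nothing is drained while the flag is still set */
static void drain_parked(void)
{
    int local[MAX_PARKED], n = 0;

    if (nparked == 0 || change_pending)
        return;
    pthread_mutex_lock(&lock);
    if (!change_pending) {
        n = nparked;
        for (int i = 0; i < n; i++)
            local[i] = parked[i];
        nparked = 0;
    }
    pthread_mutex_unlock(&lock);
    for (int i = 0; i < n; i++)
        complete(local[i]);
}

int main(void)
{
    change_pending = true;
    complete_or_park(1);            /* parked */
    complete_or_park(2);            /* parked */
    change_pending = false;
    complete_or_park(3);            /* completed immediately */
    drain_parked();                 /* now completes 1 and 2 */
    return 0;
}
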
@@ -5092,14 +5037,12 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
md_wakeup_thread(conf->mddev->thread);
}
-static struct bio *remove_bio_from_retry(struct r5conf *conf,
- unsigned int *offset)
+static struct bio *remove_bio_from_retry(struct r5conf *conf)
{
struct bio *bi;
bi = conf->retry_read_aligned;
if (bi) {
- *offset = conf->retry_read_offset;
conf->retry_read_aligned = NULL;
return bi;
}
@@ -5107,7 +5050,11 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
- *offset = 0;
+ /*
+ * this sets the active stripe count to 1 and the processed
+ * stripe count (upper 16 bits) to zero
+ */
+ raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
}
return bi;
@@ -5163,9 +5110,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
/*
- * use bio_clone_fast to make a copy of the bio
+ * use bio_clone_mddev to make a copy of the bio
*/
- align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
+ align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
if (!align_bi)
return 0;
/*
@@ -5244,20 +5191,24 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
- if (sectors < bio_sectors(raid_bio)) {
- struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
- bio_chain(split, raid_bio);
- generic_make_request(raid_bio);
- raid_bio = split;
- }
+ do {
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+ if (sectors < bio_sectors(raid_bio)) {
+ split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, raid_bio);
+ } else
+ split = raid_bio;
- if (!raid5_read_one_chunk(mddev, raid_bio))
- return raid_bio;
+ if (!raid5_read_one_chunk(mddev, split)) {
+ if (split != raid_bio)
+ generic_make_request(raid_bio);
+ return split;
+ }
+ } while (split != raid_bio);
return NULL;
}
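The restored chunk_aligned_read() loop splits a read at every chunk boundary and hands each piece to raid5_read_one_chunk(); the key step is sectors = chunk_sects - (sector & (chunk_sects - 1)), the distance to the next boundary, which works because chunk_sects is a power of two. A small stand-alone sketch of just that splitting walk, with made-up sector numbers and no bio structures:

/* Illustrative only: walks a request across chunk boundaries the way
 * the restored chunk_aligned_read() loop does. chunk_sects must be a
 * power of two, as MD chunk sizes are. */
#include <stdio.h>

int main(void)
{
    unsigned long long sector = 1000;   /* hypothetical start sector */
    unsigned int remaining = 300;       /* hypothetical bio length, in sectors */
    unsigned int chunk_sects = 128;     /* hypothetical chunk size, in sectors */

    while (remaining) {
        /* sectors left before the next chunk boundary */
        unsigned int sectors = chunk_sects - (sector & (chunk_sects - 1));

        if (sectors > remaining)
            sectors = remaining;        /* last, possibly short, piece */

        printf("submit %u sectors at %llu\n", sectors, sector);
        sector += sectors;
        remaining -= sectors;
    }
    return 0;
}
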
@@ -5274,27 +5225,19 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh, *tmp;
+ struct stripe_head *sh = NULL, *tmp;
struct list_head *handle_list = NULL;
- struct r5worker_group *wg;
- bool second_try = !r5c_is_writeback(conf->log);
- bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+ struct r5worker_group *wg = NULL;
-again:
- wg = NULL;
- sh = NULL;
if (conf->worker_cnt_per_group == 0) {
- handle_list = try_loprio ? &conf->loprio_list :
- &conf->handle_list;
+ handle_list = &conf->handle_list;
} else if (group != ANY_GROUP) {
- handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
- &conf->worker_groups[group].handle_list;
+ handle_list = &conf->worker_groups[group].handle_list;
wg = &conf->worker_groups[group];
} else {
int i;
for (i = 0; i < conf->group_cnt; i++) {
- handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
- &conf->worker_groups[i].handle_list;
+ handle_list = &conf->worker_groups[i].handle_list;
wg = &conf->worker_groups[i];
if (!list_empty(handle_list))
break;
@@ -5345,13 +5288,8 @@ again:
wg = NULL;
}
- if (!sh) {
- if (second_try)
- return NULL;
- second_try = true;
- try_loprio = !try_loprio;
- goto again;
- }
+ if (!sh)
+ return NULL;
if (wg) {
wg->stripes_cnt--;
@@ -5440,6 +5378,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
struct r5conf *conf = mddev->private;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ int remaining;
int stripe_sectors;
if (mddev->reshape_position != MaxSector)
@@ -5450,7 +5389,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- md_write_start(mddev, bi);
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5496,8 +5435,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
- bio_inc_remaining(bi);
- md_write_inc(mddev, bi);
+ raid5_inc_bi_active_stripes(bi);
sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
@@ -5520,8 +5458,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- md_write_end(mddev);
- bio_endio(bi);
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+ md_write_end(mddev);
+ bio_endio(bi);
+ }
}
static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5532,6 +5473,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
+ int remaining;
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
@@ -5553,6 +5495,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
+ md_write_start(mddev, bi);
+
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
@@ -5573,7 +5517,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- md_write_start(mddev, bi);
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5708,9 +5652,16 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
}
finish_wait(&conf->wait_for_overlap, &w);
- if (rw == WRITE)
- md_write_end(mddev);
- bio_endio(bi);
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+
+ if (rw == WRITE)
+ md_write_end(mddev);
+
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
+ bio_endio(bi);
+ }
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -6059,8 +6010,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return STRIPE_SECTORS;
}
-static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
- unsigned int offset)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
{
/* We may not be able to submit a whole bio at once as there
* may not be enough stripe_heads available.
@@ -6076,6 +6026,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
+ int remaining;
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
@@ -6089,7 +6040,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < offset)
+ if (scnt < raid5_bi_processed_stripes(raid_bio))
/* already done this stripe */
continue;
@@ -6097,15 +6048,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
if (!sh) {
/* failed to get a stripe - must wait */
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
- conf->retry_read_offset = scnt;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
- conf->retry_read_offset = scnt;
return handled;
}
@@ -6114,9 +6065,12 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
raid5_release_stripe(sh);
handled++;
}
-
- bio_endio(raid_bio);
-
+ remaining = raid5_dec_bi_active_stripes(raid_bio);
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
+ bio_endio(raid_bio);
+ }
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return handled;
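As restored, retry_aligned_read() can give up part-way when no stripe_head is available; it records how far it got with raid5_set_bi_processed_stripes() and, on the next attempt, skips the stripes already handled. The following stand-alone sketch models only that resume-from-offset loop; the fake_bio struct, the availability limit and the function names are illustrative assumptions, not the kernel code.

/* Illustrative only: models how the restored retry_aligned_read()
 * records progress and resumes, using a plain counter instead of the
 * packed bi_phys_segments field. */
#include <stdbool.h>
#include <stdio.h>

struct fake_bio {
    int nr_stripes;       /* total stripes the request spans */
    int processed;        /* models raid5_bi_processed_stripes() */
};

/* pretend stripe_heads run out after a few allocations per pass */
static bool get_stripe(int available, int *used)
{
    if (*used >= available)
        return false;
    (*used)++;
    return true;
}

/* returns true once the whole request has been handled */
static bool retry_aligned(struct fake_bio *bio, int available)
{
    int used = 0;

    for (int scnt = 0; scnt < bio->nr_stripes; scnt++) {
        if (scnt < bio->processed)
            continue;                   /* already done this stripe */
        if (!get_stripe(available, &used)) {
            bio->processed = scnt;      /* remember where to resume */
            printf("out of stripes, will resume at %d\n", scnt);
            return false;
        }
        printf("handled stripe %d\n", scnt);
    }
    return true;
}

int main(void)
{
    struct fake_bio bio = { .nr_stripes = 6, .processed = 0 };

    while (!retry_aligned(&bio, 3))
        ;   /* the md thread would re-queue and retry later */
    printf("request complete\n");
    return 0;
}
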
@@ -6176,7 +6130,6 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf;
- struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups;
int handled;
struct blk_plug plug;
@@ -6197,9 +6150,6 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released)
break;
handled += batch_size;
- wait_event_lock_irq(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
- conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6227,13 +6177,24 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
+ if (!bio_list_empty(&conf->return_bi) &&
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ spin_lock_irq(&conf->device_lock);
+ if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+ bio_list_merge(&tmp, &conf->return_bi);
+ bio_list_init(&conf->return_bi);
+ }
+ spin_unlock_irq(&conf->device_lock);
+ return_io(&tmp);
+ }
+
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
int batch_size, released;
- unsigned int offset;
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
@@ -6251,10 +6212,10 @@ static void raid5d(struct md_thread *thread)
}
raid5_activate_delayed(conf);
- while ((bio = remove_bio_from_retry(conf, &offset))) {
+ while ((bio = remove_bio_from_retry(conf))) {
int ok;
spin_unlock_irq(&conf->device_lock);
- ok = retry_aligned_read(conf, bio, offset);
+ ok = retry_aligned_read(conf, bio);
spin_lock_irq(&conf->device_lock);
if (!ok)
break;
@@ -6638,7 +6599,6 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
- INIT_LIST_HEAD(&group->loprio_list);
group->conf = conf;
group->workers = workers + i * cnt;
@@ -6743,10 +6703,7 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
- kfree(conf->pending_data);
kfree(conf);
}
@@ -6892,14 +6849,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
- INIT_LIST_HEAD(&conf->free_list);
- INIT_LIST_HEAD(&conf->pending_list);
- conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
- PENDING_IO_MAX, GFP_KERNEL);
- if (!conf->pending_data)
- goto abort;
- for (i = 0; i < PENDING_IO_MAX; i++)
- list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
&new_group)) {
@@ -6915,14 +6864,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
- INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
+ bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
atomic_set(&conf->active_aligned_reads, 0);
+ bio_list_init(&conf->pending_bios);
spin_lock_init(&conf->pending_bios_lock);
conf->batch_bio_dispatch = true;
rdev_for_each(rdev, mddev) {
@@ -6956,9 +6906,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
- if (!conf->bio_split)
- goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -7410,6 +7357,15 @@ static int raid5_run(struct mddev *mddev)
stripe = (stripe | (stripe-1)) + 1;
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
+
+ /*
+ * We use a 16-bit counter of active stripes in bi_phys_segments;
+ * one count is reserved for the over-loaded initial value, hence
+ * the 0xfffe limit below.
+ */
+ blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+ blk_queue_max_discard_sectors(mddev->queue,
+ 0xfffe * STRIPE_SECTORS);
+
/*
* unaligned part of discard request will be ignored, so can't
* guarantee discard_zeroes_data
@@ -7463,7 +7419,7 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+ if (log_init(conf, journal_dev))
goto abort;
return 0;
@@ -7581,12 +7537,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
- * neilb: there is no locking about new writes here,
- * so this cannot be safe.
*/
- if (atomic_read(&conf->active_stripes)) {
+ if (atomic_read(&mddev->writes_pending))
return -EBUSY;
- }
log_exit(conf);
return 0;
}
@@ -7672,7 +7625,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- log_init(conf, rdev, false);
+ log_init(conf, rdev);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7822,9 +7775,6 @@ static int check_reshape(struct mddev *mddev)
mddev->chunk_sectors)
) < 0)
return -ENOMEM;
-
- if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
- return 0; /* never bother to shrink */
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -8315,6 +8265,20 @@ static void *raid6_takeover(struct mddev *mddev)
return setup_conf(mddev);
}
+static void raid5_reset_stripe_cache(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ mutex_lock(&conf->cache_size_mutex);
+ while (conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
+ while (conf->min_nr_stripes > conf->max_nr_stripes &&
+ grow_one_stripe(conf, GFP_KERNEL))
+ ;
+ mutex_unlock(&conf->cache_size_mutex);
+}
+
static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
{
struct r5conf *conf;
@@ -8329,42 +8293,18 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
return -ENODEV;
}
- if (strncmp(buf, "ppl", 3) == 0) {
- /* ppl only works with RAID 5 */
- if (!raid5_has_ppl(conf) && conf->level == 5) {
- err = log_init(conf, NULL, true);
- if (!err) {
- err = resize_stripes(conf, conf->pool_size);
- if (err)
- log_exit(conf);
- }
- } else
- err = -EINVAL;
- } else if (strncmp(buf, "resync", 6) == 0) {
- if (raid5_has_ppl(conf)) {
- mddev_suspend(mddev);
- log_exit(conf);
- mddev_resume(mddev);
- err = resize_stripes(conf, conf->pool_size);
- } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
- r5l_log_disk_error(conf)) {
- bool journal_dev_exists = false;
- struct md_rdev *rdev;
-
- rdev_for_each(rdev, mddev)
- if (test_bit(Journal, &rdev->flags)) {
- journal_dev_exists = true;
- break;
- }
-
- if (!journal_dev_exists) {
- mddev_suspend(mddev);
- clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- mddev_resume(mddev);
- } else /* need remove journal device first */
- err = -EBUSY;
- } else
- err = -EINVAL;
+ if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) {
+ mddev_suspend(mddev);
+ set_bit(MD_HAS_PPL, &mddev->flags);
+ err = log_init(conf, NULL);
+ if (!err)
+ raid5_reset_stripe_cache(mddev);
+ mddev_resume(mddev);
+ } else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) {
+ mddev_suspend(mddev);
+ log_exit(conf);
+ raid5_reset_stripe_cache(mddev);
+ mddev_resume(mddev);
} else {
err = -EINVAL;
}
@@ -8399,7 +8339,6 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
- .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
{
@@ -8448,7 +8387,6 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
- .change_consistency_policy = raid5_change_consistency_policy,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 088afc764301..0328a583910b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -278,6 +278,7 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
+ struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
@@ -487,6 +488,50 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
return NULL;
}
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_processed_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+ return (atomic_read(segments) >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+ return atomic_sub_return(1, segments) & 0xffff;
+}
+
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+ atomic_inc(segments);
+}
+
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+ unsigned int cnt)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ int old, new;
+
+ do {
+ old = atomic_read(segments);
+ new = (old & 0xffff) | (cnt << 16);
+ } while (atomic_cmpxchg(segments, old, new) != old);
+}
+
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+ atomic_set(segments, cnt);
+}
+
#define NR_STRIPES 256
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
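The helpers above overload the 32-bit bi_phys_segments field: the low 16 bits carry a biased count of stripes still referencing the bio (the request completes when it drops to zero) and the high 16 bits record how many stripes of an aligned read were already processed. A stand-alone sketch of the same packing using C11 atomics; it mirrors the arithmetic of the helpers but is not the kernel atomic_t API.

/* Illustrative only: same bit layout as the raid5_*_bi_* helpers above,
 * expressed with C11 atomics on a plain 32-bit field. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int segments;   /* stands in for bi_phys_segments */

static void set_stripes(unsigned int cnt)    /* raid5_set_bi_stripes() */
{
    atomic_store(&segments, cnt);            /* processed = 0, active = cnt */
}

static void inc_active(void)                 /* raid5_inc_bi_active_stripes() */
{
    atomic_fetch_add(&segments, 1);
}

static int dec_active(void)                  /* raid5_dec_bi_active_stripes() */
{
    return (atomic_fetch_sub(&segments, 1) - 1) & 0xffff;
}

static int processed(void)                   /* raid5_bi_processed_stripes() */
{
    return (atomic_load(&segments) >> 16) & 0xffff;
}

static void set_processed(unsigned int cnt)  /* raid5_set_bi_processed_stripes() */
{
    unsigned int old, new;

    do {
        old = atomic_load(&segments);
        new = (old & 0xffff) | (cnt << 16);
    } while (!atomic_compare_exchange_weak(&segments, &old, new));
}

int main(void)
{
    set_stripes(1);          /* biased initial count, as in remove_bio_from_retry() */
    inc_active();            /* a stripe takes a reference */
    set_processed(5);        /* five stripes of the read already handled */

    printf("processed=%d\n", processed());           /* 5 */
    printf("active after dec=%d\n", dec_active());   /* 1 */
    printf("active after dec=%d\n", dec_active());   /* 0 -> caller may end the bio */
    return 0;
}

Running the sketch prints processed=5 followed by the two decrements, 1 then 0; zero is the point at which the callers above invoke md_write_end() and bio_endio().
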
@@ -510,7 +555,6 @@ struct r5worker {
struct r5worker_group {
struct list_head handle_list;
- struct list_head loprio_list;
struct r5conf *conf;
struct r5worker *workers;
int stripes_cnt;
@@ -540,14 +584,6 @@ enum r5_cache_state {
*/
};
-#define PENDING_IO_MAX 512
-#define PENDING_IO_ONE_FLUSH 128
-struct r5pending_data {
- struct list_head sibling;
- sector_t sector; /* stripe sector */
- struct bio_list bios;
-};
-
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
@@ -585,12 +621,10 @@ struct r5conf {
*/
struct list_head handle_list; /* stripes needing handling */
- struct list_head loprio_list; /* low priority stripes */
struct list_head hold_list; /* preread ready stripes */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
struct bio *retry_read_aligned; /* currently retrying aligned bios */
- unsigned int retry_read_offset; /* sector offset into retry_read_aligned */
struct bio *retry_read_aligned_list; /* aligned bios retry list */
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t active_aligned_reads;
@@ -600,6 +634,9 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
+ /* bios to have bi_end_io called after metadata is synced */
+ struct bio_list return_bi;
+
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
@@ -654,7 +691,6 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
- struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
@@ -665,17 +701,13 @@ struct r5conf {
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
- void *log_private;
- spinlock_t pending_bios_lock;
- bool batch_bio_dispatch;
- struct r5pending_data *pending_data;
- struct list_head free_list;
- struct list_head pending_list;
- int pending_data_cnt;
- struct r5pending_data *next_pending_data;
-};
+ struct bio_list pending_bios;
+ spinlock_t pending_bios_lock;
+ bool batch_bio_dispatch;
+ void *log_private;
+};
/*
* Our supported algorithms
@@ -749,4 +781,7 @@ extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 132ee538394c..77678af596a0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -54,7 +54,6 @@ struct blk_mq_hw_ctx {
atomic_t nr_active;
- struct delayed_work delayed_run_work;
struct delayed_work delay_work;
struct blk_mq_cpu_notifier cpu_notifier;
@@ -240,7 +239,6 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
-void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index ef7bf4139b6a..abd7c01c84db 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -99,7 +99,6 @@ int __must_check percpu_ref_init(struct percpu_ref *ref,
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch);
-void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index da339773d25a..6111bcb28376 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -249,22 +249,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
smp_store_release(&ref->percpu_count_ptr,
ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}
-EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
-
-/**
- * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
- * @ref: percpu_ref to switch to atomic mode
- *
- * Schedule switching the ref to atomic mode, and wait for the
- * switch to complete. Caller must ensure that no other thread
- * will switch back to percpu mode.
- */
-void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
-{
- percpu_ref_switch_to_atomic(ref, NULL);
- wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-}
-EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
/**
* percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
@@ -292,7 +276,6 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
__percpu_ref_switch_to_percpu(ref);
}
-EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
/**
* percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation