diff --git a/drivers/md/md.c b/drivers/md/md.c index 82821ee0d57fa..01175dac0db63 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ + /* + * NOTE: some pers->run(), for example r5l_recovery_log(), wakes + * up mddev->thread. It is important to initialize critical + * resources for mddev->thread BEFORE calling pers->run(). + */ err = pers->run(mddev); if (err) pr_warn("md: pers->run() failed ...\n"); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0e8ed2c327b07..302dea3296ba5 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -162,6 +162,8 @@ struct r5l_log { /* to submit async io_units, to fulfill ordering of flush */ struct work_struct deferred_io_work; + /* to disable write back during in degraded mode */ + struct work_struct disable_writeback_work; }; /* @@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work) r5l_do_submit_io(log, io); } +static void r5c_disable_writeback_async(struct work_struct *work) +{ + struct r5l_log *log = container_of(work, struct r5l_log, + disable_writeback_work); + struct mddev *mddev = log->rdev->mddev; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return; + pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", + mdname(mddev)); + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); +} + static void r5l_submit_current_io(struct r5l_log *log) { struct r5l_io_unit *io = log->current_io; @@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log) next_checkpoint = r5c_calculate_new_cp(conf); spin_unlock_irq(&log->io_list_lock); - BUG_ON(reclaimable < 0); - if (reclaimable == 0 || !write_super) return; @@ -2062,7 +2077,7 @@ static int r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { - struct stripe_head *sh, *next; + struct stripe_head *sh; struct mddev *mddev = log->rdev->mddev; struct page *page; sector_t next_checkpoint = MaxSector; @@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, WARN_ON(list_empty(&ctx->cached_list)); - list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { + list_for_each_entry(sh, &ctx->cached_list, lru) { struct r5l_meta_block *mb; int i; int offset; @@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, ctx->pos = write_pos; ctx->seq += 1; next_checkpoint = sh->log_start; - list_del_init(&sh->lru); - raid5_release_stripe(sh); } log->next_checkpoint = next_checkpoint; __free_page(page); return 0; } +static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct stripe_head *sh, *next; + + if (ctx->data_only_stripes == 0) + return; + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; + + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + + md_wakeup_thread(conf->mddev->thread); + /* reuse conf->wait_for_quiescent in recovery */ + wait_event(conf->wait_for_quiescent, + atomic_read(&conf->active_stripes) == 0); + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; +} + static int r5l_recovery_log(struct r5l_log *log) { struct mddev *mddev = log->rdev->mddev; @@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log) pos = ctx.pos; ctx.seq += 10000; - if (ctx.data_only_stripes == 0) { - log->next_checkpoint = ctx.pos; - r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); - ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); - } if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) pr_debug("md/raid:%s: starting from clean shutdown\n", mdname(mddev)); - else { + else pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", mdname(mddev), ctx.data_only_stripes, ctx.data_parity_stripes); - if (ctx.data_only_stripes > 0) - if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { - pr_err("md/raid:%s: failed to rewrite stripes to journal\n", - mdname(mddev)); - return -EIO; - } + if (ctx.data_only_stripes == 0) { + log->next_checkpoint = ctx.pos; + r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); + ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); + } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { + pr_err("md/raid:%s: failed to rewrite stripes to journal\n", + mdname(mddev)); + return -EIO; } log->log_start = ctx.pos; log->seq = ctx.seq; log->last_checkpoint = pos; r5l_write_super(log, pos); + + r5c_recovery_flush_data_only_stripes(log, &ctx); return 0; } @@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, val > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + if (raid5_calc_degraded(conf) > 0 && + val == R5C_JOURNAL_MODE_WRITE_BACK) + return -EINVAL; + mddev_suspend(mddev); conf->log->r5c_journal_mode = val; mddev_resume(mddev); @@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf, set_bit(STRIPE_R5C_CACHING, &sh->state); } + /* + * When run in degraded mode, array is set to write-through mode. + * This check helps drain pending write safely in the transition to + * write-through mode. + */ + if (s->failed) { + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + for (i = disks; i--; ) { dev = &sh->dev[i]; /* if non-overwrite, use writing-out phase */ @@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh) struct page *p = sh->dev[i].orig_page; sh->dev[i].orig_page = sh->dev[i].page; + clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + if (!using_disk_info_extra_page) put_page(p); } @@ -2555,6 +2610,19 @@ static int r5l_load_log(struct r5l_log *log) return ret; } +void r5c_update_on_rdev_error(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + + if (raid5_calc_degraded(conf) > 0 && + conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + schedule_work(&log->disable_writeback_work); +} + int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->no_space_stripes_lock); INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); + INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; INIT_LIST_HEAD(&log->stripe_in_journal_list); @@ -2659,6 +2728,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) void r5l_exit_log(struct r5l_log *log) { + flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); bioset_free(log->bs); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c13e4be9c9e..3c7e106c12a24 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. */ -static int calc_degraded(struct r5conf *conf) +int raid5_calc_degraded(struct r5conf *conf) { int degraded, degraded2; int i; @@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf) if (conf->mddev->reshape_position == MaxSector) return conf->mddev->degraded > conf->max_degraded; - degraded = calc_degraded(conf); + degraded = raid5_calc_degraded(conf); if (degraded > conf->max_degraded) return 1; return 0; @@ -1015,7 +1015,17 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); - sh->dev[i].vec.bv_page = sh->dev[i].page; + + if (!op_is_write(op) && + test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * issuing read for a page in journal, this + * must be preparing for prexor in rmw; read + * the data into orig_page + */ + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; + else + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi) } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * end read for a page in journal, this + * must be preparing for prexor in rmw + */ + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { @@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) spin_lock_irqsave(&conf->device_lock, flags); clear_bit(In_sync, &rdev->flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); + r5c_update_on_rdev_error(mddev); } /* @@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) return r_sector; } +/* + * There are cases where we want handle_stripe_dirtying() and + * schedule_reconstruction() to delay towrite to some dev of a stripe. + * + * This function checks whether we want to delay the towrite. Specifically, + * we delay the towrite when: + * + * 1. degraded stripe has a non-overwrite to the missing dev, AND this + * stripe has data in journal (for other devices). + * + * In this case, when reading data for the non-overwrite dev, it is + * necessary to handle complex rmw of write back cache (prexor with + * orig_page, and xor with page). To keep read path simple, we would + * like to flush data in journal to RAID disks first, so complex rmw + * is handled in the write patch (handle_stripe_dirtying). + * + */ +static inline bool delay_towrite(struct r5dev *dev, + struct stripe_head_state *s) +{ + return !test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal; +} + static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) @@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite) { + if (dev->towrite && !delay_towrite(dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) @@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx) return rv; } -/* fetch_block - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill to continue - */ - static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } +/* fetch_block - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill to continue + */ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) + !sh->reconstruct_state) { + + /* + * For degraded stripe with data in journal, do not handle + * read requests yet, instead, flush the stripe to raid + * disks first, this avoids handling complex rmw of write + * back cache (prexor with orig_page, and then xor with + * page) in the read path + */ + if (s->injournal && s->failed) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + goto out; + } + for (i = disks; i--; ) if (fetch_block(sh, s, i, disks)) break; + } +out: set_bit(STRIPE_HANDLE, &sh->state); } @@ -3594,6 +3651,21 @@ static void handle_stripe_clean_event(struct r5conf *conf, break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } +/* + * For RMW in write back cache, we need extra page in prexor to store the + * old data. This page is stored in dev->orig_page. + * + * This function checks whether we have data for prexor. The exact logic + * is: + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) + */ +static inline bool uptodate_for_rmw(struct r5dev *dev) +{ + return (test_bit(R5_UPTODATE, &dev->flags)) && + (!test_bit(R5_InJournal, &dev->flags) || + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); +} + static int handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, @@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || + if (((dev->towrite && !delay_towrite(dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !((test_bit(R5_UPTODATE, &dev->flags) && - (!test_bit(R5_InJournal, &dev->flags) || - dev->page != dev->orig_page)) || + !(uptodate_for_rmw(dev) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; @@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_InJournal, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || + if (((dev->towrite && !delay_towrite(dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !((test_bit(R5_UPTODATE, &dev->flags) && - (!test_bit(R5_InJournal, &dev->flags) || - dev->page != dev->orig_page)) || + !(uptodate_for_rmw(dev) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, @@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_InJournal, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; if (test_bit(R5_Insync, &dev->flags) && @@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev) /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); if (has_failed(conf)) { pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", @@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev) } } spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); return count; @@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev) * pre and post number of devices. */ spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; @@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev) } else { int d; spin_lock_irq(&conf->device_lock); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irq(&conf->device_lock); for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ed8e1362ab369..1440fa26e2962 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -322,6 +322,11 @@ enum r5dev_flags { * data and parity being written are in the journal * device */ + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into + * dev->orig_page for prexor. When this flag is + * set, orig_page contains latest data in the + * raid disk. + */ }; /* @@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, extern struct stripe_head * raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce); +extern int raid5_calc_degraded(struct r5conf *conf); extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); extern void r5l_exit_log(struct r5l_log *log); extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); @@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num); extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; +extern void r5c_update_on_rdev_error(struct mddev *mddev); #endif