/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *          0 | --- | --- | --- | +----+
 *          8 | -W- | -W- | --- | | pp |   data_sector = 8
 *         16 | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *         24 | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0), they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write, header_page contains the
 * ppl_header. PPL entries for logged stripes are added in ppl_log_stripe().
 * A stripe_head can be appended to the last entry if it meets the conditions
 * for a valid entry described above, otherwise a new entry is added. Checksums
 * of entries are calculated incrementally as stripes containing partial parity
 * are being added. ppl_submit_iounit() calculates the checksum of the header
 * and submits a bio containing the header page and partial parity pages
 * (sh->ppl_page) for all stripes of the io_unit. When the PPL write completes,
 * the stripes associated with the io_unit are released and raid5d starts
 * writing their data and parity. When all stripes are written, the io_unit is
 * freed and the next can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another
 * io_unit can't be submitted until the previous has completed (PPL and stripe
 * data+parity is written). The log->io_list tracks all io_units of a log
 * (for a single member disk). New io_units are added to the end of the list
 * and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 */
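
/*
 * For reference, a sketch of the on-disk format as declared in
 * linux/raid/md_p.h (that header holds the authoritative definitions;
 * the layout below is paraphrased from it):
 *
 *	struct ppl_header_entry {
 *		__le64 data_sector;	// first modified raid sector, in
 *					// block_size units
 *		__le32 pp_size;		// length of partial parity, 0 for
 *					// full stripe writes
 *		__le32 data_size;	// length of modified data
 *		__le32 parity_disk;	// member disk containing parity
 *		__le32 checksum;	// ~crc32c of this entry's partial parity
 *	};
 *
 *	struct ppl_header {
 *		__u8 reserved[PPL_HDR_RESERVED];	// filled with 0xff
 *		__le32 signature;	// raid array identifier
 *		__le32 padding;
 *		__le64 generation;	// io_unit sequence number (seq)
 *		__le32 entries_count;
 *		__le32 checksum;	// ~crc32c of the whole 4KB header
 *		struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
 *	};
 */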

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */
	struct list_head no_mem_stripes;/* stripes to retry if failed to
					 * allocate io_unit */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
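	/*
	 * Either way the result is the same: partial parity xored with the
	 * new data of the modified chunks yields the full parity of the
	 * stripe, which is what recovery relies on after an unclean shutdown.
	 */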
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/* rmw: xor old data and parity from updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
				xor_srcs[count++] = dev->page;
		}
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0) +
			  sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;

	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;

	memset(io, 0, sizeof(*io));
	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
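		/*
		 * E.g. in the 4 disk, 16k chunk example from the top of this
		 * file, appending the stripe_head at sector 24 to an entry
		 * already covering sectors 8 and 16 (data_sector_last = 8,
		 * data_size_last = 2 * 2 * 4k, data_disks = 2):
		 * (24 - 8) * 2 == (2 * 2 * 4k) >> 9 == 32 and both sectors
		 * are in the same chunk, so the stripe is appended.
		 */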
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
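		/* crc32c seed (~0); inverted in ppl_submit_iounit() */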
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&sh->log_list, &log->no_mem_stripes);
		spin_unlock_irq(&log->io_list_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_error)
		md_error(ppl_conf->mddev, log->rdev);

	mempool_free(io->header_page, ppl_conf->meta_pool);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio);
}

static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

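		/*
		 * data_sector was tracked in 512-byte sectors; store it on
		 * disk in ppl_conf->block_size units and finalize the entry
		 * checksum by inverting the accumulated crc32c.
		 */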
		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	bio->bi_private = io;
	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	spin_lock_irqsave(&log->io_list_lock, flags);

	list_del(&io->log_sibling);
	mempool_free(io, log->ppl_conf->io_pool);

	if (!list_empty(&log->no_mem_stripes)) {
		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
							  struct stripe_head,
							  log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	mempool_destroy(ppl_conf->meta_pool);
	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
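	/*
	 * For example, with 4k pages (STRIPE_SECTORS == 8) and a configured
	 * ppl.size of 250 sectors: 250 - 8 == 242 data sectors, rounded down
	 * to 240, for a validated ppl.size of 248 sectors.
	 */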
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
	if (!ppl_conf->meta_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);
		INIT_LIST_HEAD(&log->no_mem_stripes);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				need_cache_flush = true;
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	conf->log_private = ppl_conf;

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}