Blame - drivers/md/raid5.h - SHIFTPHONES/mainline/linux

blob: 9e8486a9e4451df367b9ececb15ff34476d535b8 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0 */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2	#ifndef _RAID5_H
				3	#define _RAID5_H
				4
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5	#include <linux/raid/xor.h>
Dan Williams	ad283ea	2009-08-29 19:09:26 -0700	[diff] [blame]	6	#include <linux/dmaengine.h>
Davidlohr Bueso	770b1d2	2021-11-15 17:23:17 -0800	[diff] [blame]	7	#include <linux/local_lock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	8
				9	/*
				10	*
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	11	* Each stripe contains one buffer per device. Each buffer can be in
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	12	* one of a number of states stored in "flags". Changes between
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	13	* these states happen almost exclusively under the protection of the
				14	* STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
				15	* these are not protected by STRIPE_ACTIVE.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	16	*
				17	* The flag bits that are used to represent these states are:
				18	* R5_UPTODATE and R5_LOCKED
				19	*
				20	* State Empty == !UPTODATE, !LOCK
				21	* We have no data, and there is no active request
				22	* State Want == !UPTODATE, LOCK
				23	* A read request is being submitted for this block
				24	* State Dirty == UPTODATE, LOCK
				25	* Some new data is in this buffer, and it is being written out
				26	* State Clean == UPTODATE, !LOCK
				27	* We have valid data which is the same as on disc
				28	*
				29	* The possible state transitions are:
				30	*
				31	* Empty -> Want - on read or write to get old data for parity calc
NeilBrown	ede7ee8	2011-12-23 10:17:52 +1100	[diff] [blame]	32	* Empty -> Dirty - on compute_parity to satisfy write/sync request.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	33	* Empty -> Clean - on compute_block when computing a block for failed drive
				34	* Want -> Empty - on failed read
				35	* Want -> Clean - on successful completion of read request
				36	* Dirty -> Clean - on successful completion of write request
				37	* Dirty -> Clean - on failed write
				38	* Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
				39	*
				40	* The Want->Empty, Want->Clean, Dirty->Clean, transitions
				41	* all happen in b_end_io at interrupt time.
				42	* Each sets the Uptodate bit before releasing the Lock bit.
				43	* This leaves one multi-stage transition:
				44	* Want->Dirty->Clean
				45	* This is safe because thinking that a Clean buffer is actually dirty
				46	* will at worst delay some action, and the stripe will be scheduled
				47	* for attention after the transition is complete.
				48	*
				49	* There is one possibility that is not covered by these states. That
				50	* is if one drive has failed and there is a spare being rebuilt. We
				51	* can't distinguish between a clean block that has been generated
				52	* from parity calculations, and a clean block that has been
				53	* successfully written to the spare ( or to parity when resyncing).
Michael Opdenacker	aa5e5dc	2013-09-18 06:00:43 +0200	[diff] [blame]	54	* To distinguish these states we have a stripe bit STRIPE_INSYNC that
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	55	* is set whenever a write is scheduled to the spare, or to the parity
				56	* disc if there is no spare. A sync request clears this bit, and
				57	* when we find it set with no buffers locked, we know the sync is
				58	* complete.
				59	*
				60	* Buffers for the md device that arrive via make_request are attached
				61	* to the appropriate stripe in one of two lists linked on b_reqnext.
				62	* One list (bh_read) for read requests, one (bh_write) for write.
				63	* There should never be more than one buffer on the two lists
				64	* together, but we are not guaranteed of that so we allow for more.
				65	*
				66	* If a buffer is on the read list when the associated cache buffer is
				67	* Uptodate, the data is copied into the read buffer and it's b_end_io
				68	* routine is called. This may happen in the end_request routine only
				69	* if the buffer has just successfully been read. end_request should
				70	* remove the buffers from the list and then set the Uptodate bit on
				71	* the buffer. Other threads may do this only if they first check
				72	* that the Uptodate bit is set. Once they have checked that they may
				73	* take buffers off the read queue.
				74	*
				75	* When a buffer on the write list is committed for write it is copied
				76	* into the cache buffer, which is then marked dirty, and moved onto a
				77	* third list, the written list (bh_written). Once both the parity
				78	* block and the cached buffer are successfully written, any buffer on
				79	* a written list can be returned with b_end_io.
				80	*
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	81	* The write list and read list both act as fifos. The read list,
				82	* write list and written list are protected by the device_lock.
				83	* The device_lock is only for list manipulations and will only be
				84	* held for a very short time. It can be claimed from interrupts.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	85	*
				86	*
				87	* Stripes in the stripe cache can be on one of two lists (or on
				88	* neither). The "inactive_list" contains stripes which are not
				89	* currently being used for any request. They can freely be reused
				90	* for another stripe. The "handle_list" contains stripes that need
				91	* to be handled in some way. Both of these are fifo queues. Each
				92	* stripe is also (potentially) linked to a hash bucket in the hash
				93	* table so that it can be found by sector number. Stripes that are
				94	* not hashed must be on the inactive_list, and will normally be at
				95	* the front. All stripes start life this way.
				96	*
				97	* The inactive_list, handle_list and hash bucket lists are all protected by the
				98	* device_lock.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	99	* - stripes have a reference counter. If count==0, they are on a list.
				100	* - If a stripe might need handling, STRIPE_HANDLE is set.
				101	* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
				102	* handle_list else inactive_list
				103	*
				104	* This, combined with the fact that STRIPE_HANDLE is only ever
				105	* cleared while a stripe has a non-zero count means that if the
				106	* refcount is 0 and STRIPE_HANDLE is set, then it is on the
				107	* handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
				108	* the stripe is on inactive_list.
				109	*
				110	* The possible transitions are:
				111	* activate an unhashed/inactive stripe (get_active_stripe())
				112	* lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
				113	* activate a hashed, possibly active stripe (get_active_stripe())
				114	* lockdev check-hash if(!cnt++)unlink-stripe unlockdev
				115	* attach a request to an active stripe (add_stripe_bh())
				116	* lockdev attach-buffer unlockdev
				117	* handle a stripe (handle_stripe())
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	118	* setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	119	* (lockdev check-buffers unlockdev) ..
				120	* change-state ..
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	121	* record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	122	* release an active stripe (release_stripe())
				123	* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
				124	*
				125	* The refcount counts each thread that has activated the stripe,
				126	* plus raid5d if it is handling it, plus one for each active request
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	127	* on a cached buffer, and plus one if the stripe is undergoing stripe
				128	* operations.
				129	*
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	130	* The stripe operations are:
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	131	* -copying data between the stripe cache and user application buffers
				132	* -computing blocks to save a disk access, or to recover a missing block
				133	* -updating the parity on a write operation (reconstruct write and
				134	* read-modify-write)
				135	* -checking parity correctness
				136	* -running i/o to disk
				137	* These operations are carried out by raid5_run_ops which uses the async_tx
				138	* api to (optionally) offload operations to dedicated hardware engines.
				139	* When requesting an operation handle_stripe sets the pending bit for the
				140	* operation and increments the count. raid5_run_ops is then run whenever
				141	* the count is non-zero.
				142	* There are some critical dependencies between the operations that prevent some
				143	* from being requested while another is in flight.
				144	* 1/ Parity check operations destroy the in cache version of the parity block,
				145	* so we prevent parity dependent operations like writes and compute_blocks
				146	* from starting while a check is in progress. Some dma engines can perform
				147	* the check without damaging the parity block, in these cases the parity
				148	* block is re-marked up to date (assuming the check was successful) and is
				149	* not re-read from disk.
				150	* 2/ When a write operation is requested we immediately lock the affected
				151	* blocks, and mark them as not up to date. This causes new read requests
				152	* to be held off, as well as parity checks and compute block operations.
				153	* 3/ Once a compute block operation has been requested handle_stripe treats
				154	* that block as if it is up to date. raid5_run_ops guarantees that any
				155	* operation that is dependent on the compute block result is initiated after
				156	* the compute block completes.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	157	*/
				158
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	159	/*
NeilBrown	f72ffdd	2014-09-30 14:23:59 +1000	[diff] [blame]	160	* Operations state - intermediate states that are visible outside of
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	161	* STRIPE_ACTIVE.
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	162	* In general _idle indicates nothing is running, _run indicates a data
				163	* processing operation is active, and _result means the data processing result
				164	* is stable and can be acted upon. For simple operations like biofill and
				165	* compute that only have an _idle and _run state they are indicated with
				166	* sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
				167	*/
				168	/**
				169	* enum check_states - handles syncing / repairing a stripe
				170	* @check_state_idle - check operations are quiesced
				171	* @check_state_run - check operation is running
				172	* @check_state_check_result - set outside lock when check result is valid
				173	* @check_state_compute_run - check failed and we are repairing
				174	* @check_state_compute_result - set outside lock when compute result is valid
				175	*/
				176	enum check_states {
				177	check_state_idle = 0,
Dan Williams	ac6b53b	2009-07-14 13:40:19 -0700	[diff] [blame]	178	check_state_run, /* xor parity check */
				179	check_state_run_q, /* q-parity check */
				180	check_state_run_pq, /* pq dual parity check */
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	181	check_state_check_result,
				182	check_state_compute_run, /* parity repair */
				183	check_state_compute_result,
				184	};
				185
				186	/**
				187	* enum reconstruct_states - handles writing or expanding a stripe
				188	*/
				189	enum reconstruct_states {
				190	reconstruct_state_idle = 0,
Dan Williams	d8ee072	2008-06-28 08:32:06 +1000	[diff] [blame]	191	reconstruct_state_prexor_drain_run, /* prexor-write */
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	192	reconstruct_state_drain_run, /* write */
				193	reconstruct_state_run, /* expand */
Dan Williams	d8ee072	2008-06-28 08:32:06 +1000	[diff] [blame]	194	reconstruct_state_prexor_drain_result,
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	195	reconstruct_state_drain_result,
				196	reconstruct_state_result,
				197	};
				198
Yufen Yu	046169f	2020-08-20 09:22:12 -0400	[diff] [blame]	199	#define DEFAULT_STRIPE_SIZE 4096
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	200	struct stripe_head {
NeilBrown	fccddba	2006-01-06 00:20:33 -0800	[diff] [blame]	201	struct hlist_node hash;
NeilBrown	d0dabf7	2009-03-31 14:39:38 +1100	[diff] [blame]	202	struct list_head lru; /* inactive_list or handle_list */
Shaohua Li	773ca82	2013-08-27 17:50:39 +0800	[diff] [blame]	203	struct llist_node release_list;
NeilBrown	d1688a6	2011-10-11 16:49:52 +1100	[diff] [blame]	204	struct r5conf *raid_conf;
NeilBrown	86b42c7	2009-03-31 15:19:03 +1100	[diff] [blame]	205	short generation; /* increments with every
				206	* reshape */
NeilBrown	d0dabf7	2009-03-31 14:39:38 +1100	[diff] [blame]	207	sector_t sector; /* sector of this row */
				208	short pd_idx; /* parity disk index */
				209	short qd_idx; /* 'Q' disk index for raid6 */
NeilBrown	67cc2b8	2009-03-31 14:39:38 +1100	[diff] [blame]	210	short ddf_layout;/* use DDF ordering to calculate Q */
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	211	short hash_lock_index;
NeilBrown	d0dabf7	2009-03-31 14:39:38 +1100	[diff] [blame]	212	unsigned long state; /* state flags */
				213	atomic_t count; /* nr of active thread/requests */
NeilBrown	7262668	2005-09-09 16:23:54 -0700	[diff] [blame]	214	int bm_seq; /* sequence number for bitmap flushes */
NeilBrown	d0dabf7	2009-03-31 14:39:38 +1100	[diff] [blame]	215	int disks; /* disks in stripe */
shli@kernel.org	7a87f43	2014-12-15 12:57:03 +1100	[diff] [blame]	216	int overwrite_disks; /* total overwrite disks in stripe,
				217	* this is only checked when stripe
				218	* has STRIPE_BATCH_READY
				219	*/
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	220	enum check_states check_state;
Dan Williams	600aa10	2008-06-28 08:32:05 +1000	[diff] [blame]	221	enum reconstruct_states reconstruct_state;
Shaohua Li	b17459c	2012-07-19 16:01:31 +1000	[diff] [blame]	222	spinlock_t stripe_lock;
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	223	int cpu;
Shaohua Li	bfc90cb	2013-08-29 15:40:32 +0800	[diff] [blame]	224	struct r5worker_group *group;
shli@kernel.org	59fc630	2014-12-15 12:57:03 +1100	[diff] [blame]	225
				226	struct stripe_head *batch_head; /* protected by stripe lock */
				227	spinlock_t batch_lock; /* only header's lock is useful */
				228	struct list_head batch_list; /* protected by head's batch lock*/
Shaohua Li	f6bed0e	2015-08-13 14:31:59 -0700	[diff] [blame]	229
Artur Paszkiewicz	3418d03	2017-03-09 09:59:59 +0100	[diff] [blame]	230	union {
				231	struct r5l_io_unit *log_io;
				232	struct ppl_io_unit *ppl_io;
				233	};
				234
Shaohua Li	f6bed0e	2015-08-13 14:31:59 -0700	[diff] [blame]	235	struct list_head log_list;
Song Liu	a39f7af	2016-11-17 15:24:40 -0800	[diff] [blame]	236	sector_t log_start; /* first meta block on the journal */
				237	struct list_head r5c; /* for r5c_cache->stripe_in_journal */
Artur Paszkiewicz	3418d03	2017-03-09 09:59:59 +0100	[diff] [blame]	238
				239	struct page *ppl_page; /* partial parity of this stripe */
Dan Williams	417b8d4	2009-10-16 16:25:22 +1100	[diff] [blame]	240	/**
				241	* struct stripe_operations
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	242	* @target - STRIPE_OP_COMPUTE_BLK target
Dan Williams	417b8d4	2009-10-16 16:25:22 +1100	[diff] [blame]	243	* @target2 - 2nd compute target in the raid6 case
				244	* @zero_sum_result - P and Q verification flags
				245	* @request - async service request flags for raid_run_ops
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	246	*/
				247	struct stripe_operations {
Dan Williams	ac6b53b	2009-07-14 13:40:19 -0700	[diff] [blame]	248	int target, target2;
Dan Williams	ad283ea	2009-08-29 19:09:26 -0700	[diff] [blame]	249	enum sum_check_flags zero_sum_result;
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	250	} ops;
Yufen Yu	046169f	2020-08-20 09:22:12 -0400	[diff] [blame]	251
				252	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
				253	/* These pages will be used by bios in dev[i] */
				254	struct page **pages;
				255	int nr_pages; /* page array size */
				256	int stripes_per_page;
				257	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	258	struct r5dev {
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	259	/* rreq and rvec are used for the replacement device when
				260	* writing data to both devices.
				261	*/
				262	struct bio req, rreq;
				263	struct bio_vec vec, rvec;
Shaohua Li	d592a99	2014-05-21 17:57:44 +0800	[diff] [blame]	264	struct page *page, *orig_page;
Yufen Yu	7aba13b	2020-08-20 09:22:06 -0400	[diff] [blame]	265	unsigned int offset; /* offset of the page */
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	266	struct bio *toread, *read, *towrite, *written;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	267	sector_t sector; /* sector of this page */
				268	unsigned long flags;
Shaohua Li	f6bed0e	2015-08-13 14:31:59 -0700	[diff] [blame]	269	u32 log_checksum;
Mariusz Dabrowski	2cd259a	2018-04-19 19:28:10 +0200	[diff] [blame]	270	unsigned short write_hint;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	271	} dev[1]; /* allocated with extra space depending on RAID geometry */
				272	};
Dan Williams	a445685	2007-07-09 11:56:43 -0700	[diff] [blame]	273
				274	/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	275	* for handle_stripe.
Dan Williams	a445685	2007-07-09 11:56:43 -0700	[diff] [blame]	276	*/
				277	struct stripe_head_state {
NeilBrown	9a3e110	2011-12-23 10:17:53 +1100	[diff] [blame]	278	/* 'syncing' means that we need to read all devices, either
				279	* to check/correct parity, or to reconstruct a missing device.
				280	* 'replacing' means we are replacing one or more drives and
				281	* the source is valid at this point so we don't need to
				282	* read all devices, just the replacement targets.
				283	*/
				284	int syncing, expanding, expanded, replacing;
Dan Williams	a445685	2007-07-09 11:56:43 -0700	[diff] [blame]	285	int locked, uptodate, to_read, to_write, failed, written;
Dan Williams	b5e98d6	2007-01-02 13:52:31 -0700	[diff] [blame]	286	int to_fill, compute, req_compute, non_overwrite;
Song Liu	1e6d690	2016-11-17 15:24:39 -0800	[diff] [blame]	287	int injournal, just_cached;
NeilBrown	f2b3b44	2011-07-26 11:35:19 +1000	[diff] [blame]	288	int failed_num[2];
NeilBrown	f2b3b44	2011-07-26 11:35:19 +1000	[diff] [blame]	289	int p_failed, q_failed;
NeilBrown	c5709ef	2011-07-26 11:35:20 +1000	[diff] [blame]	290	int dec_preread_active;
				291	unsigned long ops_request;
				292
NeilBrown	3cb0300	2011-10-11 16:45:26 +1100	[diff] [blame]	293	struct md_rdev *blocked_rdev;
NeilBrown	bc2607f	2011-07-28 11:39:22 +1000	[diff] [blame]	294	int handle_bad_blocks;
Shaohua Li	6e74a9c	2015-10-08 21:54:08 -0700	[diff] [blame]	295	int log_failed;
Song Liu	d7bd398	2016-11-23 22:50:39 -0800	[diff] [blame]	296	int waiting_extra_page;
Dan Williams	a445685	2007-07-09 11:56:43 -0700	[diff] [blame]	297	};
				298
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	299	/* Flags for struct r5dev.flags */
				300	enum r5dev_flags {
				301	R5_UPTODATE, /* page contains current data */
				302	R5_LOCKED, /* IO has been submitted on "req" */
NeilBrown	977df36	2011-12-23 10:17:53 +1100	[diff] [blame]	303	R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	304	R5_OVERWRITE, /* towrite covers whole page */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	305	/* and some that are internal to handle_stripe */
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	306	R5_Insync, /* rdev && rdev->in_sync at start */
				307	R5_Wantread, /* want to schedule a read */
				308	R5_Wantwrite,
				309	R5_Overlap, /* There is a pending overlapping request
				310	* on this block */
majianpeng	3f9e7c1	2012-07-31 10:04:21 +1000	[diff] [blame]	311	R5_ReadNoMerge, /* prevent bio from merging in block-layer */
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	312	R5_ReadError, /* seen a read error here recently */
				313	R5_ReWrite, /* have tried to over-write the readerror */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	314
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	315	R5_Expanded, /* This block now has post-expand data */
				316	R5_Wantcompute, /* compute_block in progress treat as
				317	* uptodate
				318	*/
				319	R5_Wantfill, /* dev->toread contains a bio that needs
				320	* filling
				321	*/
				322	R5_Wantdrain, /* dev->towrite needs to be drained */
				323	R5_WantFUA, /* Write should be FUA */
Shaohua Li	bc0934f	2012-05-22 13:55:05 +1000	[diff] [blame]	324	R5_SyncIO, /* The IO is sync */
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	325	R5_WriteError, /* got a write error - need to record it */
				326	R5_MadeGood, /* A bad block has been fixed by writing to it */
				327	R5_ReadRepl, /* Will/did read from replacement rather than orig */
				328	R5_MadeGoodRepl,/* A bad block on the replacement device has been
				329	* fixed by writing to it */
NeilBrown	9a3e110	2011-12-23 10:17:53 +1100	[diff] [blame]	330	R5_NeedReplace, /* This device has a replacement which is not
				331	* up-to-date at this stripe. */
				332	R5_WantReplace, /* We need to update the replacement, we have read
				333	* data in, and now is a good time to write it out.
				334	*/
Shaohua Li	620125f	2012-10-11 13:49:05 +1100	[diff] [blame]	335	R5_Discard, /* Discard the stripe */
Shaohua Li	d592a99	2014-05-21 17:57:44 +0800	[diff] [blame]	336	R5_SkipCopy, /* Don't copy data from bio to stripe cache */
Song Liu	2ded370	2016-11-17 15:24:38 -0800	[diff] [blame]	337	R5_InJournal, /* data being written is in the journal device.
				338	* if R5_InJournal is set for parity pd_idx, all the
				339	* data and parity being written are in the journal
				340	* device
				341	*/
Song Liu	86aa139	2017-01-12 17:22:41 -0800	[diff] [blame]	342	R5_OrigPageUPTDODATE, /* with write back cache, we read old data into
				343	* dev->orig_page for prexor. When this flag is
				344	* set, orig_page contains latest data in the
				345	* raid disk.
				346	*/
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	347	};
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348
				349	/*
				350	* Stripe state
				351	*/
NeilBrown	83206d6	2011-07-26 11:19:49 +1000	[diff] [blame]	352	enum {
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	353	STRIPE_ACTIVE,
NeilBrown	83206d6	2011-07-26 11:19:49 +1000	[diff] [blame]	354	STRIPE_HANDLE,
				355	STRIPE_SYNC_REQUESTED,
				356	STRIPE_SYNCING,
				357	STRIPE_INSYNC,
NeilBrown	f94c0b6	2013-07-22 12:57:21 +1000	[diff] [blame]	358	STRIPE_REPLACED,
NeilBrown	83206d6	2011-07-26 11:19:49 +1000	[diff] [blame]	359	STRIPE_PREREAD_ACTIVE,
				360	STRIPE_DELAYED,
				361	STRIPE_DEGRADED,
				362	STRIPE_BIT_DELAY,
				363	STRIPE_EXPANDING,
				364	STRIPE_EXPAND_SOURCE,
				365	STRIPE_EXPAND_READY,
				366	STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
				367	STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
				368	STRIPE_BIOFILL_RUN,
				369	STRIPE_COMPUTE_RUN,
Shaohua Li	8811b59	2012-08-02 08:33:00 +1000	[diff] [blame]	370	STRIPE_ON_UNPLUG_LIST,
NeilBrown	f8dfcff	2013-03-12 12:18:06 +1100	[diff] [blame]	371	STRIPE_DISCARD,
Shaohua Li	773ca82	2013-08-27 17:50:39 +0800	[diff] [blame]	372	STRIPE_ON_RELEASE_LIST,
shli@kernel.org	da41ba6	2014-12-15 12:57:03 +1100	[diff] [blame]	373	STRIPE_BATCH_READY,
shli@kernel.org	72ac733	2014-12-15 12:57:03 +1100	[diff] [blame]	374	STRIPE_BATCH_ERR,
NeilBrown	d0852df5	2015-05-27 08:43:45 +1000	[diff] [blame]	375	STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
				376	* to batch yet.
				377	*/
Song Liu	2ded370	2016-11-17 15:24:38 -0800	[diff] [blame]	378	STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
				379	* this bit is used in two scenarios:
				380	*
				381	* 1. write-out phase
				382	* set in first entry of r5l_write_stripe
				383	* clear in second entry of r5l_write_stripe
				384	* used to bypass logic in handle_stripe
				385	*
				386	* 2. caching phase
				387	* set in r5c_try_caching_write()
				388	* clear when journal write is done
				389	* used to initiate r5c_cache_data()
				390	* also used to bypass logic in handle_stripe
				391	*/
				392	STRIPE_R5C_CACHING, /* the stripe is in caching phase
				393	* see more detail in the raid5-cache.c
				394	*/
Song Liu	1e6d690	2016-11-17 15:24:39 -0800	[diff] [blame]	395	STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or
				396	* in conf->r5c_partial_stripe_list)
				397	*/
				398	STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or
				399	* in conf->r5c_full_stripe_list)
				400	*/
Song Liu	3bddb7f	2016-11-18 16:46:50 -0800	[diff] [blame]	401	STRIPE_R5C_PREFLUSH, /* need to flush journal device */
NeilBrown	83206d6	2011-07-26 11:19:49 +1000	[diff] [blame]	402	};
Dan Williams	417b8d4	2009-10-16 16:25:22 +1100	[diff] [blame]	403
NeilBrown	1b956f7	2015-05-21 12:40:26 +1000	[diff] [blame]	404	#define STRIPE_EXPAND_SYNC_FLAGS \
shli@kernel.org	dabc4ec	2014-12-15 12:57:04 +1100	[diff] [blame]	405	((1 << STRIPE_EXPAND_SOURCE) |\
				406	(1 << STRIPE_EXPAND_READY) |\
				407	(1 << STRIPE_EXPANDING) |\
				408	(1 << STRIPE_SYNC_REQUESTED))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	409	/*
Dan Williams	ecc65c9	2008-06-28 08:31:57 +1000	[diff] [blame]	410	* Operation request flags
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	411	*/
NeilBrown	ede7ee8	2011-12-23 10:17:52 +1100	[diff] [blame]	412	enum {
				413	STRIPE_OP_BIOFILL,
				414	STRIPE_OP_COMPUTE_BLK,
				415	STRIPE_OP_PREXOR,
				416	STRIPE_OP_BIODRAIN,
				417	STRIPE_OP_RECONSTRUCT,
				418	STRIPE_OP_CHECK,
Artur Paszkiewicz	3418d03	2017-03-09 09:59:59 +0100	[diff] [blame]	419	STRIPE_OP_PARTIAL_PARITY,
NeilBrown	ede7ee8	2011-12-23 10:17:52 +1100	[diff] [blame]	420	};
Markus Stockhausen	584acdd	2014-12-15 12:57:05 +1100	[diff] [blame]	421
				422	/*
				423	* RAID parity calculation preferences
				424	*/
				425	enum {
				426	PARITY_DISABLE_RMW = 0,
				427	PARITY_ENABLE_RMW,
Markus Stockhausen	d06f191	2014-12-15 12:57:05 +1100	[diff] [blame]	428	PARITY_PREFER_RMW,
Markus Stockhausen	584acdd	2014-12-15 12:57:05 +1100	[diff] [blame]	429	};
				430
				431	/*
				432	* Pages requested from set_syndrome_sources()
				433	*/
				434	enum {
				435	SYNDROME_SRC_ALL,
				436	SYNDROME_SRC_WANT_DRAIN,
				437	SYNDROME_SRC_WRITTEN,
				438	};
Dan Williams	91c0092	2007-01-02 13:52:30 -0700	[diff] [blame]	439	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	440	* Plugging:
				441	*
				442	* To improve write throughput, we need to delay the handling of some
				443	* stripes until there has been a chance that several write requests
				444	* for the one stripe have all been collected.
				445	* In particular, any write request that would require pre-reading
				446	* is put on a "delayed" queue until there are no stripes currently
				447	* in a pre-read phase. Further, if the "delayed" queue is empty when
				448	* a stripe is put on it then we "plug" the queue and do not process it
				449	* until an unplug call is made. (the unplug_io_fn() is called).
				450	*
				451	* When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
				452	* it to the count of prereading stripes.
				453	* When write is initiated, or the stripe refcnt == 0 (just in case) we
				454	* clear the PREREAD_ACTIVE flag and decrement the count
NeilBrown	b5c124a	2006-10-03 01:15:45 -0700	[diff] [blame]	455	* Whenever the 'handle' queue is empty and the device is not plugged, we
				456	* move any stripes from delayed to handle and clear the DELAYED flag and set
				457	* PREREAD_ACTIVE.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	458	* In stripe_handle, if we find pre-reading is necessary, we do it if
				459	* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
NeilBrown	c4c1663	2011-07-26 11:34:20 +1000	[diff] [blame]	460	* HANDLE gets cleared if stripe_handle leaves nothing locked.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	461	*/
Christoph Hellwig	ef740c3	2009-03-31 14:27:03 +1100	[diff] [blame]	462
NeilBrown	f2785b5	2018-02-03 09:19:30 +1100	[diff] [blame]	463	/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk.
				464	* There are three safe ways to access disk_info.rdev.
				465	* 1/ when holding mddev->reconfig_mutex
				466	* 2/ when resync/recovery/reshape is known to be happening - i.e. in code that
				467	* is called as part of performing resync/recovery/reshape.
				468	* 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
				469	* and if it is non-NULL, increment rdev->nr_pending before dropping the RCU
				470	* lock.
				471	* When .rdev is set to NULL, the nr_pending count is checked again and if
				472	* it has been incremented, the pointer is put back in .rdev.
				473	*/
				474
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	struct disk_info {
NeilBrown	671488c	2011-12-23 10:17:52 +1100	[diff] [blame]	476	struct md_rdev *rdev, *replacement;
Song Liu	d7bd398	2016-11-23 22:50:39 -0800	[diff] [blame]	477	struct page *extra_page; /* extra page to use in prexor */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	478	};
				479
Song Liu	937621c	2016-11-17 15:24:37 -0800	[diff] [blame]	480	/*
				481	* Stripe cache
				482	*/
				483
				484	#define NR_STRIPES 256
Yufen Yu	e236858	2020-07-18 05:29:08 -0400	[diff] [blame]	485
				486	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
Song Liu	937621c	2016-11-17 15:24:37 -0800	[diff] [blame]	487	#define STRIPE_SIZE PAGE_SIZE
				488	#define STRIPE_SHIFT (PAGE_SHIFT - 9)
				489	#define STRIPE_SECTORS (STRIPE_SIZE>>9)
Yufen Yu	e236858	2020-07-18 05:29:08 -0400	[diff] [blame]	490	#endif
				491
Song Liu	937621c	2016-11-17 15:24:37 -0800	[diff] [blame]	492	#define IO_THRESHOLD 1
				493	#define BYPASS_THRESHOLD 1
				494	#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
				495	#define HASH_MASK (NR_HASH - 1)
				496	#define MAX_STRIPE_BATCH 8
				497
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	498	/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
				499	* This is because we sometimes take all the spinlocks
				500	* and creating that much locking depth can cause
				501	* problems.
				502	*/
				503	#define NR_STRIPE_HASH_LOCKS 8
				504	#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
				505
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	506	struct r5worker {
				507	struct work_struct work;
				508	struct r5worker_group *group;
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	509	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
Shaohua Li	bfc90cb	2013-08-29 15:40:32 +0800	[diff] [blame]	510	bool working;
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	511	};
				512
				513	struct r5worker_group {
				514	struct list_head handle_list;
Shaohua Li	535ae4e	2017-02-15 19:37:32 -0800	[diff] [blame]	515	struct list_head loprio_list;
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	516	struct r5conf *conf;
				517	struct r5worker *workers;
Shaohua Li	bfc90cb	2013-08-29 15:40:32 +0800	[diff] [blame]	518	int stripes_cnt;
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	519	};
				520
Heinz Mauelshagen	78e470c	2017-03-22 17:44:37 +0100	[diff] [blame]	521	/*
				522	* r5c journal modes of the array: write-back or write-through.
				523	* write-through mode has identical behavior as existing log only
				524	* implementation.
				525	*/
				526	enum r5c_journal_mode {
				527	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
				528	R5C_JOURNAL_MODE_WRITE_BACK = 1,
				529	};
				530
Song Liu	a39f7af	2016-11-17 15:24:40 -0800	[diff] [blame]	531	enum r5_cache_state {
				532	R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
				533	* waiting for 25% to be free
				534	*/
				535	R5_ALLOC_MORE, /* It might help to allocate another
				536	* stripe.
				537	*/
				538	R5_DID_ALLOC, /* A stripe was allocated, don't allocate
				539	* more until at least one has been
				540	* released. This avoids flooding
				541	* the cache.
				542	*/
				543	R5C_LOG_TIGHT, /* log device space tight, need to
				544	* prioritize stripes at last_checkpoint
				545	*/
				546	R5C_LOG_CRITICAL, /* log device is running out of space,
				547	* only process stripes that are already
				548	* occupying the log
				549	*/
Song Liu	d7bd398	2016-11-23 22:50:39 -0800	[diff] [blame]	550	R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page
				551	* for prexor
				552	*/
Song Liu	a39f7af	2016-11-17 15:24:40 -0800	[diff] [blame]	553	};
				554
Shaohua Li	aaf9f12	2017-03-03 22:06:12 -0800	[diff] [blame]	555	#define PENDING_IO_MAX 512
				556	#define PENDING_IO_ONE_FLUSH 128
				557	struct r5pending_data {
				558	struct list_head sibling;
				559	sector_t sector; /* stripe sector */
				560	struct bio_list bios;
				561	};
				562
NeilBrown	d1688a6	2011-10-11 16:49:52 +1100	[diff] [blame]	563	struct r5conf {
NeilBrown	fccddba	2006-01-06 00:20:33 -0800	[diff] [blame]	564	struct hlist_head *stripe_hashtbl;
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	565	/* only protect corresponding hash list and inactive_list */
				566	spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
NeilBrown	fd01b88	2011-10-11 16:47:53 +1100	[diff] [blame]	567	struct mddev *mddev;
Andre Noll	09c9e5f	2009-06-18 08:45:55 +1000	[diff] [blame]	568	int chunk_sectors;
Markus Stockhausen	584acdd	2014-12-15 12:57:05 +1100	[diff] [blame]	569	int level, algorithm, rmw_level;
NeilBrown	16a53ec	2006-06-26 00:27:38 -0700	[diff] [blame]	570	int max_degraded;
NeilBrown	02c2de8	2006-10-03 01:15:47 -0700	[diff] [blame]	571	int raid_disks;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	572	int max_nr_stripes;
NeilBrown	edbe83a	2015-02-26 12:47:56 +1100	[diff] [blame]	573	int min_nr_stripes;
Yufen Yu	e236858	2020-07-18 05:29:08 -0400	[diff] [blame]	574	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
				575	unsigned long stripe_size;
				576	unsigned int stripe_shift;
				577	unsigned long stripe_sectors;
				578	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	579
NeilBrown	fef9c61	2009-03-31 15:16:46 +1100	[diff] [blame]	580	/* reshape_progress is the leading edge of a 'reshape'
				581	* It has value MaxSector when no reshape is happening
				582	* If delta_disks < 0, it is the last sector we started work on,
				583	* else it is the next sector to work on.
				584	*/
				585	sector_t reshape_progress;
				586	/* reshape_safe is the trailing edge of a reshape. We know that
				587	* before (or after) this address, all reshape has completed.
				588	*/
				589	sector_t reshape_safe;
NeilBrown	7ecaa1e	2006-03-27 01:18:08 -0800	[diff] [blame]	590	int previous_raid_disks;
Andre Noll	09c9e5f	2009-06-18 08:45:55 +1000	[diff] [blame]	591	int prev_chunk_sectors;
				592	int prev_algo;
NeilBrown	86b42c7	2009-03-31 15:19:03 +1100	[diff] [blame]	593	short generation; /* increments with every reshape */
Ahmed S. Darwish	0a87b25	2020-07-20 17:55:25 +0200	[diff] [blame]	594	seqcount_spinlock_t gen_lock; /* lock against generation changes */
NeilBrown	c8f517c	2009-03-31 15:28:40 +1100	[diff] [blame]	595	unsigned long reshape_checkpoint; /* Time we last updated
				596	* metadata */
NeilBrown	b5254dd	2012-05-21 09:27:01 +1000	[diff] [blame]	597	long long min_offset_diff; /* minimum difference between
				598	* data_offset and
				599	* new_data_offset across all
				600	* devices. May be negative,
				601	* but is closest to zero.
				602	*/
NeilBrown	7ecaa1e	2006-03-27 01:18:08 -0800	[diff] [blame]	603
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	604	struct list_head handle_list; /* stripes needing handling */
Shaohua Li	535ae4e	2017-02-15 19:37:32 -0800	[diff] [blame]	605	struct list_head loprio_list; /* low priority stripes */
Dan Williams	8b3e6cd	2008-04-28 02:15:53 -0700	[diff] [blame]	606	struct list_head hold_list; /* preread ready stripes */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	607	struct list_head delayed_list; /* stripes that have plugged requests */
NeilBrown	7262668	2005-09-09 16:23:54 -0700	[diff] [blame]	608	struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
Raz Ben-Jehuda(caro)	46031f9	2006-12-10 02:20:47 -0800	[diff] [blame]	609	struct bio *retry_read_aligned; /* currently retrying aligned bios */
NeilBrown	0472a42	2017-03-15 14:05:13 +1100	[diff] [blame]	610	unsigned int retry_read_offset; /* sector offset into retry_read_aligned */
Raz Ben-Jehuda(caro)	46031f9	2006-12-10 02:20:47 -0800	[diff] [blame]	611	struct bio *retry_read_aligned_list; /* aligned bios retry list */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	612	atomic_t preread_active_stripes; /* stripes with scheduled io */
Raz Ben-Jehuda(caro)	46031f9	2006-12-10 02:20:47 -0800	[diff] [blame]	613	atomic_t active_aligned_reads;
Dan Williams	8b3e6cd	2008-04-28 02:15:53 -0700	[diff] [blame]	614	atomic_t pending_full_writes; /* full write backlog */
				615	int bypass_count; /* bypassed prereads */
				616	int bypass_threshold; /* preread nice */
Shaohua Li	d592a99	2014-05-21 17:57:44 +0800	[diff] [blame]	617	int skip_copy; /* Don't copy data from bio to stripe cache */
Dan Williams	8b3e6cd	2008-04-28 02:15:53 -0700	[diff] [blame]	618	struct list_head *last_hold; /* detect hold_list promotions */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	619
NeilBrown	f670557	2006-03-27 01:18:11 -0800	[diff] [blame]	620	atomic_t reshape_stripes; /* stripes with pending writes for reshape */
NeilBrown	ad01c9e	2006-03-27 01:18:07 -0800	[diff] [blame]	621	/* unfortunately we need two cache names as we temporarily have
				622	* two caches.
				623	*/
				624	int active_name;
NeilBrown	f4be6b4	2010-06-01 19:37:25 +1000	[diff] [blame]	625	char cache_name[2][32];
NeilBrown	2d5b569	2015-07-06 12:49:23 +1000	[diff] [blame]	626	struct kmem_cache *slab_cache; /* for allocating stripes */
				627	struct mutex cache_size_mutex; /* Protect changes to cache size */
NeilBrown	7262668	2005-09-09 16:23:54 -0700	[diff] [blame]	628
				629	int seq_flush, seq_write;
				630	int quiesce;
				631
				632	int fullsync; /* set to 1 if a full sync is needed,
				633	* (fresh device added).
				634	* Cleared when a sync completes.
				635	*/
NeilBrown	7f0da59	2011-07-28 11:39:22 +1000	[diff] [blame]	636	int recovery_disabled;
Dan Williams	36d1c64	2009-07-14 11:48:22 -0700	[diff] [blame]	637	/* per cpu variables */
				638	struct raid5_percpu {
				639	struct page *spare_page; /* Used when checking P/Q in raid6 */
Kent Overstreet	b330e6a	2019-03-11 23:31:06 -0700	[diff] [blame]	640	void *scribble; /* space for constructing buffer
				641	* lists and performing address
				642	* conversions
				643	*/
Davidlohr Bueso	770b1d2	2021-11-15 17:23:17 -0800	[diff] [blame]	644	int scribble_obj_size;
				645	local_lock_t lock;
Tejun Heo	a29d8b8	2010-02-02 14:39:15 +0900	[diff] [blame]	646	} __percpu *percpu;
Shaohua Li	27a353c	2016-02-24 17:38:28 -0800	[diff] [blame]	647	int scribble_disks;
				648	int scribble_sectors;
Sebastian Andrzej Siewior	29c6d1b	2016-08-18 14:57:24 +0200	[diff] [blame]	649	struct hlist_node node;
NeilBrown	ca65b73	2006-01-06 00:20:17 -0800	[diff] [blame]	650
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	651	/*
				652	* Free stripes pool
				653	*/
				654	atomic_t active_stripes;
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	655	struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
Song Liu	1e6d690	2016-11-17 15:24:39 -0800	[diff] [blame]	656
				657	atomic_t r5c_cached_full_stripes;
				658	struct list_head r5c_full_stripe_list;
				659	atomic_t r5c_cached_partial_stripes;
				660	struct list_head r5c_partial_stripe_list;
Shaohua Li	e33fbb9	2017-02-10 16:18:09 -0800	[diff] [blame]	661	atomic_t r5c_flushing_full_stripes;
				662	atomic_t r5c_flushing_partial_stripes;
Song Liu	1e6d690	2016-11-17 15:24:39 -0800	[diff] [blame]	663
Shaohua Li	4bda556	2013-11-14 15:16:17 +1100	[diff] [blame]	664	atomic_t empty_inactive_list_nr;
Shaohua Li	773ca82	2013-08-27 17:50:39 +0800	[diff] [blame]	665	struct llist_head released_stripes;
Yuanhan Liu	b1b4648	2015-05-08 18:19:06 +1000	[diff] [blame]	666	wait_queue_head_t wait_for_quiescent;
Shaohua Li	6ab2a4b	2016-02-25 16:24:42 -0800	[diff] [blame]	667	wait_queue_head_t wait_for_stripe;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	668	wait_queue_head_t wait_for_overlap;
NeilBrown	5423399	2015-02-26 12:21:04 +1100	[diff] [blame]	669	unsigned long cache_state;
NeilBrown	edbe83a	2015-02-26 12:47:56 +1100	[diff] [blame]	670	struct shrinker shrinker;
NeilBrown	ad01c9e	2006-03-27 01:18:07 -0800	[diff] [blame]	671	int pool_size; /* number of disks in stripeheads in pool */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	672	spinlock_t device_lock;
NeilBrown	b55e6bf	2006-03-27 01:18:06 -0800	[diff] [blame]	673	struct disk_info *disks;
Kent Overstreet	afeee51	2018-05-20 18:25:52 -0400	[diff] [blame]	674	struct bio_set bio_split;
NeilBrown	91adb56	2009-03-31 14:39:39 +1100	[diff] [blame]	675
				676	/* When taking over an array from a different personality, we store
				677	* the new thread here until we fully activate the array.
				678	*/
NeilBrown	2b8bf34	2011-10-11 16:48:23 +1100	[diff] [blame]	679	struct md_thread *thread;
Shaohua Li	566c09c	2013-11-14 15:16:17 +1100	[diff] [blame]	680	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
Shaohua Li	851c30c	2013-08-28 14:30:16 +0800	[diff] [blame]	681	struct r5worker_group *worker_groups;
				682	int group_cnt;
				683	int worker_cnt_per_group;
Shaohua Li	f6bed0e	2015-08-13 14:31:59 -0700	[diff] [blame]	684	struct r5l_log *log;
Artur Paszkiewicz	3418d03	2017-03-09 09:59:59 +0100	[diff] [blame]	685	void *log_private;
Shaohua Li	765d704	2017-01-04 09:33:23 -0800	[diff] [blame]	686
Shaohua Li	765d704	2017-01-04 09:33:23 -0800	[diff] [blame]	687	spinlock_t pending_bios_lock;
				688	bool batch_bio_dispatch;
Shaohua Li	aaf9f12	2017-03-03 22:06:12 -0800	[diff] [blame]	689	struct r5pending_data *pending_data;
				690	struct list_head free_list;
				691	struct list_head pending_list;
				692	int pending_data_cnt;
				693	struct r5pending_data *next_pending_data;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	694	};
				695
Yufen Yu	e236858	2020-07-18 05:29:08 -0400	[diff] [blame]	696	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
				697	#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE
				698	#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT
				699	#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS
				700	#else
				701	#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size)
				702	#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift)
				703	#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors)
				704	#endif
Yufen Yu	c911c46	2020-07-18 05:29:07 -0400	[diff] [blame]	705
				706	/* bio's attached to a stripe+device for I/O are linked together in bi_sector
				707	* order without overlap. There may be several bio's per stripe+device, and
				708	* a bio could span several devices.
				709	* When walking this list for a particular stripe+device, we must never proceed
				710	* beyond a bio that extends past this device, as the next bio might no longer
				711	* be valid.
				712	* This function is used to determine the 'next' bio in the list, given the
				713	* sector of the current stripe+device
				714	*/
				715	static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
				716	{
				717	if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
				718	return bio->bi_next;
				719	else
				720	return NULL;
				721	}
NeilBrown	5423399	2015-02-26 12:21:04 +1100	[diff] [blame]	722
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	723	/*
				724	* Our supported algorithms
				725	*/
NeilBrown	99c0fb5	2009-03-31 14:39:38 +1100	[diff] [blame]	726	#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
				727	#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
				728	#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
				729	#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	730
NeilBrown	99c0fb5	2009-03-31 14:39:38 +1100	[diff] [blame]	731	/* Define non-rotating (raid4) algorithms. These allow
				732	* conversion of raid4 to raid5.
				733	*/
				734	#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
				735	#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
				736
				737	/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
				738	* Firstly, the exact positioning of the parity block is slightly
				739	* different between the 'LEFT_' modes of md and the "_N_" modes
				740	* of DDF.
				741	* Secondly, the order of datablocks over which the Q syndrome is computed
				742	* is different.
				743	* Consequently we have different layouts for DDF/raid6 than md/raid6.
				744	* These layouts are from the DDFv1.2 spec.
				745	* Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
				746	* leaves RLQ=3 as 'Vendor Specific'
				747	*/
				748
				749	#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
				750	#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
				751	#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
				752
NeilBrown	99c0fb5	2009-03-31 14:39:38 +1100	[diff] [blame]	753	/* For every RAID5 algorithm we define a RAID6 algorithm
				754	* with exactly the same layout for data and parity, and
				755	* with the Q block always on the last device (N-1).
				756	* This allows trivial conversion from RAID5 to RAID6
				757	*/
				758	#define ALGORITHM_LEFT_ASYMMETRIC_6 16
				759	#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
				760	#define ALGORITHM_LEFT_SYMMETRIC_6 18
				761	#define ALGORITHM_RIGHT_SYMMETRIC_6 19
				762	#define ALGORITHM_PARITY_0_6 20
				763	#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
				764
				765	static inline int algorithm_valid_raid5(int layout)
				766	{
				767	return (layout >= 0) &&
				768	(layout <= 5);
				769	}
				770	static inline int algorithm_valid_raid6(int layout)
				771	{
				772	return (layout >= 0 && layout <= 5)
				773	||
NeilBrown	e4424fe	2009-10-16 16:27:34 +1100	[diff] [blame]	774	(layout >= 8 && layout <= 10)
NeilBrown	99c0fb5	2009-03-31 14:39:38 +1100	[diff] [blame]	775	||
				776	(layout >= 16 && layout <= 20);
				777	}
				778
				779	static inline int algorithm_is_DDF(int layout)
				780	{
				781	return layout >= 8 && layout <= 10;
				782	}
NeilBrown	11d8a6e	2010-07-26 11:57:07 +1000	[diff] [blame]	783
Yufen Yu	046169f	2020-08-20 09:22:12 -0400	[diff] [blame]	784	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
				785	/*
				786	* Return offset of the corresponding page for r5dev.
				787	*/
				788	static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
				789	{
				790	return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
				791	}
				792
				793	/*
				794	* Return corresponding page address for r5dev.
				795	*/
				796	static inline struct page *
				797	raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
				798	{
				799	return sh->pages[disk_idx / sh->stripes_per_page];
				800	}
				801	#endif
				802
NeilBrown	d1688a6	2011-10-11 16:49:52 +1100	[diff] [blame]	803	extern void md_raid5_kick_device(struct r5conf *conf);
NeilBrown	fd01b88	2011-10-11 16:47:53 +1100	[diff] [blame]	804	extern int raid5_set_cache_size(struct mddev *mddev, int size);
Shaohua Li	6d036f7	2015-08-13 14:31:57 -0700	[diff] [blame]	805	extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
				806	extern void raid5_release_stripe(struct stripe_head *sh);
				807	extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
				808	int previous, int *dd_idx,
				809	struct stripe_head *sh);
				810	extern struct stripe_head *
				811	raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
				812	int previous, int noblock, int noquiesce);
Song Liu	2e38a37	2017-01-24 10:45:30 -0800	[diff] [blame]	813	extern int raid5_calc_degraded(struct r5conf *conf);
Heinz Mauelshagen	78e470c	2017-03-22 17:44:37 +0100	[diff] [blame]	814	extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	815	#endif