// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#ifndef __XFS_LOG_PRIV_H__
#define __XFS_LOG_PRIV_H__

struct xfs_buf;
struct xlog;
struct xlog_ticket;
struct xfs_mount;

/*
 * Flags for log structure
 */
#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
#define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
					   shutdown */
#define XLOG_TAIL_WARN		0x10	/* log tail verify warning issued */

/*
 * get client id from packed copy.
 *
 * this hack is here because the xlog_pack code copies four bytes
 * of xlog_op_header containing the fields oh_clientid, oh_flags
 * and oh_res2 into the packed copy.
 *
 * later on this four byte chunk is treated as an int and the
 * client id is pulled out.
 *
 * this has endian issues, of course.
 */
static inline uint xlog_get_client_id(__be32 i)
{
	return be32_to_cpu(i) >> 24;
}
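
/*
 * Illustrative example (not used by the code): the client id lives in the
 * most significant byte of the big-endian word, so for a client id of
 * XFS_TRANSACTION (0x69) the packed chunk reads as 0x69000000 and
 *
 *	xlog_get_client_id(cpu_to_be32(0x69000000)) == 0x69
 *
 * on any host, because be32_to_cpu() normalises the byte order before
 * the shift.
 */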

/*
 * In core log state
 */
enum xlog_iclog_state {
	XLOG_STATE_ACTIVE,	/* Current IC log being written to */
	XLOG_STATE_WANT_SYNC,	/* Want to sync this iclog; no more writes */
	XLOG_STATE_SYNCING,	/* This IC log is syncing */
	XLOG_STATE_DONE_SYNC,	/* Done syncing to disk */
	XLOG_STATE_CALLBACK,	/* Callback functions now */
	XLOG_STATE_DIRTY,	/* Dirty IC log, not ready for ACTIVE status */
	XLOG_STATE_IOERROR,	/* IO error happened in sync'ing log */
};

/*
 * Log ticket flags
 */
#define XLOG_TIC_PERM_RESERV	0x1	/* permanent reservation */

#define XLOG_TIC_FLAGS \
	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }

/*
 * Below are states for covering allocation transactions.
 * By covering, we mean changing the h_tail_lsn in the last on-disk
 * log write such that no allocation transactions will be re-done during
 * recovery after a system crash. Recovery starts at the last on-disk
 * log write.
 *
 * These states are used to insert dummy log entries to cover
 * space allocation transactions which can undo non-transactional changes
 * after a crash. Writes to a file with space
 * already allocated do not result in any transactions. Allocations
 * might include space beyond the EOF. So if we just push the EOF a
 * little, the last transaction for the file could contain the wrong
 * size. If there is no file system activity, after an allocation
 * transaction, and the system crashes, the allocation transaction
 * will get replayed and the file will be truncated. This could
 * be hours/days/... after the allocation occurred.
 *
 * The fix for this is to do two dummy transactions when the
 * system is idle. We need two dummy transactions because the h_tail_lsn
 * in the log record header needs to point beyond the last possible
 * non-dummy transaction. The first dummy changes the h_tail_lsn to
 * the first transaction before the dummy. The second dummy causes
 * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
 *
 * These dummy transactions get committed when everything
 * is idle (after there has been some activity).
 *
 * There are 5 states used to control this.
 *
 * IDLE -- no logging has been done on the file system or
 *		we are done covering previous transactions.
 * NEED -- logging has occurred and we need a dummy transaction
 *		when the log becomes idle.
 * DONE -- we were in the NEED state and have committed a dummy
 *		transaction.
 * NEED2 -- we detected that a dummy transaction has gone to the
 *		on disk log with no other transactions.
 * DONE2 -- we committed a dummy transaction when in the NEED2 state.
 *
 * There are two places where we switch states:
 *
 * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
 *	We commit the dummy transaction and switch to DONE or DONE2,
 *	respectively. In all other states, we don't do anything.
 *
 * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
 *
 *	No matter what state we are in, if this isn't the dummy
 *	transaction going out, the next state is NEED.
 *	So, if we aren't in the DONE or DONE2 states, the next state
 *	is NEED. We can't be finishing a write of the dummy record
 *	unless it was committed and the state switched to DONE or DONE2.
 *
 *	If we are in the DONE state and this was a write of the
 *	dummy transaction, we move to NEED2.
 *
 *	If we are in the DONE2 state and this was a write of the
 *	dummy transaction, we move to IDLE.
 *
 *
 * A single dummy transaction is not enough: it can end up appended to
 * the same on-disk log write as a file space allocation, in which case
 * the log recovery code still replays the space allocation and a file
 * could be truncated. This is why we have the NEED2 and DONE2 states
 * before going idle.
 */

#define XLOG_STATE_COVER_IDLE	0
#define XLOG_STATE_COVER_NEED	1
#define XLOG_STATE_COVER_DONE	2
#define XLOG_STATE_COVER_NEED2	3
#define XLOG_STATE_COVER_DONE2	4

#define XLOG_COVER_OPS		5
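
/*
 * Example walk of the covering state machine (illustrative, derived from
 * the comment above): after some allocation activity the log sits in
 * NEED, and if it stays idle the states advance
 *
 *	NEED  -> DONE	first dummy transaction committed
 *	DONE  -> NEED2	that dummy reaches the on-disk log on its own
 *	NEED2 -> DONE2	second dummy transaction committed
 *	DONE2 -> IDLE	second dummy on disk; h_tail_lsn points at a dummy
 *
 * Any non-dummy transaction written in between drops the state back to
 * NEED and the sequence starts over.
 */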

/* Ticket reservation region accounting */
#define XLOG_TIC_LEN_MAX	15

/*
 * Reservation region
 * As would be stored in xfs_log_iovec but without the i_addr which
 * we don't care about.
 */
typedef struct xlog_res {
	uint	r_len;	/* region length		:4 */
	uint	r_type;	/* region's transaction type	:4 */
} xlog_res_t;

typedef struct xlog_ticket {
	struct list_head	t_queue;	/* reserve/write queue */
	struct task_struct	*t_task;	/* task that owns this ticket */
	xlog_tid_t		t_tid;		/* transaction identifier	: 4 */
	atomic_t		t_ref;		/* ticket reference count	: 4 */
	int			t_curr_res;	/* current reservation in bytes : 4 */
	int			t_unit_res;	/* unit reservation in bytes	: 4 */
	char			t_ocnt;		/* original count		: 1 */
	char			t_cnt;		/* current count		: 1 */
	char			t_clientid;	/* who does this belong to	: 1 */
	char			t_flags;	/* properties of reservation	: 1 */

	/* reservation array fields */
	uint			t_res_num;		/* num in array : 4 */
	uint			t_res_num_ophdrs;	/* num op hdrs	: 4 */
	uint			t_res_arr_sum;		/* array sum	: 4 */
	uint			t_res_o_flow;		/* sum overflow : 4 */
	xlog_res_t		t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
} xlog_ticket_t;

/*
 * - A log record header is 512 bytes. There is plenty of room to grow the
 *   xlog_rec_header_t into the reserved space.
 * - ic_data follows, so a write to disk can start at the beginning of
 *   the iclog.
 * - ic_force_wait is used to implement synchronous forcing of the iclog to
 *   disk.
 * - ic_next is the pointer to the next iclog in the ring.
 * - ic_log is a pointer back to the global log structure.
 * - ic_size is the full size of the log buffer, minus the cycle headers.
 * - ic_offset is the current number of bytes written to in this iclog.
 * - ic_refcnt is bumped when someone is writing to the log.
 * - ic_state is the state of the iclog.
 *
 * Because of cacheline contention on large machines, we need to separate
 * various resources onto different cachelines. To start with, make the
 * structure cacheline aligned. The following fields can be contended on
 * by independent processes:
 *
 *	- ic_callbacks
 *	- ic_refcnt
 *	- fields protected by the global l_icloglock
 *
 * so we need to ensure that these fields are located in separate cachelines.
 * We'll put all the read-only and l_icloglock fields in the first cacheline,
 * and move everything else out to subsequent cachelines.
 */
typedef struct xlog_in_core {
	wait_queue_head_t	ic_force_wait;
	wait_queue_head_t	ic_write_wait;
	struct xlog_in_core	*ic_next;
	struct xlog_in_core	*ic_prev;
	struct xlog		*ic_log;
	u32			ic_size;
	u32			ic_offset;
	enum xlog_iclog_state	ic_state;
	char			*ic_datap;	/* pointer to iclog data */

	/* Callback structures need their own cacheline */
	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
	struct list_head	ic_callbacks;

	/* reference counts need their own cacheline */
	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
	xlog_in_core_2_t	*ic_data;
#define ic_header	ic_data->hic_header
#ifdef DEBUG
	bool			ic_fail_crc : 1;
#endif
	struct semaphore	ic_sema;
	struct work_struct	ic_end_io_work;
	struct bio		ic_bio;
	struct bio_vec		ic_bvec[];
} xlog_in_core_t;

/*
 * The CIL context is used to aggregate per-transaction details as well as be
 * passed to the iclog for checkpoint post-commit processing.  After being
 * passed to the iclog, another context needs to be allocated for tracking the
 * next set of transactions to be aggregated into a checkpoint.
 */
struct xfs_cil;

struct xfs_cil_ctx {
	struct xfs_cil		*cil;
	xfs_lsn_t		sequence;	/* chkpt sequence # */
	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
	struct xlog_ticket	*ticket;	/* chkpt ticket */
	int			nvecs;		/* number of regions */
	int			space_used;	/* aggregate size of regions */
	struct list_head	busy_extents;	/* busy extents in chkpt */
	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
	struct list_head	iclog_entry;
	struct list_head	committing;	/* ctx committing list */
	struct work_struct	discard_endio_work;
};

/*
 * Committed Item List structure
 *
 * This structure is used to track log items that have been committed but not
 * yet written into the log. It is used only when the delayed logging mount
 * option is enabled.
 *
 * This structure tracks the list of committing checkpoint contexts so
 * we can avoid the problem of having to hold out new transactions during a
 * flush until we have the commit record LSN of the checkpoint. We can
 * traverse the list of committing contexts in xlog_cil_force_lsn() to find a
 * sequence match and extract the commit LSN directly from there. If the
 * checkpoint is still in the process of committing, we can block waiting for
 * the commit LSN to be determined as well. This should make synchronous
 * operations almost as efficient as the old logging methods.
 */
struct xfs_cil {
	struct xlog		*xc_log;
	struct list_head	xc_cil;
	spinlock_t		xc_cil_lock;

	struct rw_semaphore	xc_ctx_lock ____cacheline_aligned_in_smp;
	struct xfs_cil_ctx	*xc_ctx;

	spinlock_t		xc_push_lock ____cacheline_aligned_in_smp;
	xfs_lsn_t		xc_push_seq;
	struct list_head	xc_committing;
	wait_queue_head_t	xc_commit_wait;
	xfs_lsn_t		xc_current_sequence;
	struct work_struct	xc_push_work;
	wait_queue_head_t	xc_push_wait;	/* background push throttle */
} ____cacheline_aligned_in_smp;

/*
 * The amount of log space we allow the CIL to aggregate is difficult to size.
 * Whatever we choose, we have to make sure we can get a reservation for the
 * log space effectively, that it is large enough to capture sufficient
 * relogging to reduce log buffer IO significantly, but it is not too large for
 * the log or induces too much latency when writing out through the iclogs. We
 * track both space consumed and the number of vectors in the checkpoint
 * context, so we need to decide which to use for limiting.
 *
 * Every log buffer we write out during a push needs a header reserved, which
 * is at least one sector and more for v2 logs. Hence we need a reservation of
 * at least 512 bytes per 32k of log space just for the LR headers. That means
 * 16KB of reservation per megabyte of delayed logging space we will consume,
 * plus various headers. The number of headers will vary based on the num of
 * io vectors, so limiting on a specific number of vectors is going to result
 * in transactions of varying size. IOWs, it is more consistent to track and
 * limit space consumed in the log rather than by the number of objects being
 * logged in order to prevent checkpoint ticket overruns.
 *
 * Further, use of static reservations through the log grant mechanism is
 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
 * grant) and a significant deadlock potential because regranting write space
 * can block on log pushes. Hence if we have to regrant log space during a log
 * push, we can deadlock.
 *
 * However, we can avoid this by use of a dynamic "reservation stealing"
 * technique during transaction commit whereby unused reservation space in the
 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
 * space needed by the checkpoint transaction. This means that we never need to
 * specifically reserve space for the CIL checkpoint transaction, nor do we
 * need to regrant space once the checkpoint completes. This also means the
 * checkpoint transaction ticket is specific to the checkpoint context, rather
 * than the CIL itself.
 *
 * With dynamic reservations, we can effectively make up arbitrary limits for
 * the checkpoint size so long as they don't violate any other size rules.
 * Recovery imposes a rule that no transaction exceed half the log, so we are
 * limited by that. Furthermore, the log transaction reservation subsystem
 * tries to keep 25% of the log free, so we need to keep below that limit or we
 * risk running out of free log space to start any new transactions.
 *
 * In order to keep background CIL push efficient, we only need to ensure the
 * CIL is large enough to maintain sufficient in-memory relogging to avoid
 * repeated physical writes of frequently modified metadata. If we allow the CIL
 * to grow to a substantial fraction of the log, then we may be pinning hundreds
 * of megabytes of metadata in memory until the CIL flushes. This can cause
 * issues when we are running low on memory - pinned memory cannot be reclaimed,
 * and the CIL consumes a lot of memory. Hence we need to set an upper physical
 * size limit for the CIL that limits the maximum amount of memory pinned by the
 * CIL but does not limit performance by reducing relogging efficiency
 * significantly.
 *
 * As such, the CIL push threshold ends up being the smaller of two thresholds:
 * - a threshold large enough that it allows CIL to be pushed and progress to be
 *   made without excessive blocking of incoming transaction commits. This is
 *   defined to be 12.5% of the log space - half the 25% push threshold of the
 *   AIL.
 * - small enough that it doesn't pin excessive amounts of memory but maintains
 *   close to peak relogging efficiency. This is defined to be 16x the iclog
 *   buffer window (32MB) as measurements have shown this to be roughly the
 *   point of diminishing performance increases under highly concurrent
 *   modification workloads.
 *
 * To prevent the CIL from overflowing upper commit size bounds, we introduce a
 * new threshold at which we block committing transactions until the background
 * CIL commit commences and switches to a new context. While this is not a hard
 * limit, it forces the process committing a transaction to the CIL to block and
 * yield the CPU, giving the CIL push work a chance to be scheduled and start
 * work. This prevents a process running lots of transactions from overfilling
 * the CIL because it is not yielding the CPU. We set the blocking limit at
 * twice the background push space threshold so we keep in line with the AIL
 * push thresholds.
 *
 * Note: this is not a -hard- limit as blocking is applied after the transaction
 * is inserted into the CIL and the push has been triggered. It is largely a
 * throttling mechanism that allows the CIL push to be scheduled and run. A hard
 * limit will be difficult to implement without introducing global serialisation
 * in the CIL commit fast path, and it's not at all clear that we actually need
 * such hard limits given the ~7 years we've run without a hard limit before
 * finding the first situation where a checkpoint size overflow actually
 * occurred. Hence the simple throttle, and an ASSERT check to tell us that
 * we've overrun the max size.
 */
#define XLOG_CIL_SPACE_LIMIT(log)	\
	min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)

#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)	\
	(XLOG_CIL_SPACE_LIMIT(log) * 2)
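
/*
 * Worked example (illustrative, assuming the 32MB "16x the iclog buffer
 * window" figure described above): for a 1GB log,
 *
 *	XLOG_CIL_SPACE_LIMIT		= min(1GB >> 3, 32MB) = 32MB
 *	XLOG_CIL_BLOCKING_SPACE_LIMIT	= 64MB
 *
 * while a small 64MB log is bounded by its own size instead:
 *
 *	XLOG_CIL_SPACE_LIMIT		= min(64MB >> 3, 32MB) = 8MB
 *	XLOG_CIL_BLOCKING_SPACE_LIMIT	= 16MB
 */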

/*
 * ticket grant locks, queues and accounting have their own cachelines
 * as these are quite hot and can be operated on concurrently.
 */
struct xlog_grant_head {
	spinlock_t		lock ____cacheline_aligned_in_smp;
	struct list_head	waiters;
	atomic64_t		grant;
};

/*
 * The reservation head lsn is not made up of a cycle number and block number.
 * Instead, it uses a cycle number and byte number. Logs don't expect to
 * overflow 31 bits worth of byte offset, so using a byte number will mean
 * that round off problems won't occur when releasing partial reservations.
 */
struct xlog {
	/* The following fields don't need locking */
	struct xfs_mount	*l_mp;		/* mount point */
	struct xfs_ail		*l_ailp;	/* AIL log is working with */
	struct xfs_cil		*l_cilp;	/* CIL log is working with */
	struct xfs_buftarg	*l_targ;	/* buftarg of log */
	struct workqueue_struct	*l_ioend_workqueue; /* for I/O completions */
	struct delayed_work	l_work;		/* background flush work */
	uint			l_flags;
	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
	struct list_head	*l_buf_cancel_table;
	int			l_iclog_hsize;	/* size of iclog header */
	int			l_iclog_heads;	/* # of iclog header sectors */
	uint			l_sectBBsize;	/* sector size in BBs (2^n) */
	int			l_iclog_size;	/* size of log in bytes */
	int			l_iclog_bufs;	/* number of iclog buffers */
	xfs_daddr_t		l_logBBstart;	/* start block of log */
	int			l_logsize;	/* size of log in bytes */
	int			l_logBBsize;	/* size of log in BB chunks */

	/* The following block of fields are changed while holding icloglock */
	wait_queue_head_t	l_flush_wait ____cacheline_aligned_in_smp;
						/* waiting for iclog flush */
	int			l_covered_state;/* state of "covering disk
						 * log entries" */
	xlog_in_core_t		*l_iclog;	/* head log queue */
	spinlock_t		l_icloglock;	/* grab to change iclog state */
	int			l_curr_cycle;	/* Cycle number of log writes */
	int			l_prev_cycle;	/* Cycle number before last
						 * block increment */
	int			l_curr_block;	/* current logical log block */
	int			l_prev_block;	/* previous logical log block */

	/*
	 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
	 * read without needing to hold specific locks. To avoid operations
	 * contending with other hot objects, place each of them on a separate
	 * cacheline.
	 */
	/* lsn of last LR on disk */
	atomic64_t		l_last_sync_lsn ____cacheline_aligned_in_smp;
	/* lsn of 1st LR with unflushed buffers */
	atomic64_t		l_tail_lsn ____cacheline_aligned_in_smp;

	struct xlog_grant_head	l_reserve_head;
	struct xlog_grant_head	l_write_head;

	struct xfs_kobj		l_kobj;

	/* The following fields are used for debugging; need to hold icloglock */
#ifdef DEBUG
	void			*l_iclog_bak[XLOG_MAX_ICLOGS];
#endif
	/* log recovery lsn tracking (for buffer submission) */
	xfs_lsn_t		l_recovery_lsn;
};

#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))

#define XLOG_FORCED_SHUTDOWN(log) \
	(unlikely((log)->l_flags & XLOG_IO_ERROR))

/* common routines */
extern int
xlog_recover(
	struct xlog		*log);
extern int
xlog_recover_finish(
	struct xlog		*log);
extern void
xlog_recover_cancel(struct xlog *);

extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
			    char *dp, int size);

extern kmem_zone_t *xfs_log_ticket_zone;
struct xlog_ticket *
xlog_ticket_alloc(
	struct xlog	*log,
	int		unit_bytes,
	int		count,
	char		client,
	bool		permanent);

static inline void
xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
	*ptr += bytes;
	*len -= bytes;
	*off += bytes;
}
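
/*
 * Illustrative usage sketch (hypothetical, not part of the log code): a
 * caller copying a region into an iclog steps its write cursor like so,
 * with ptr/len/off tracking the write position, remaining space and
 * bytes consumed respectively:
 *
 *	memcpy(ptr, reg->i_addr, reg->i_len);
 *	xlog_write_adv_cnt(&ptr, &len, &off, reg->i_len);
 */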

void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void	xlog_print_trans(struct xfs_trans *);
int	xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
		struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
		struct xlog_in_core **commit_iclog, uint flags,
		bool need_start_rec);
int	xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
		struct xlog_in_core **iclog, xfs_lsn_t *lsn);
void	xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void	xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);

/*
 * When we crack an atomic LSN, we sample it first so that the value will not
 * change while we are cracking it into the component values. This means we
 * will always get consistent component values to work from. This should always
 * be used to sample and crack LSNs that are stored and updated in atomic
 * variables.
 */
static inline void
xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
{
	xfs_lsn_t val = atomic64_read(lsn);

	*cycle = CYCLE_LSN(val);
	*block = BLOCK_LSN(val);
}

/*
 * Calculate and assign a value to an atomic LSN variable from component
 * pieces.
 */
static inline void
xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
{
	atomic64_set(lsn, xlog_assign_lsn(cycle, block));
}
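
/*
 * Illustrative sketch (hypothetical helper, not part of this header):
 * taking a consistent snapshot of the last-synced LSN and reassembling
 * it from its cracked components.
 */
static inline xfs_lsn_t
xlog_example_sample_last_sync_lsn(struct xlog *log)
{
	uint	cycle, block;

	xlog_crack_atomic_lsn(&log->l_last_sync_lsn, &cycle, &block);
	return xlog_assign_lsn(cycle, block);
}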

/*
 * When we crack the grant head, we sample it first so that the value will not
 * change while we are cracking it into the component values. This means we
 * will always get consistent component values to work from.
 */
static inline void
xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
{
	*cycle = val >> 32;
	*space = val & 0xffffffff;
}

static inline void
xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
{
	xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
}

static inline int64_t
xlog_assign_grant_head_val(int cycle, int space)
{
	return ((int64_t)cycle << 32) | space;
}

static inline void
xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
{
	atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
}
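
/*
 * Illustrative sketch (hypothetical helper): sampling the reserve grant
 * head of a log. The packing is simple arithmetic - for cycle 5 and
 * space 1000, xlog_assign_grant_head_val() yields (5LL << 32) | 1000.
 */
static inline void
xlog_example_sample_reserve_head(struct xlog *log, int *cycle, int *space)
{
	xlog_crack_grant_head(&log->l_reserve_head.grant, cycle, space);
}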

/*
 * Committed Item List interfaces
 */
int	xlog_cil_init(struct xlog *log);
void	xlog_cil_init_post_recovery(struct xlog *log);
void	xlog_cil_destroy(struct xlog *log);
bool	xlog_cil_empty(struct xlog *log);

/*
 * CIL force routines
 */
xfs_lsn_t
xlog_cil_force_lsn(
	struct xlog *log,
	xfs_lsn_t sequence);

static inline void
xlog_cil_force(struct xlog *log)
{
	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
}

/*
 * Wrapper function for waiting on a wait queue serialised against wakeups
 * by a spinlock. This matches the semantics of all the wait queues used in the
 * log code.
 */
static inline void
xlog_wait(
	struct wait_queue_head	*wq,
	struct spinlock		*lock)
		__releases(lock)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(wq, &wait);
	__set_current_state(TASK_UNINTERRUPTIBLE);
	spin_unlock(lock);
	schedule();
	remove_wait_queue(wq, &wait);
}
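
/*
 * Illustrative usage sketch (hypothetical helper, not part of this
 * header): callers take the wait queue's serialising lock, re-check
 * their wakeup condition under it, and only then sleep via xlog_wait(),
 * which drops the lock for them before scheduling.
 */
static inline void
xlog_example_wait_for_flush(struct xlog *log)
{
	spin_lock(&log->l_icloglock);
	/* ... re-check that we still need to wait while holding the lock ... */
	xlog_wait(&log->l_flush_wait, &log->l_icloglock);
}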

/*
 * The LSN is valid so long as it is behind the current LSN. If it isn't, this
 * means that the next log record that includes this metadata could have a
 * smaller LSN. In turn, this means that the modification in the log would not
 * replay.
 */
static inline bool
xlog_valid_lsn(
	struct xlog	*log,
	xfs_lsn_t	lsn)
{
	int		cur_cycle;
	int		cur_block;
	bool		valid = true;

	/*
	 * First, sample the current lsn without locking to avoid added
	 * contention from metadata I/O. The current cycle and block are updated
	 * (in xlog_state_switch_iclogs()) and read here in a particular order
	 * to avoid false negatives (e.g., thinking the metadata LSN is valid
	 * when it is not).
	 *
	 * The current block is always rewound before the cycle is bumped in
	 * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
	 * a transiently forward state. Instead, we can see the LSN in a
	 * transiently behind state if we happen to race with a cycle wrap.
	 */
	cur_cycle = READ_ONCE(log->l_curr_cycle);
	smp_rmb();
	cur_block = READ_ONCE(log->l_curr_block);

	if ((CYCLE_LSN(lsn) > cur_cycle) ||
	    (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
		/*
		 * If the metadata LSN appears invalid, it's possible the check
		 * above raced with a wrap to the next log cycle. Grab the lock
		 * to check for sure.
		 */
		spin_lock(&log->l_icloglock);
		cur_cycle = log->l_curr_cycle;
		cur_block = log->l_curr_block;
		spin_unlock(&log->l_icloglock);

		if ((CYCLE_LSN(lsn) > cur_cycle) ||
		    (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
			valid = false;
	}

	return valid;
}

#endif	/* __XFS_LOG_PRIV_H__ */