Blame - fs/xfs/xfs_log_priv.h - SHIFTPHONES/kernel/common

blob: b880c23cb6e4ffd78324ff26a2890c0010f67d64 [file] [log] [blame]

Dave Chinner	0b61f8a	2018-06-05 19:42:14 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2	/*
Nathan Scott	7b71876	2005-11-02 14:58:39 +1100	[diff] [blame]	3	* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
				4	* All Rights Reserved.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	5	*/
				6	#ifndef __XFS_LOG_PRIV_H__
				7	#define __XFS_LOG_PRIV_H__
				8
				9	struct xfs_buf;
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	10	struct xlog;
Nathan Scott	a844f45	2005-11-02 14:38:42 +1100	[diff] [blame]	11	struct xlog_ticket;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	12	struct xfs_mount;
				13
				14	/*
Dave Chinner	fc06c6d	2013-08-12 20:49:22 +1000	[diff] [blame]	15	* Flags for log structure
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	16	*/
#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
#define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
					   shutdown */
#define XLOG_TAIL_WARN		0x10	/* log tail verify warning issued */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	22
				23	/*
				24	* get client id from packed copy.
				25	*
				26	* this hack is here because the xlog_pack code copies four bytes
				27	* of xlog_op_header containing the fields oh_clientid, oh_flags
				28	* and oh_res2 into the packed copy.
				29	*
				30	* later on this four byte chunk is treated as an int and the
				31	* client id is pulled out.
				32	*
				33	* this has endian issues, of course.
				34	*/
Christoph Hellwig	b53e675	2007-10-12 10:59:34 +1000	[diff] [blame]	35	static inline uint xlog_get_client_id(__be32 i)
Christoph Hellwig	03bea6f	2007-10-12 10:58:05 +1000	[diff] [blame]	36	{
Christoph Hellwig	b53e675	2007-10-12 10:59:34 +1000	[diff] [blame]	37	return be32_to_cpu(i) >> 24;
Christoph Hellwig	03bea6f	2007-10-12 10:58:05 +1000	[diff] [blame]	38	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	39
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	/*
				41	* In core log state
				42	*/
				43	#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
				44	#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
				45	#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
				46	#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
				47	#define XLOG_STATE_DO_CALLBACK \
				48	0x0010 /* Process callback functions */
				49	#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
				50	#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
				51	#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
				52	#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
				53	#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	54
				55	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	56	* Flags to log ticket
				57	*/
				58	#define XLOG_TIC_INITED 0x1 /* has been initialized */
				59	#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
Christoph Hellwig	0b1b213	2009-12-14 23:14:59 +0000	[diff] [blame]	60
				61	#define XLOG_TIC_FLAGS \
				62	{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
Dave Chinner	1054794	2010-12-21 12:02:25 +1100	[diff] [blame]	63	{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
Christoph Hellwig	0b1b213	2009-12-14 23:14:59 +0000	[diff] [blame]	64
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	65	/*
				66	* Below are states for covering allocation transactions.
				67	* By covering, we mean changing the h_tail_lsn in the last on-disk
				68	* log write such that no allocation transactions will be re-done during
				69	* recovery after a system crash. Recovery starts at the last on-disk
				70	* log write.
				71	*
				72	* These states are used to insert dummy log entries to cover
				73	* space allocation transactions which can undo non-transactional changes
				74	* after a crash. Writes to a file with space
				75	* already allocated do not result in any transactions. Allocations
				76	* might include space beyond the EOF. So if we just push the EOF a
				77	* little, the last transaction for the file could contain the wrong
				78	* size. If there is no file system activity, after an allocation
				79	* transaction, and the system crashes, the allocation transaction
				80	* will get replayed and the file will be truncated. This could
				81	* be hours/days/... after the allocation occurred.
				82	*
				83	* The fix for this is to do two dummy transactions when the
				84	* system is idle. We need two dummy transaction because the h_tail_lsn
				85	* in the log record header needs to point beyond the last possible
				86	* non-dummy transaction. The first dummy changes the h_tail_lsn to
				87	* the first transaction before the dummy. The second dummy causes
				88	* h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
				89	*
				90	* These dummy transactions get committed when everything
				91	* is idle (after there has been some activity).
				92	*
				93	* There are 5 states used to control this.
				94	*
				95	* IDLE -- no logging has been done on the file system or
				96	* we are done covering previous transactions.
				97	* NEED -- logging has occurred and we need a dummy transaction
				98	* when the log becomes idle.
				99	* DONE -- we were in the NEED state and have committed a dummy
				100	* transaction.
				101	* NEED2 -- we detected that a dummy transaction has gone to the
				102	* on disk log with no other transactions.
				103	* DONE2 -- we committed a dummy transaction when in the NEED2 state.
				104	*
				105	* There are two places where we switch states:
				106	*
				107	* 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
				108	* We commit the dummy transaction and switch to DONE or DONE2,
				109	* respectively. In all other states, we don't do anything.
				110	*
				111	* 2.) When we finish writing the on-disk log (xlog_state_clean_log).
				112	*
				113	* No matter what state we are in, if this isn't the dummy
				114	* transaction going out, the next state is NEED.
				115	* So, if we aren't in the DONE or DONE2 states, the next state
				116	* is NEED. We can't be finishing a write of the dummy record
				117	* unless it was committed and the state switched to DONE or DONE2.
				118	*
				119	* If we are in the DONE state and this was a write of the
				120	* dummy transaction, we move to NEED2.
				121	*
				122	* If we are in the DONE2 state and this was a write of the
				123	* dummy transaction, we move to IDLE.
				124	*
				125	*
				126	* Writing only one dummy transaction can get appended to
				127	* one file space allocation. When this happens, the log recovery
				128	* code replays the space allocation and a file could be truncated.
				129	* This is why we have the NEED2 and DONE2 states before going idle.
				130	*/
				131
				132	#define XLOG_STATE_COVER_IDLE 0
				133	#define XLOG_STATE_COVER_NEED 1
				134	#define XLOG_STATE_COVER_DONE 2
				135	#define XLOG_STATE_COVER_NEED2 3
				136	#define XLOG_STATE_COVER_DONE2 4
				137
				138	#define XLOG_COVER_OPS 5
				139
/* Ticket reservation region accounting */
#define XLOG_TIC_LEN_MAX	15
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	142
				143	/*
				144	* Reservation region
				145	* As would be stored in xfs_log_iovec but without the i_addr which
				146	* we don't care about.
				147	*/
				148	typedef struct xlog_res {
Tim Shimmin	1259845	2006-01-11 21:02:47 +1100	[diff] [blame]	149	uint r_len; /* region length :4 */
				150	uint r_type; /* region's transaction type :4 */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	151	} xlog_res_t;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	152
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	153	typedef struct xlog_ticket {
Dave Chinner	1054794	2010-12-21 12:02:25 +1100	[diff] [blame]	154	struct list_head t_queue; /* reserve/write queue */
Christoph Hellwig	14a7235f	2012-02-20 02:31:24 +0000	[diff] [blame]	155	struct task_struct t_task; / task that owns this ticket */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	156	xlog_tid_t t_tid; /* transaction identifier : 4 */
Dave Chinner	cc09c0d	2008-11-17 17:37:10 +1100	[diff] [blame]	157	atomic_t t_ref; /* ticket reference count : 4 */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	158	int t_curr_res; /* current reservation in bytes : 4 */
				159	int t_unit_res; /* unit reservation in bytes : 4 */
				160	char t_ocnt; /* original count : 1 */
				161	char t_cnt; /* current count : 1 */
				162	char t_clientid; /* who does this belong to; : 1 */
				163	char t_flags; /* properties of reservation : 1 */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	164
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	165	/* reservation array fields */
				166	uint t_res_num; /* num in array : 4 */
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	167	uint t_res_num_ophdrs; /* num op hdrs : 4 */
				168	uint t_res_arr_sum; /* array sum : 4 */
				169	uint t_res_o_flow; /* sum overflow : 4 */
Tim Shimmin	1259845	2006-01-11 21:02:47 +1100	[diff] [blame]	170	xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	171	} xlog_ticket_t;
Tim Shimmin	7e9c639	2005-09-02 16:42:05 +1000	[diff] [blame]	172
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	173	/*
				174	* - A log record header is 512 bytes. There is plenty of room to grow the
				175	* xlog_rec_header_t into the reserved space.
				176	* - ic_data follows, so a write to disk can start at the beginning of
				177	* the iclog.
David Chinner	12017fa	2008-08-13 16:34:31 +1000	[diff] [blame]	178	* - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	179	* - ic_next is the pointer to the next iclog in the ring.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	180	* - ic_log is a pointer back to the global log structure.
Christoph Hellwig	79b54d9	2019-06-28 19:27:25 -0700	[diff] [blame]	181	* - ic_size is the full size of the log buffer, minus the cycle headers.
				182	* - ic_io_size is the size of the currently pending log buffer write, which
				183	* might be smaller than ic_size
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	184	* - ic_offset is the current number of bytes written to in this iclog.
				185	* - ic_refcnt is bumped when someone is writing to the log.
				186	* - ic_state is the state of the iclog.
David Chinner	114d23a	2008-04-10 12:18:39 +1000	[diff] [blame]	187	*
				188	* Because of cacheline contention on large machines, we need to separate
				189	* various resources onto different cachelines. To start with, make the
				190	* structure cacheline aligned. The following fields can be contended on
				191	* by independent processes:
				192	*
Christoph Hellwig	89ae379	2019-06-28 19:27:34 -0700	[diff] [blame]	193	* - ic_callbacks
David Chinner	114d23a	2008-04-10 12:18:39 +1000	[diff] [blame]	194	* - ic_refcnt
				195	* - fields protected by the global l_icloglock
				196	*
				197	* so we need to ensure that these fields are located in separate cachelines.
				198	* We'll put all the read-only and l_icloglock fields in the first cacheline,
				199	* and move everything else out to subsequent cachelines.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	200	*/
Christoph Hellwig	b28708d	2008-11-28 14:23:38 +1100	[diff] [blame]	201	typedef struct xlog_in_core {
Dave Chinner	eb40a87	2010-12-21 12:09:01 +1100	[diff] [blame]	202	wait_queue_head_t ic_force_wait;
				203	wait_queue_head_t ic_write_wait;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	204	struct xlog_in_core *ic_next;
				205	struct xlog_in_core *ic_prev;
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	206	struct xlog *ic_log;
Christoph Hellwig	79b54d9	2019-06-28 19:27:25 -0700	[diff] [blame]	207	u32 ic_size;
				208	u32 ic_io_size;
				209	u32 ic_offset;
Christoph Hellwig	a568778	2009-02-09 08:37:39 +0100	[diff] [blame]	210	unsigned short ic_state;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	211	char ic_datap; / pointer to iclog data */
David Chinner	114d23a	2008-04-10 12:18:39 +1000	[diff] [blame]	212
				213	/* Callback structures need their own cacheline */
				214	spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
Christoph Hellwig	89ae379	2019-06-28 19:27:34 -0700	[diff] [blame]	215	struct list_head ic_callbacks;
David Chinner	114d23a	2008-04-10 12:18:39 +1000	[diff] [blame]	216
				217	/* reference counts need their own cacheline */
				218	atomic_t ic_refcnt ____cacheline_aligned_in_smp;
Christoph Hellwig	b28708d	2008-11-28 14:23:38 +1100	[diff] [blame]	219	xlog_in_core_2_t *ic_data;
				220	#define ic_header ic_data->hic_header
Christoph Hellwig	366fc4b	2019-06-28 19:27:21 -0700	[diff] [blame]	221	#ifdef DEBUG
				222	bool ic_fail_crc : 1;
				223	#endif
Christoph Hellwig	79b54d9	2019-06-28 19:27:25 -0700	[diff] [blame]	224	struct semaphore ic_sema;
				225	struct work_struct ic_end_io_work;
				226	struct bio ic_bio;
				227	struct bio_vec ic_bvec[];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	228	} xlog_in_core_t;
				229
				230	/*
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	231	* The CIL context is used to aggregate per-transaction details as well be
				232	* passed to the iclog for checkpoint post-commit processing. After being
				233	* passed to the iclog, another context needs to be allocated for tracking the
				234	* next set of transactions to be aggregated into a checkpoint.
				235	*/
				236	struct xfs_cil;
				237
				238	struct xfs_cil_ctx {
				239	struct xfs_cil *cil;
				240	xfs_lsn_t sequence; /* chkpt sequence # */
				241	xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
				242	xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
				243	struct xlog_ticket ticket; / chkpt ticket */
				244	int nvecs; /* number of regions */
				245	int space_used; /* aggregate size of regions */
				246	struct list_head busy_extents; /* busy extents in chkpt */
				247	struct xfs_log_vec lv_chain; / logvecs being pushed */
Christoph Hellwig	89ae379	2019-06-28 19:27:34 -0700	[diff] [blame]	248	struct list_head iclog_entry;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	249	struct list_head committing; /* ctx committing list */
Christoph Hellwig	4560e78	2017-02-07 14:07:58 -0800	[diff] [blame]	250	struct work_struct discard_endio_work;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	251	};
				252
				253	/*
				254	* Committed Item List structure
				255	*
				256	* This structure is used to track log items that have been committed but not
				257	* yet written into the log. It is used only when the delayed logging mount
				258	* option is enabled.
				259	*
				260	* This structure tracks the list of committing checkpoint contexts so
				261	* we can avoid the problem of having to hold out new transactions during a
				262	* flush until we have a the commit record LSN of the checkpoint. We can
				263	* traverse the list of committing contexts in xlog_cil_push_lsn() to find a
				264	* sequence match and extract the commit LSN directly from there. If the
				265	* checkpoint is still in the process of committing, we can block waiting for
				266	* the commit LSN to be determined as well. This should make synchronous
				267	* operations almost as efficient as the old logging methods.
				268	*/
				269	struct xfs_cil {
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	270	struct xlog *xc_log;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	271	struct list_head xc_cil;
				272	spinlock_t xc_cil_lock;
Dave Chinner	4bb928c	2013-08-12 20:50:08 +1000	[diff] [blame]	273
				274	struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	275	struct xfs_cil_ctx *xc_ctx;
Dave Chinner	4bb928c	2013-08-12 20:50:08 +1000	[diff] [blame]	276
				277	spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
				278	xfs_lsn_t xc_push_seq;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	279	struct list_head xc_committing;
Dave Chinner	eb40a87	2010-12-21 12:09:01 +1100	[diff] [blame]	280	wait_queue_head_t xc_commit_wait;
Dave Chinner	a44f13e	2010-08-24 11:40:03 +1000	[diff] [blame]	281	xfs_lsn_t xc_current_sequence;
Dave Chinner	4c2d542	2012-04-23 17:54:32 +1000	[diff] [blame]	282	struct work_struct xc_push_work;
Dave Chinner	4bb928c	2013-08-12 20:50:08 +1000	[diff] [blame]	283	} ____cacheline_aligned_in_smp;
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	284
				285	/*
Dave Chinner	8016867	2010-09-24 18:13:44 +1000	[diff] [blame]	286	* The amount of log space we allow the CIL to aggregate is difficult to size.
				287	* Whatever we choose, we have to make sure we can get a reservation for the
				288	* log space effectively, that it is large enough to capture sufficient
				289	* relogging to reduce log buffer IO significantly, but it is not too large for
				290	* the log or induces too much latency when writing out through the iclogs. We
				291	* track both space consumed and the number of vectors in the checkpoint
				292	* context, so we need to decide which to use for limiting.
Dave Chinner	df80615	2010-05-17 15:52:13 +1000	[diff] [blame]	293	*
				294	* Every log buffer we write out during a push needs a header reserved, which
				295	* is at least one sector and more for v2 logs. Hence we need a reservation of
				296	* at least 512 bytes per 32k of log space just for the LR headers. That means
				297	* 16KB of reservation per megabyte of delayed logging space we will consume,
				298	* plus various headers. The number of headers will vary based on the num of
				299	* io vectors, so limiting on a specific number of vectors is going to result
				300	* in transactions of varying size. IOWs, it is more consistent to track and
				301	* limit space consumed in the log rather than by the number of objects being
				302	* logged in order to prevent checkpoint ticket overruns.
				303	*
				304	* Further, use of static reservations through the log grant mechanism is
				305	* problematic. It introduces a lot of complexity (e.g. reserve grant vs write
				306	* grant) and a significant deadlock potential because regranting write space
				307	* can block on log pushes. Hence if we have to regrant log space during a log
				308	* push, we can deadlock.
				309	*
				310	* However, we can avoid this by use of a dynamic "reservation stealing"
				311	* technique during transaction commit whereby unused reservation space in the
				312	* transaction ticket is transferred to the CIL ctx commit ticket to cover the
				313	* space needed by the checkpoint transaction. This means that we never need to
				314	* specifically reserve space for the CIL checkpoint transaction, nor do we
				315	* need to regrant space once the checkpoint completes. This also means the
				316	* checkpoint transaction ticket is specific to the checkpoint context, rather
				317	* than the CIL itself.
				318	*
Dave Chinner	8016867	2010-09-24 18:13:44 +1000	[diff] [blame]	319	* With dynamic reservations, we can effectively make up arbitrary limits for
				320	* the checkpoint size so long as they don't violate any other size rules.
				321	* Recovery imposes a rule that no transaction exceed half the log, so we are
				322	* limited by that. Furthermore, the log transaction reservation subsystem
				323	* tries to keep 25% of the log free, so we need to keep below that limit or we
				324	* risk running out of free log space to start any new transactions.
				325	*
				326	* In order to keep background CIL push efficient, we will set a lower
				327	* threshold at which background pushing is attempted without blocking current
				328	* transaction commits. A separate, higher bound defines when CIL pushes are
				329	* enforced to ensure we stay within our maximum checkpoint size bounds.
				330	* These limits still give us plenty of space for aggregation on large logs.
Dave Chinner	df80615	2010-05-17 15:52:13 +1000	[diff] [blame]	331	*/
/* One eighth of the log size; the argument is parenthesised so any pointer expression works. */
#define XLOG_CIL_SPACE_LIMIT(log)	((log)->l_logsize >> 3)
Dave Chinner	df80615	2010-05-17 15:52:13 +1000	[diff] [blame]	333
				334	/*
Christoph Hellwig	2849696	2012-02-20 02:31:25 +0000	[diff] [blame]	335	* ticket grant locks, queues and accounting have their own cachlines
				336	* as these are quite hot and can be operated on concurrently.
				337	*/
				338	struct xlog_grant_head {
				339	spinlock_t lock ____cacheline_aligned_in_smp;
				340	struct list_head waiters;
				341	atomic64_t grant;
				342	};
				343
				344	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	345	* The reservation head lsn is not made up of a cycle number and block number.
				346	* Instead, it uses a cycle number and byte number. Logs don't expect to
				347	* overflow 31 bits worth of byte offset, so using a byte number will mean
				348	* that round off problems won't occur when releasing partial reservations.
				349	*/
Mark Tinguely	9a8d2fd	2012-06-14 09:22:16 -0500	[diff] [blame]	350	struct xlog {
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	351	/* The following fields don't need locking */
				352	struct xfs_mount l_mp; / mount point */
David Chinner	a9c21c1	2008-10-30 17:39:35 +1100	[diff] [blame]	353	struct xfs_ail l_ailp; / AIL log is working with */
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	354	struct xfs_cil l_cilp; / CIL log is working with */
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	355	struct xfs_buftarg l_targ; / buftarg of log */
Christoph Hellwig	1058d0f	2019-06-28 19:27:25 -0700	[diff] [blame]	356	struct workqueue_struct l_ioend_workqueue; / for I/O completions */
Dave Chinner	f661f1e	2012-10-08 21:56:02 +1100	[diff] [blame]	357	struct delayed_work l_work; /* background flush work */
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	358	uint l_flags;
				359	uint l_quotaoffs_flag; /* XFS_DQ_, for QUOTAOFFs /
Christoph Hellwig	d5689ea	2010-12-01 22:06:22 +0000	[diff] [blame]	360	struct list_head *l_buf_cancel_table;
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	361	int l_iclog_hsize; /* size of iclog header */
				362	int l_iclog_heads; /* # of iclog header sectors */
Alex Elder	48389ef	2010-04-20 17:10:21 +1000	[diff] [blame]	363	uint l_sectBBsize; /* sector size in BBs (2^n) */
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	364	int l_iclog_size; /* size of log in bytes */
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	365	int l_iclog_bufs; /* number of iclog buffers */
				366	xfs_daddr_t l_logBBstart; /* start block of log */
				367	int l_logsize; /* size of log in bytes */
				368	int l_logBBsize; /* size of log in BB chunks */
				369
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	370	/* The following block of fields are changed while holding icloglock */
Dave Chinner	eb40a87	2010-12-21 12:09:01 +1100	[diff] [blame]	371	wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
Matthew Wilcox	d748c62	2008-05-19 16:34:27 +1000	[diff] [blame]	372	/* waiting for iclog flush */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	373	int l_covered_state;/* state of "covering disk
				374	* log entries" */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	375	xlog_in_core_t l_iclog; / head log queue */
Eric Sandeen	b22cd72c	2007-10-11 17:37:10 +1000	[diff] [blame]	376	spinlock_t l_icloglock; /* grab to change iclog state */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	377	int l_curr_cycle; /* Cycle number of log writes */
				378	int l_prev_cycle; /* Cycle number before last
				379	* block increment */
				380	int l_curr_block; /* current logical log block */
				381	int l_prev_block; /* previous logical log block */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	382
Dave Chinner	84f3c68	2010-12-03 22:11:29 +1100	[diff] [blame]	383	/*
Dave Chinner	1c3cb9e	2010-12-21 12:28:39 +1100	[diff] [blame]	384	* l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
				385	* read without needing to hold specific locks. To avoid operations
				386	* contending with other hot objects, place each of them on a separate
				387	* cacheline.
Dave Chinner	84f3c68	2010-12-03 22:11:29 +1100	[diff] [blame]	388	*/
				389	/* lsn of last LR on disk */
				390	atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
Dave Chinner	1c3cb9e	2010-12-21 12:28:39 +1100	[diff] [blame]	391	/* lsn of 1st LR with unflushed * buffers */
				392	atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
Dave Chinner	84f3c68	2010-12-03 22:11:29 +1100	[diff] [blame]	393
Christoph Hellwig	2849696	2012-02-20 02:31:25 +0000	[diff] [blame]	394	struct xlog_grant_head l_reserve_head;
				395	struct xlog_grant_head l_write_head;
Dave Chinner	3f16b98	2010-12-21 12:29:01 +1100	[diff] [blame]	396
Brian Foster	baff4e4	2014-07-15 08:07:29 +1000	[diff] [blame]	397	struct xfs_kobj l_kobj;
				398
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	399	/* The following field are used for debugging; need to hold icloglock */
				400	#ifdef DEBUG
Christoph Hellwig	5809d5e	2015-06-22 09:44:47 +1000	[diff] [blame]	401	void *l_iclog_bak[XLOG_MAX_ICLOGS];
Brian Foster	609adfc	2016-01-05 07:41:16 +1100	[diff] [blame]	402	/* log record crc error injection factor */
				403	uint32_t l_badcrc_factor;
David Chinner	4679b2d	2008-04-10 12:18:54 +1000	[diff] [blame]	404	#endif
Brian Foster	12818d2	2016-09-26 08:22:16 +1000	[diff] [blame]	405	/* log recovery lsn tracking (for buffer submission */
				406	xfs_lsn_t l_recovery_lsn;
Mark Tinguely	9a8d2fd	2012-06-14 09:22:16 -0500	[diff] [blame]	407	};
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	408
/*
 * Address of the cancel-table bucket for @blkno: the block number, widened
 * to 64 bits, modulo XLOG_BC_TABLE_SIZE.  @blkno is parenthesised so that
 * the cast and the modulo apply to the whole argument expression.
 */
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((uint64_t)(blkno) % XLOG_BC_TABLE_SIZE))
Christoph Hellwig	d5689ea	2010-12-01 22:06:22 +0000	[diff] [blame]	411
/* Non-zero when the log's l_flags has XLOG_IO_ERROR set. */
#define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
				413
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	414	/* common routines */
/* Log recovery entry points; only declared here. */
extern int
xlog_recover(
	struct xlog		*log);
extern int
xlog_recover_finish(
	struct xlog		*log);
extern void
xlog_recover_cancel(struct xlog *);
Christoph Hellwig	0e446be	2012-11-12 22:54:24 +1100	[diff] [blame]	423
Dave Chinner	f9668a0	2012-11-28 13:01:03 +1100	[diff] [blame]	424	extern __le32 xlog_cksum(struct xlog log, struct xlog_rec_header rhead,
Christoph Hellwig	0e446be	2012-11-12 22:54:24 +1100	[diff] [blame]	425	char *dp, int size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	426
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	427	extern kmem_zone_t *xfs_log_ticket_zone;
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	428	struct xlog_ticket *
				429	xlog_ticket_alloc(
				430	struct xlog *log,
				431	int unit_bytes,
				432	int count,
				433	char client,
				434	bool permanent,
				435	xfs_km_flags_t alloc_flags);
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	436
David Chinner	eb01c9c	2008-04-10 12:18:46 +1000	[diff] [blame]	437
/*
 * Account for @bytes having been consumed: advance the write cursor *ptr
 * and the running offset *off by @bytes, and shrink the remaining length
 * *len by the same amount.  All three are updated through pointers.
 */
static inline void
xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
	/* byte-wise advance without relying on void-pointer arithmetic */
	*ptr = (char *)*ptr + bytes;
	*len -= bytes;
	*off += bytes;
}
				445
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	446	void xlog_print_tic_res(struct xfs_mount mp, struct xlog_ticket ticket);
Brian Foster	d4ca1d5	2017-06-14 21:29:50 -0700	[diff] [blame]	447	void xlog_print_trans(struct xfs_trans *);
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	448	int
				449	xlog_write(
				450	struct xlog *log,
				451	struct xfs_log_vec *log_vector,
				452	struct xlog_ticket *tic,
				453	xfs_lsn_t *start_lsn,
				454	struct xlog_in_core **commit_iclog,
				455	uint flags);
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	456
				457	/*
Dave Chinner	1c3cb9e	2010-12-21 12:28:39 +1100	[diff] [blame]	458	* When we crack an atomic LSN, we sample it first so that the value will not
				459	* change while we are cracking it into the component values. This means we
				460	* will always get consistent component values to work from. This should always
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	461	* be used to sample and crack LSNs that are stored and updated in atomic
Dave Chinner	1c3cb9e	2010-12-21 12:28:39 +1100	[diff] [blame]	462	* variables.
				463	*/
				464	static inline void
				465	xlog_crack_atomic_lsn(atomic64_t lsn, uint cycle, uint *block)
				466	{
				467	xfs_lsn_t val = atomic64_read(lsn);
				468
				469	*cycle = CYCLE_LSN(val);
				470	*block = BLOCK_LSN(val);
				471	}
				472
				473	/*
				474	* Calculate and assign a value to an atomic LSN variable from component pieces.
				475	*/
				476	static inline void
				477	xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
				478	{
				479	atomic64_set(lsn, xlog_assign_lsn(cycle, block));
				480	}
				481
				482	/*
Dave Chinner	d0eb2f3	2010-12-21 12:29:14 +1100	[diff] [blame]	483	* When we crack the grant head, we sample it first so that the value will not
Dave Chinner	a69ed03	2010-12-21 12:08:20 +1100	[diff] [blame]	484	* change while we are cracking it into the component values. This means we
				485	* will always get consistent component values to work from.
				486	*/
				487	static inline void
Dave Chinner	d0eb2f3	2010-12-21 12:29:14 +1100	[diff] [blame]	488	xlog_crack_grant_head_val(int64_t val, int cycle, int space)
Dave Chinner	a69ed03	2010-12-21 12:08:20 +1100	[diff] [blame]	489	{
Dave Chinner	a69ed03	2010-12-21 12:08:20 +1100	[diff] [blame]	490	*cycle = val >> 32;
				491	*space = val & 0xffffffff;
				492	}
				493
				494	static inline void
Dave Chinner	d0eb2f3	2010-12-21 12:29:14 +1100	[diff] [blame]	495	xlog_crack_grant_head(atomic64_t head, int cycle, int *space)
				496	{
				497	xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
				498	}
				499
				500	static inline int64_t
				501	xlog_assign_grant_head_val(int cycle, int space)
				502	{
				503	return ((int64_t)cycle << 32) \| space;
				504	}
				505
				506	static inline void
Dave Chinner	c8a09ff	2010-12-04 00:02:40 +1100	[diff] [blame]	507	xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
Dave Chinner	a69ed03	2010-12-21 12:08:20 +1100	[diff] [blame]	508	{
Dave Chinner	d0eb2f3	2010-12-21 12:29:14 +1100	[diff] [blame]	509	atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
Dave Chinner	a69ed03	2010-12-21 12:08:20 +1100	[diff] [blame]	510	}
				511
				512	/*
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	513	* Committed Item List interfaces
				514	*/
Dave Chinner	2c6e24c	2013-10-15 09:17:49 +1100	[diff] [blame]	515	int xlog_cil_init(struct xlog *log);
				516	void xlog_cil_init_post_recovery(struct xlog *log);
				517	void xlog_cil_destroy(struct xlog *log);
				518	bool xlog_cil_empty(struct xlog *log);
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	519
Dave Chinner	a44f13e	2010-08-24 11:40:03 +1000	[diff] [blame]	520	/*
				521	* CIL force routines
				522	*/
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	523	xfs_lsn_t
				524	xlog_cil_force_lsn(
				525	struct xlog *log,
				526	xfs_lsn_t sequence);
Dave Chinner	a44f13e	2010-08-24 11:40:03 +1000	[diff] [blame]	527
				528	static inline void
Mark Tinguely	ad223e6	2012-06-14 09:22:15 -0500	[diff] [blame]	529	xlog_cil_force(struct xlog *log)
Dave Chinner	a44f13e	2010-08-24 11:40:03 +1000	[diff] [blame]	530	{
				531	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
				532	}
Dave Chinner	71e330b	2010-05-21 14:37:18 +1000	[diff] [blame]	533
/*
 * Pseudo transaction type carried by the ticket for the unmount record.
 * Its value must lie outside the range of the XFS_TRANS_* values.
 */
#define XLOG_UNMOUNT_REC_TYPE	(-1U)
				539
Dave Chinner	eb40a87	2010-12-21 12:09:01 +1100	[diff] [blame]	540	/*
				541	* Wrapper function for waiting on a wait queue serialised against wakeups
				542	* by a spinlock. This matches the semantics of all the wait queues used in the
				543	* log code.
				544	*/
				545	static inline void xlog_wait(wait_queue_head_t wq, spinlock_t lock)
				546	{
				547	DECLARE_WAITQUEUE(wait, current);
				548
				549	add_wait_queue_exclusive(wq, &wait);
				550	__set_current_state(TASK_UNINTERRUPTIBLE);
				551	spin_unlock(lock);
				552	schedule();
				553	remove_wait_queue(wq, &wait);
				554	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	555
Brian Foster	a45086e	2015-10-12 15:59:25 +1100	[diff] [blame]	556	/*
				557	* The LSN is valid so long as it is behind the current LSN. If it isn't, this
				558	* means that the next log record that includes this metadata could have a
				559	* smaller LSN. In turn, this means that the modification in the log would not
				560	* replay.
				561	*/
				562	static inline bool
				563	xlog_valid_lsn(
				564	struct xlog *log,
				565	xfs_lsn_t lsn)
				566	{
				567	int cur_cycle;
				568	int cur_block;
				569	bool valid = true;
				570
				571	/*
				572	* First, sample the current lsn without locking to avoid added
				573	* contention from metadata I/O. The current cycle and block are updated
				574	* (in xlog_state_switch_iclogs()) and read here in a particular order
				575	* to avoid false negatives (e.g., thinking the metadata LSN is valid
				576	* when it is not).
				577	*
				578	* The current block is always rewound before the cycle is bumped in
				579	* xlog_state_switch_iclogs() to ensure the current LSN is never seen in
				580	* a transiently forward state. Instead, we can see the LSN in a
				581	* transiently behind state if we happen to race with a cycle wrap.
				582	*/
Mark Rutland	6aa7de0	2017-10-23 14:07:29 -0700	[diff] [blame]	583	cur_cycle = READ_ONCE(log->l_curr_cycle);
Brian Foster	a45086e	2015-10-12 15:59:25 +1100	[diff] [blame]	584	smp_rmb();
Mark Rutland	6aa7de0	2017-10-23 14:07:29 -0700	[diff] [blame]	585	cur_block = READ_ONCE(log->l_curr_block);
Brian Foster	a45086e	2015-10-12 15:59:25 +1100	[diff] [blame]	586
				587	if ((CYCLE_LSN(lsn) > cur_cycle) \|\|
				588	(CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
				589	/*
				590	* If the metadata LSN appears invalid, it's possible the check
				591	* above raced with a wrap to the next log cycle. Grab the lock
				592	* to check for sure.
				593	*/
				594	spin_lock(&log->l_icloglock);
				595	cur_cycle = log->l_curr_cycle;
				596	cur_block = log->l_curr_block;
				597	spin_unlock(&log->l_icloglock);
				598
				599	if ((CYCLE_LSN(lsn) > cur_cycle) \|\|
				600	(CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
				601	valid = false;
				602	}
				603
				604	return valid;
				605	}
				606
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	607	#endif /* __XFS_LOG_PRIV_H__ */