Blame - fs/zonefs/super.c - SHIFTPHONES/mainline/linux

blob: b76dfb310ab650c24adfb2b6f3f329928e61a1fc [file] [log] [blame]

Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Simple file system for zoned block devices exposing zones as files.
				4	*
				5	* Copyright (C) 2019 Western Digital Corporation or its affiliates.
				6	*/
				7	#include <linux/module.h>
Matthew Wilcox (Oracle)	3a6b216	2021-06-28 19:36:30 -0700	[diff] [blame]	8	#include <linux/pagemap.h>
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	9	#include <linux/magic.h>
				10	#include <linux/iomap.h>
				11	#include <linux/init.h>
				12	#include <linux/slab.h>
				13	#include <linux/blkdev.h>
				14	#include <linux/statfs.h>
				15	#include <linux/writeback.h>
				16	#include <linux/quotaops.h>
				17	#include <linux/seq_file.h>
				18	#include <linux/parser.h>
				19	#include <linux/uio.h>
				20	#include <linux/mman.h>
				21	#include <linux/sched/mm.h>
				22	#include <linux/crc32.h>
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	23	#include <linux/task_io_accounting_ops.h>
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	24
				25	#include "zonefs.h"
				26
Johannes Thumshirn	62ab1aa	2021-01-27 05:21:15 +0900	[diff] [blame]	27	#define CREATE_TRACE_POINTS
				28	#include "trace.h"
				29
Johannes Thumshirn	5498d5f	2020-09-11 17:56:48 +0900	[diff] [blame]	30	static inline int zonefs_zone_mgmt(struct inode *inode,
				31	enum req_opf op)
				32	{
				33	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				34	int ret;
				35
				36	lockdep_assert_held(&zi->i_truncate_mutex);
				37
Johannes Thumshirn	62ab1aa	2021-01-27 05:21:15 +0900	[diff] [blame]	38	trace_zonefs_zone_mgmt(inode, op);
Johannes Thumshirn	5498d5f	2020-09-11 17:56:48 +0900	[diff] [blame]	39	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
				40	zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
				41	if (ret) {
				42	zonefs_err(inode->i_sb,
				43	"Zone management operation %s at %llu failed %d\n",
				44	blk_op_str(op), zi->i_zsector, ret);
				45	return ret;
				46	}
				47
				48	return 0;
				49	}
				50
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	51	static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
				52	{
				53	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				54
				55	i_size_write(inode, isize);
				56	/*
				57	* A full zone is no longer open/active and does not need
				58	* explicit closing.
				59	*/
				60	if (isize >= zi->i_max_size)
				61	zi->i_flags &= ~ZONEFS_ZONE_OPEN;
				62	}
				63
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	64	static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
				65	unsigned int flags, struct iomap *iomap,
				66	struct iomap *srcmap)
				67	{
				68	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				69	struct super_block *sb = inode->i_sb;
				70	loff_t isize;
				71
				72	/* All I/Os should always be within the file maximum size */
				73	if (WARN_ON_ONCE(offset + length > zi->i_max_size))
				74	return -EIO;
				75
				76	/*
				77	* Sequential zones can only accept direct writes. This is already
				78	* checked when writes are issued, so warn if we see a page writeback
				79	* operation.
				80	*/
				81	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
				82	(flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
				83	return -EIO;
				84
				85	/*
				86	* For conventional zones, all blocks are always mapped. For sequential
				87	* zones, all blocks after always mapped below the inode size (zone
				88	* write pointer) and unwriten beyond.
				89	*/
				90	mutex_lock(&zi->i_truncate_mutex);
				91	isize = i_size_read(inode);
				92	if (offset >= isize)
				93	iomap->type = IOMAP_UNWRITTEN;
				94	else
				95	iomap->type = IOMAP_MAPPED;
				96	if (flags & IOMAP_WRITE)
				97	length = zi->i_max_size - offset;
				98	else
				99	length = min(length, isize - offset);
				100	mutex_unlock(&zi->i_truncate_mutex);
				101
				102	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
				103	iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
				104	iomap->bdev = inode->i_sb->s_bdev;
				105	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
				106
Johannes Thumshirn	62ab1aa	2021-01-27 05:21:15 +0900	[diff] [blame]	107	trace_zonefs_iomap_begin(inode, iomap);
				108
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	109	return 0;
				110	}
				111
				112	static const struct iomap_ops zonefs_iomap_ops = {
				113	.iomap_begin = zonefs_iomap_begin,
				114	};
				115
				116	static int zonefs_readpage(struct file unused, struct page page)
				117	{
				118	return iomap_readpage(page, &zonefs_iomap_ops);
				119	}
				120
Matthew Wilcox (Oracle)	9d24a13	2020-06-01 21:47:34 -0700	[diff] [blame]	121	static void zonefs_readahead(struct readahead_control *rac)
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	122	{
Matthew Wilcox (Oracle)	9d24a13	2020-06-01 21:47:34 -0700	[diff] [blame]	123	iomap_readahead(rac, &zonefs_iomap_ops);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	124	}
				125
				126	/*
				127	* Map blocks for page writeback. This is used only on conventional zone files,
				128	* which implies that the page range can only be within the fixed inode size.
				129	*/
				130	static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
				131	struct inode *inode, loff_t offset)
				132	{
				133	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				134
				135	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
				136	return -EIO;
				137	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
				138	return -EIO;
				139
				140	/* If the mapping is already OK, nothing needs to be done */
				141	if (offset >= wpc->iomap.offset &&
				142	offset < wpc->iomap.offset + wpc->iomap.length)
				143	return 0;
				144
				145	return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
				146	IOMAP_WRITE, &wpc->iomap, NULL);
				147	}
				148
				149	static const struct iomap_writeback_ops zonefs_writeback_ops = {
				150	.map_blocks = zonefs_map_blocks,
				151	};
				152
				153	static int zonefs_writepage(struct page page, struct writeback_control wbc)
				154	{
				155	struct iomap_writepage_ctx wpc = { };
				156
				157	return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
				158	}
				159
				160	static int zonefs_writepages(struct address_space *mapping,
				161	struct writeback_control *wbc)
				162	{
				163	struct iomap_writepage_ctx wpc = { };
				164
				165	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
				166	}
				167
Damien Le Moal	1601ea0	2021-03-15 12:43:55 +0900	[diff] [blame]	168	static int zonefs_swap_activate(struct swap_info_struct *sis,
				169	struct file swap_file, sector_t span)
				170	{
				171	struct inode *inode = file_inode(swap_file);
				172	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				173
				174	if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
				175	zonefs_err(inode->i_sb,
				176	"swap file: not a conventional zone file\n");
				177	return -EINVAL;
				178	}
				179
				180	return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
				181	}
				182
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	183	static const struct address_space_operations zonefs_file_aops = {
				184	.readpage = zonefs_readpage,
Matthew Wilcox (Oracle)	9d24a13	2020-06-01 21:47:34 -0700	[diff] [blame]	185	.readahead = zonefs_readahead,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	186	.writepage = zonefs_writepage,
				187	.writepages = zonefs_writepages,
Matthew Wilcox (Oracle)	fd7353f	2021-06-28 19:36:21 -0700	[diff] [blame]	188	.set_page_dirty = __set_page_dirty_nobuffers,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	189	.releasepage = iomap_releasepage,
				190	.invalidatepage = iomap_invalidatepage,
				191	.migratepage = iomap_migrate_page,
				192	.is_partially_uptodate = iomap_is_partially_uptodate,
				193	.error_remove_page = generic_error_remove_page,
				194	.direct_IO = noop_direct_IO,
Damien Le Moal	1601ea0	2021-03-15 12:43:55 +0900	[diff] [blame]	195	.swap_activate = zonefs_swap_activate,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	196	};
				197
				198	static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
				199	{
				200	struct super_block *sb = inode->i_sb;
				201	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				202	loff_t old_isize = i_size_read(inode);
				203	loff_t nr_blocks;
				204
				205	if (new_isize == old_isize)
				206	return;
				207
				208	spin_lock(&sbi->s_lock);
				209
				210	/*
				211	* This may be called for an update after an IO error.
				212	* So beware of the values seen.
				213	*/
				214	if (new_isize < old_isize) {
				215	nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits;
				216	if (sbi->s_used_blocks > nr_blocks)
				217	sbi->s_used_blocks -= nr_blocks;
				218	else
				219	sbi->s_used_blocks = 0;
				220	} else {
				221	sbi->s_used_blocks +=
				222	(new_isize - old_isize) >> sb->s_blocksize_bits;
				223	if (sbi->s_used_blocks > sbi->s_blocks)
				224	sbi->s_used_blocks = sbi->s_blocks;
				225	}
				226
				227	spin_unlock(&sbi->s_lock);
				228	}
				229
				230	/*
				231	* Check a zone condition and adjust its file inode access permissions for
				232	* offline and readonly zones. Return the inode size corresponding to the
				233	* amount of readable data in the zone.
				234	*/
				235	static loff_t zonefs_check_zone_condition(struct inode *inode,
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	236	struct blk_zone *zone, bool warn,
				237	bool mount)
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	238	{
				239	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				240
				241	switch (zone->cond) {
				242	case BLK_ZONE_COND_OFFLINE:
				243	/*
				244	* Dead zone: make the inode immutable, disable all accesses
				245	* and set the file size to 0 (zone wp set to zone start).
				246	*/
				247	if (warn)
				248	zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
				249	inode->i_ino);
				250	inode->i_flags \|= S_IMMUTABLE;
				251	inode->i_mode &= ~0777;
				252	zone->wp = zone->start;
				253	return 0;
				254	case BLK_ZONE_COND_READONLY:
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	255	/*
				256	* The write pointer of read-only zones is invalid. If such a
				257	* zone is found during mount, the file size cannot be retrieved
				258	* so we treat the zone as offline (mount == true case).
				259	* Otherwise, keep the file size as it was when last updated
				260	* so that the user can recover data. In both cases, writes are
				261	* always disabled for the zone.
				262	*/
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	263	if (warn)
				264	zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
				265	inode->i_ino);
				266	inode->i_flags \|= S_IMMUTABLE;
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	267	if (mount) {
				268	zone->cond = BLK_ZONE_COND_OFFLINE;
				269	inode->i_mode &= ~0777;
				270	zone->wp = zone->start;
				271	return 0;
				272	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	273	inode->i_mode &= ~0222;
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	274	return i_size_read(inode);
Shin'ichiro Kawasaki	059c010	2021-02-17 18:58:11 +0900	[diff] [blame]	275	case BLK_ZONE_COND_FULL:
				276	/* The write pointer of full zones is invalid. */
				277	return zi->i_max_size;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	278	default:
				279	if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
				280	return zi->i_max_size;
				281	return (zone->wp - zone->start) << SECTOR_SHIFT;
				282	}
				283	}
				284
				285	struct zonefs_ioerr_data {
				286	struct inode *inode;
				287	bool write;
				288	};
				289
				290	static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
				291	void *data)
				292	{
				293	struct zonefs_ioerr_data *err = data;
				294	struct inode *inode = err->inode;
				295	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				296	struct super_block *sb = inode->i_sb;
				297	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				298	loff_t isize, data_size;
				299
				300	/*
				301	* Check the zone condition: if the zone is not "bad" (offline or
				302	* read-only), read errors are simply signaled to the IO issuer as long
				303	* as there is no inconsistency between the inode size and the amount of
				304	* data writen in the zone (data_size).
				305	*/
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	306	data_size = zonefs_check_zone_condition(inode, zone, true, false);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	307	isize = i_size_read(inode);
				308	if (zone->cond != BLK_ZONE_COND_OFFLINE &&
				309	zone->cond != BLK_ZONE_COND_READONLY &&
				310	!err->write && isize == data_size)
				311	return 0;
				312
				313	/*
				314	* At this point, we detected either a bad zone or an inconsistency
				315	* between the inode size and the amount of data written in the zone.
				316	* For the latter case, the cause may be a write IO error or an external
				317	* action on the device. Two error patterns exist:
				318	* 1) The inode size is lower than the amount of data in the zone:
				319	* a write operation partially failed and data was writen at the end
				320	* of the file. This can happen in the case of a large direct IO
				321	* needing several BIOs and/or write requests to be processed.
				322	* 2) The inode size is larger than the amount of data in the zone:
				323	* this can happen with a deferred write error with the use of the
				324	* device side write cache after getting successful write IO
				325	* completions. Other possibilities are (a) an external corruption,
				326	* e.g. an application reset the zone directly, or (b) the device
				327	* has a serious problem (e.g. firmware bug).
				328	*
				329	* In all cases, warn about inode size inconsistency and handle the
				330	* IO error according to the zone condition and to the mount options.
				331	*/
				332	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
				333	zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
				334	inode->i_ino, isize, data_size);
				335
				336	/*
				337	* First handle bad zones signaled by hardware. The mount options
				338	* errors=zone-ro and errors=zone-offline result in changing the
				339	* zone condition to read-only and offline respectively, as if the
				340	* condition was signaled by the hardware.
				341	*/
				342	if (zone->cond == BLK_ZONE_COND_OFFLINE \|\|
				343	sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
				344	zonefs_warn(sb, "inode %lu: read/write access disabled\n",
				345	inode->i_ino);
				346	if (zone->cond != BLK_ZONE_COND_OFFLINE) {
				347	zone->cond = BLK_ZONE_COND_OFFLINE;
				348	data_size = zonefs_check_zone_condition(inode, zone,
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	349	false, false);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	350	}
				351	} else if (zone->cond == BLK_ZONE_COND_READONLY \|\|
				352	sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
				353	zonefs_warn(sb, "inode %lu: write access disabled\n",
				354	inode->i_ino);
				355	if (zone->cond != BLK_ZONE_COND_READONLY) {
				356	zone->cond = BLK_ZONE_COND_READONLY;
				357	data_size = zonefs_check_zone_condition(inode, zone,
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	358	false, false);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	359	}
				360	}
				361
				362	/*
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	363	* If the filesystem is mounted with the explicit-open mount option, we
				364	* need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
				365	* the read-only or offline condition, to avoid attempting an explicit
				366	* close of the zone when the inode file is closed.
				367	*/
				368	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
				369	(zone->cond == BLK_ZONE_COND_OFFLINE \|\|
				370	zone->cond == BLK_ZONE_COND_READONLY))
				371	zi->i_flags &= ~ZONEFS_ZONE_OPEN;
				372
				373	/*
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	374	* If error=remount-ro was specified, any error result in remounting
				375	* the volume as read-only.
				376	*/
				377	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) {
				378	zonefs_warn(sb, "remounting filesystem read-only\n");
				379	sb->s_flags \|= SB_RDONLY;
				380	}
				381
				382	/*
				383	* Update block usage stats and the inode size to prevent access to
				384	* invalid data.
				385	*/
				386	zonefs_update_stats(inode, data_size);
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	387	zonefs_i_size_write(inode, data_size);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	388	zi->i_wpoffset = data_size;
				389
				390	return 0;
				391	}
				392
				393	/*
				394	* When an file IO error occurs, check the file zone to see if there is a change
				395	* in the zone condition (e.g. offline or read-only). For a failed write to a
				396	* sequential zone, the zone write pointer position must also be checked to
				397	* eventually correct the file size and zonefs inode write pointer offset
				398	* (which can be out of sync with the drive due to partial write failures).
				399	*/
Johannes Thumshirn	48d546a	2020-09-11 17:56:49 +0900	[diff] [blame]	400	static void __zonefs_io_error(struct inode *inode, bool write)
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	401	{
				402	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				403	struct super_block *sb = inode->i_sb;
				404	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				405	unsigned int noio_flag;
				406	unsigned int nr_zones =
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	407	zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	408	struct zonefs_ioerr_data err = {
				409	.inode = inode,
				410	.write = write,
				411	};
				412	int ret;
				413
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	414	/*
				415	* Memory allocations in blkdev_report_zones() can trigger a memory
				416	* reclaim which may in turn cause a recursion into zonefs as well as
				417	* struct request allocations for the same device. The former case may
				418	* end up in a deadlock on the inode truncate mutex, while the latter
				419	* may prevent IO forward progress. Executing the report zones under
				420	* the GFP_NOIO context avoids both problems.
				421	*/
				422	noio_flag = memalloc_noio_save();
				423	ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
				424	zonefs_io_error_cb, &err);
				425	if (ret != nr_zones)
				426	zonefs_err(sb, "Get inode %lu zone information failed %d\n",
				427	inode->i_ino, ret);
				428	memalloc_noio_restore(noio_flag);
Johannes Thumshirn	48d546a	2020-09-11 17:56:49 +0900	[diff] [blame]	429	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	430
Johannes Thumshirn	48d546a	2020-09-11 17:56:49 +0900	[diff] [blame]	431	static void zonefs_io_error(struct inode *inode, bool write)
				432	{
				433	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				434
				435	mutex_lock(&zi->i_truncate_mutex);
				436	__zonefs_io_error(inode, write);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	437	mutex_unlock(&zi->i_truncate_mutex);
				438	}
				439
				440	static int zonefs_file_truncate(struct inode *inode, loff_t isize)
				441	{
				442	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				443	loff_t old_isize;
				444	enum req_opf op;
				445	int ret = 0;
				446
				447	/*
				448	* Only sequential zone files can be truncated and truncation is allowed
				449	* only down to a 0 size, which is equivalent to a zone reset, and to
				450	* the maximum file size, which is equivalent to a zone finish.
				451	*/
				452	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
				453	return -EPERM;
				454
				455	if (!isize)
				456	op = REQ_OP_ZONE_RESET;
				457	else if (isize == zi->i_max_size)
				458	op = REQ_OP_ZONE_FINISH;
				459	else
				460	return -EPERM;
				461
				462	inode_dio_wait(inode);
				463
				464	/* Serialize against page faults */
Jan Kara	448f949	2021-04-13 09:38:27 +0200	[diff] [blame]	465	filemap_invalidate_lock(inode->i_mapping);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	466
				467	/* Serialize against zonefs_iomap_begin() */
				468	mutex_lock(&zi->i_truncate_mutex);
				469
				470	old_isize = i_size_read(inode);
				471	if (isize == old_isize)
				472	goto unlock;
				473
Johannes Thumshirn	5498d5f	2020-09-11 17:56:48 +0900	[diff] [blame]	474	ret = zonefs_zone_mgmt(inode, op);
				475	if (ret)
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	476	goto unlock;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	477
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	478	/*
				479	* If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
				480	* take care of open zones.
				481	*/
				482	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
				483	/*
				484	* Truncating a zone to EMPTY or FULL is the equivalent of
				485	* closing the zone. For a truncation to 0, we need to
				486	* re-open the zone to ensure new writes can be processed.
				487	* For a truncation to the maximum file size, the zone is
				488	* closed and writes cannot be accepted anymore, so clear
				489	* the open flag.
				490	*/
				491	if (!isize)
				492	ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
				493	else
				494	zi->i_flags &= ~ZONEFS_ZONE_OPEN;
				495	}
				496
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	497	zonefs_update_stats(inode, isize);
				498	truncate_setsize(inode, isize);
				499	zi->i_wpoffset = isize;
				500
				501	unlock:
				502	mutex_unlock(&zi->i_truncate_mutex);
Jan Kara	448f949	2021-04-13 09:38:27 +0200	[diff] [blame]	503	filemap_invalidate_unlock(inode->i_mapping);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	504
				505	return ret;
				506	}
				507
Christian Brauner	549c729	2021-01-21 14:19:43 +0100	[diff] [blame]	508	static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
				509	struct dentry dentry, struct iattr iattr)
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	510	{
				511	struct inode *inode = d_inode(dentry);
				512	int ret;
				513
				514	if (unlikely(IS_IMMUTABLE(inode)))
				515	return -EPERM;
				516
Christian Brauner	2f221d6	2021-01-21 14:19:26 +0100	[diff] [blame]	517	ret = setattr_prepare(&init_user_ns, dentry, iattr);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	518	if (ret)
				519	return ret;
				520
				521	/*
				522	* Since files and directories cannot be created nor deleted, do not
				523	* allow setting any write attributes on the sub-directories grouping
				524	* files by zone type.
				525	*/
				526	if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
				527	(iattr->ia_mode & 0222))
				528	return -EPERM;
				529
				530	if (((iattr->ia_valid & ATTR_UID) &&
				531	!uid_eq(iattr->ia_uid, inode->i_uid)) \|\|
				532	((iattr->ia_valid & ATTR_GID) &&
				533	!gid_eq(iattr->ia_gid, inode->i_gid))) {
				534	ret = dquot_transfer(inode, iattr);
				535	if (ret)
				536	return ret;
				537	}
				538
				539	if (iattr->ia_valid & ATTR_SIZE) {
				540	ret = zonefs_file_truncate(inode, iattr->ia_size);
				541	if (ret)
				542	return ret;
				543	}
				544
Christian Brauner	2f221d6	2021-01-21 14:19:26 +0100	[diff] [blame]	545	setattr_copy(&init_user_ns, inode, iattr);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	546
				547	return 0;
				548	}
				549
				550	static const struct inode_operations zonefs_file_inode_operations = {
				551	.setattr = zonefs_inode_setattr,
				552	};
				553
				554	static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
				555	int datasync)
				556	{
				557	struct inode *inode = file_inode(file);
				558	int ret = 0;
				559
				560	if (unlikely(IS_IMMUTABLE(inode)))
				561	return -EPERM;
				562
				563	/*
				564	* Since only direct writes are allowed in sequential files, page cache
				565	* flush is needed only for conventional zone files.
				566	*/
				567	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
				568	ret = file_write_and_wait_range(file, start, end);
				569	if (!ret)
Christoph Hellwig	c6bf3f0	2021-01-26 15:52:35 +0100	[diff] [blame]	570	ret = blkdev_issue_flush(inode->i_sb->s_bdev);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	571
				572	if (ret)
				573	zonefs_io_error(inode, true);
				574
				575	return ret;
				576	}
				577
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	578	static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
				579	{
				580	struct inode *inode = file_inode(vmf->vma->vm_file);
				581	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				582	vm_fault_t ret;
				583
				584	if (unlikely(IS_IMMUTABLE(inode)))
				585	return VM_FAULT_SIGBUS;
				586
				587	/*
				588	* Sanity check: only conventional zone files can have shared
				589	* writeable mappings.
				590	*/
				591	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
				592	return VM_FAULT_NOPAGE;
				593
				594	sb_start_pagefault(inode->i_sb);
				595	file_update_time(vmf->vma->vm_file);
				596
				597	/* Serialize against truncates */
Jan Kara	448f949	2021-04-13 09:38:27 +0200	[diff] [blame]	598	filemap_invalidate_lock_shared(inode->i_mapping);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	599	ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
Jan Kara	448f949	2021-04-13 09:38:27 +0200	[diff] [blame]	600	filemap_invalidate_unlock_shared(inode->i_mapping);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	601
				602	sb_end_pagefault(inode->i_sb);
				603	return ret;
				604	}
				605
				606	static const struct vm_operations_struct zonefs_file_vm_ops = {
Jan Kara	448f949	2021-04-13 09:38:27 +0200	[diff] [blame]	607	.fault = filemap_fault,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	608	.map_pages = filemap_map_pages,
				609	.page_mkwrite = zonefs_filemap_page_mkwrite,
				610	};
				611
				612	static int zonefs_file_mmap(struct file file, struct vm_area_struct vma)
				613	{
				614	/*
				615	* Conventional zones accept random writes, so their files can support
				616	* shared writable mappings. For sequential zone files, only read
				617	* mappings are possible since there are no guarantees for write
				618	* ordering between msync() and page cache writeback.
				619	*/
				620	if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
				621	(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
				622	return -EINVAL;
				623
				624	file_accessed(file);
				625	vma->vm_ops = &zonefs_file_vm_ops;
				626
				627	return 0;
				628	}
				629
				630	static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
				631	{
				632	loff_t isize = i_size_read(file_inode(file));
				633
				634	/*
				635	* Seeks are limited to below the zone size for conventional zones
				636	* and below the zone write pointer for sequential zones. In both
				637	* cases, this limit is the inode size.
				638	*/
				639	return generic_file_llseek_size(file, offset, whence, isize, isize);
				640	}
				641
				642	static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
				643	int error, unsigned int flags)
				644	{
				645	struct inode *inode = file_inode(iocb->ki_filp);
				646	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				647
				648	if (error) {
				649	zonefs_io_error(inode, true);
				650	return error;
				651	}
				652
				653	if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
				654	/*
				655	* Note that we may be seeing completions out of order,
				656	* but that is not a problem since a write completed
				657	* successfully necessarily means that all preceding writes
				658	* were also successful. So we can safely increase the inode
				659	* size to the write end location.
				660	*/
				661	mutex_lock(&zi->i_truncate_mutex);
				662	if (i_size_read(inode) < iocb->ki_pos + size) {
				663	zonefs_update_stats(inode, iocb->ki_pos + size);
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	664	zonefs_i_size_write(inode, iocb->ki_pos + size);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	665	}
				666	mutex_unlock(&zi->i_truncate_mutex);
				667	}
				668
				669	return 0;
				670	}
				671
				672	static const struct iomap_dio_ops zonefs_write_dio_ops = {
				673	.end_io = zonefs_file_write_dio_end_io,
				674	};
				675
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	676	static ssize_t zonefs_file_dio_append(struct kiocb iocb, struct iov_iter from)
				677	{
				678	struct inode *inode = file_inode(iocb->ki_filp);
				679	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				680	struct block_device *bdev = inode->i_sb->s_bdev;
				681	unsigned int max;
				682	struct bio *bio;
				683	ssize_t size;
				684	int nr_pages;
				685	ssize_t ret;
				686
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	687	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
				688	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
				689	iov_iter_truncate(from, max);
				690
Christoph Hellwig	a8affc0	2021-03-11 12:01:37 +0100	[diff] [blame]	691	nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
Johannes Thumshirn	89ee723	2020-07-16 19:37:23 +0900	[diff] [blame]	692	if (!nr_pages)
				693	return 0;
				694
Christoph Hellwig	f91ca2a	2021-01-26 15:52:31 +0100	[diff] [blame]	695	bio = bio_alloc(GFP_NOFS, nr_pages);
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	696	bio_set_dev(bio, bdev);
				697	bio->bi_iter.bi_sector = zi->i_zsector;
				698	bio->bi_write_hint = iocb->ki_hint;
				699	bio->bi_ioprio = iocb->ki_ioprio;
				700	bio->bi_opf = REQ_OP_ZONE_APPEND \| REQ_SYNC \| REQ_IDLE;
				701	if (iocb->ki_flags & IOCB_DSYNC)
				702	bio->bi_opf \|= REQ_FUA;
				703
				704	ret = bio_iov_iter_get_pages(bio, from);
Damien Le Moal	6bea022	2020-12-09 20:16:10 +0900	[diff] [blame]	705	if (unlikely(ret))
				706	goto out_release;
				707
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	708	size = bio->bi_iter.bi_size;
Damien Le Moal	6bea022	2020-12-09 20:16:10 +0900	[diff] [blame]	709	task_io_account_write(size);
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	710
				711	if (iocb->ki_flags & IOCB_HIPRI)
				712	bio_set_polled(bio, iocb);
				713
				714	ret = submit_bio_wait(bio);
				715
Damien Le Moal	6bea022	2020-12-09 20:16:10 +0900	[diff] [blame]	716	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
Johannes Thumshirn	62ab1aa	2021-01-27 05:21:15 +0900	[diff] [blame]	717	trace_zonefs_file_dio_append(inode, size, ret);
Damien Le Moal	6bea022	2020-12-09 20:16:10 +0900	[diff] [blame]	718
				719	out_release:
				720	bio_release_pages(bio, false);
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	721	bio_put(bio);
				722
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	723	if (ret >= 0) {
				724	iocb->ki_pos += size;
				725	return size;
				726	}
				727
				728	return ret;
				729	}
				730
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	731	/*
Damien Le Moal	ebfd68c	2021-03-10 15:20:28 +0900	[diff] [blame]	732	* Do not exceed the LFS limits nor the file zone size. If pos is under the
				733	* limit it becomes a short access. If it exceeds the limit, return -EFBIG.
				734	*/
				735	static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
				736	loff_t count)
				737	{
				738	struct inode *inode = file_inode(file);
				739	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				740	loff_t limit = rlimit(RLIMIT_FSIZE);
				741	loff_t max_size = zi->i_max_size;
				742
				743	if (limit != RLIM_INFINITY) {
				744	if (pos >= limit) {
				745	send_sig(SIGXFSZ, current, 0);
				746	return -EFBIG;
				747	}
				748	count = min(count, limit - pos);
				749	}
				750
				751	if (!(file->f_flags & O_LARGEFILE))
				752	max_size = min_t(loff_t, MAX_NON_LFS, max_size);
				753
				754	if (unlikely(pos >= max_size))
				755	return -EFBIG;
				756
				757	return min(count, max_size - pos);
				758	}
				759
				760	static ssize_t zonefs_write_checks(struct kiocb iocb, struct iov_iter from)
				761	{
				762	struct file *file = iocb->ki_filp;
				763	struct inode *inode = file_inode(file);
				764	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				765	loff_t count;
				766
				767	if (IS_SWAPFILE(inode))
				768	return -ETXTBSY;
				769
				770	if (!iov_iter_count(from))
				771	return 0;
				772
				773	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
				774	return -EINVAL;
				775
				776	if (iocb->ki_flags & IOCB_APPEND) {
				777	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
				778	return -EINVAL;
				779	mutex_lock(&zi->i_truncate_mutex);
				780	iocb->ki_pos = zi->i_wpoffset;
				781	mutex_unlock(&zi->i_truncate_mutex);
				782	}
				783
				784	count = zonefs_write_check_limits(file, iocb->ki_pos,
				785	iov_iter_count(from));
				786	if (count < 0)
				787	return count;
				788
				789	iov_iter_truncate(from, count);
				790	return iov_iter_count(from);
				791	}
				792
				793	/*
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	794	* Handle direct writes. For sequential zone files, this is the only possible
				795	* write path. For these files, check that the user is issuing writes
				796	* sequentially from the end of the file. This code assumes that the block layer
				797	* delivers write requests to the device in sequential order. This is always the
				798	* case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
				799	* elevator feature is being used (e.g. mq-deadline). The block layer always
				800	* automatically select such an elevator for zoned block devices during the
				801	* device initialization.
				802	*/
				803	static ssize_t zonefs_file_dio_write(struct kiocb iocb, struct iov_iter from)
				804	{
				805	struct inode *inode = file_inode(iocb->ki_filp);
				806	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				807	struct super_block *sb = inode->i_sb;
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	808	bool sync = is_sync_kiocb(iocb);
				809	bool append = false;
Damien Le Moal	ebfd68c	2021-03-10 15:20:28 +0900	[diff] [blame]	810	ssize_t ret, count;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	811
				812	/*
Christoph Hellwig	7c69eb8	2020-02-21 06:37:23 -0800	[diff] [blame]	813	* For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	814	* as this can cause write reordering (e.g. the first aio gets EAGAIN
				815	* on the inode lock but the second goes through but is now unaligned).
				816	*/
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	817	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
Christoph Hellwig	7c69eb8	2020-02-21 06:37:23 -0800	[diff] [blame]	818	(iocb->ki_flags & IOCB_NOWAIT))
				819	return -EOPNOTSUPP;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	820
				821	if (iocb->ki_flags & IOCB_NOWAIT) {
				822	if (!inode_trylock(inode))
				823	return -EAGAIN;
				824	} else {
				825	inode_lock(inode);
				826	}
				827
Damien Le Moal	ebfd68c	2021-03-10 15:20:28 +0900	[diff] [blame]	828	count = zonefs_write_checks(iocb, from);
				829	if (count <= 0) {
				830	ret = count;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	831	goto inode_unlock;
Damien Le Moal	ebfd68c	2021-03-10 15:20:28 +0900	[diff] [blame]	832	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	833
				834	if ((iocb->ki_pos \| count) & (sb->s_blocksize - 1)) {
				835	ret = -EINVAL;
				836	goto inode_unlock;
				837	}
				838
				839	/* Enforce sequential writes (append only) in sequential zones */
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	840	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
				841	mutex_lock(&zi->i_truncate_mutex);
				842	if (iocb->ki_pos != zi->i_wpoffset) {
				843	mutex_unlock(&zi->i_truncate_mutex);
				844	ret = -EINVAL;
				845	goto inode_unlock;
				846	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	847	mutex_unlock(&zi->i_truncate_mutex);
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	848	append = sync;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	849	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	850
Johannes Thumshirn	02ef12a	2020-05-12 17:55:54 +0900	[diff] [blame]	851	if (append)
				852	ret = zonefs_file_dio_append(iocb, from);
				853	else
				854	ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
Andreas Gruenbacher	4fdccaa	2021-07-24 12:26:41 +0200	[diff] [blame]	855	&zonefs_write_dio_ops, 0, 0);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	856	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
				857	(ret > 0 \|\| ret == -EIOCBQUEUED)) {
				858	if (ret > 0)
				859	count = ret;
				860	mutex_lock(&zi->i_truncate_mutex);
				861	zi->i_wpoffset += count;
				862	mutex_unlock(&zi->i_truncate_mutex);
				863	}
				864
				865	inode_unlock:
				866	inode_unlock(inode);
				867
				868	return ret;
				869	}
				870
				871	static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
				872	struct iov_iter *from)
				873	{
				874	struct inode *inode = file_inode(iocb->ki_filp);
				875	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				876	ssize_t ret;
				877
				878	/*
				879	* Direct IO writes are mandatory for sequential zone files so that the
				880	* write IO issuing order is preserved.
				881	*/
				882	if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
				883	return -EIO;
				884
				885	if (iocb->ki_flags & IOCB_NOWAIT) {
				886	if (!inode_trylock(inode))
				887	return -EAGAIN;
				888	} else {
				889	inode_lock(inode);
				890	}
				891
Damien Le Moal	ebfd68c	2021-03-10 15:20:28 +0900	[diff] [blame]	892	ret = zonefs_write_checks(iocb, from);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	893	if (ret <= 0)
				894	goto inode_unlock;
				895
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	896	ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
				897	if (ret > 0)
				898	iocb->ki_pos += ret;
				899	else if (ret == -EIO)
				900	zonefs_io_error(inode, true);
				901
				902	inode_unlock:
				903	inode_unlock(inode);
				904	if (ret > 0)
				905	ret = generic_write_sync(iocb, ret);
				906
				907	return ret;
				908	}
				909
				910	static ssize_t zonefs_file_write_iter(struct kiocb iocb, struct iov_iter from)
				911	{
				912	struct inode *inode = file_inode(iocb->ki_filp);
				913
				914	if (unlikely(IS_IMMUTABLE(inode)))
				915	return -EPERM;
				916
				917	if (sb_rdonly(inode->i_sb))
				918	return -EROFS;
				919
				920	/* Write operations beyond the zone size are not allowed */
				921	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
				922	return -EFBIG;
				923
Christoph Hellwig	60263d5	2020-07-23 22:45:59 -0700	[diff] [blame]	924	if (iocb->ki_flags & IOCB_DIRECT) {
				925	ssize_t ret = zonefs_file_dio_write(iocb, from);
				926	if (ret != -ENOTBLK)
				927	return ret;
				928	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	929
				930	return zonefs_file_buffered_write(iocb, from);
				931	}
				932
				933	static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				934	int error, unsigned int flags)
				935	{
				936	if (error) {
				937	zonefs_io_error(file_inode(iocb->ki_filp), false);
				938	return error;
				939	}
				940
				941	return 0;
				942	}
				943
				944	static const struct iomap_dio_ops zonefs_read_dio_ops = {
				945	.end_io = zonefs_file_read_dio_end_io,
				946	};
				947
				948	static ssize_t zonefs_file_read_iter(struct kiocb iocb, struct iov_iter to)
				949	{
				950	struct inode *inode = file_inode(iocb->ki_filp);
				951	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				952	struct super_block *sb = inode->i_sb;
				953	loff_t isize;
				954	ssize_t ret;
				955
				956	/* Offline zones cannot be read */
				957	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
				958	return -EPERM;
				959
				960	if (iocb->ki_pos >= zi->i_max_size)
				961	return 0;
				962
				963	if (iocb->ki_flags & IOCB_NOWAIT) {
				964	if (!inode_trylock_shared(inode))
				965	return -EAGAIN;
				966	} else {
				967	inode_lock_shared(inode);
				968	}
				969
				970	/* Limit read operations to written data */
				971	mutex_lock(&zi->i_truncate_mutex);
				972	isize = i_size_read(inode);
				973	if (iocb->ki_pos >= isize) {
				974	mutex_unlock(&zi->i_truncate_mutex);
				975	ret = 0;
				976	goto inode_unlock;
				977	}
				978	iov_iter_truncate(to, isize - iocb->ki_pos);
				979	mutex_unlock(&zi->i_truncate_mutex);
				980
				981	if (iocb->ki_flags & IOCB_DIRECT) {
				982	size_t count = iov_iter_count(to);
				983
				984	if ((iocb->ki_pos \| count) & (sb->s_blocksize - 1)) {
				985	ret = -EINVAL;
				986	goto inode_unlock;
				987	}
				988	file_accessed(iocb->ki_filp);
				989	ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
Andreas Gruenbacher	4fdccaa	2021-07-24 12:26:41 +0200	[diff] [blame]	990	&zonefs_read_dio_ops, 0, 0);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	991	} else {
				992	ret = generic_file_read_iter(iocb, to);
				993	if (ret == -EIO)
				994	zonefs_io_error(inode, false);
				995	}
				996
				997	inode_unlock:
				998	inode_unlock_shared(inode);
				999
				1000	return ret;
				1001	}
				1002
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1003	static inline bool zonefs_file_use_exp_open(struct inode inode, struct file file)
				1004	{
				1005	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				1006	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
				1007
				1008	if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
				1009	return false;
				1010
				1011	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
				1012	return false;
				1013
				1014	if (!(file->f_mode & FMODE_WRITE))
				1015	return false;
				1016
				1017	return true;
				1018	}
				1019
				1020	static int zonefs_open_zone(struct inode *inode)
				1021	{
				1022	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				1023	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
				1024	int ret = 0;
				1025
				1026	mutex_lock(&zi->i_truncate_mutex);
				1027
Chao Yu	6980d29	2021-03-16 20:30:26 +0800	[diff] [blame]	1028	if (!zi->i_wr_refcnt) {
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1029	if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
				1030	atomic_dec(&sbi->s_open_zones);
				1031	ret = -EBUSY;
				1032	goto unlock;
				1033	}
				1034
				1035	if (i_size_read(inode) < zi->i_max_size) {
				1036	ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
				1037	if (ret) {
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1038	atomic_dec(&sbi->s_open_zones);
				1039	goto unlock;
				1040	}
				1041	zi->i_flags \|= ZONEFS_ZONE_OPEN;
				1042	}
				1043	}
				1044
Chao Yu	6980d29	2021-03-16 20:30:26 +0800	[diff] [blame]	1045	zi->i_wr_refcnt++;
				1046
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1047	unlock:
				1048	mutex_unlock(&zi->i_truncate_mutex);
				1049
				1050	return ret;
				1051	}
				1052
				1053	static int zonefs_file_open(struct inode inode, struct file file)
				1054	{
				1055	int ret;
				1056
				1057	ret = generic_file_open(inode, file);
				1058	if (ret)
				1059	return ret;
				1060
				1061	if (zonefs_file_use_exp_open(inode, file))
				1062	return zonefs_open_zone(inode);
				1063
				1064	return 0;
				1065	}
				1066
				1067	static void zonefs_close_zone(struct inode *inode)
				1068	{
				1069	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				1070	int ret = 0;
				1071
				1072	mutex_lock(&zi->i_truncate_mutex);
				1073	zi->i_wr_refcnt--;
				1074	if (!zi->i_wr_refcnt) {
				1075	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
				1076	struct super_block *sb = inode->i_sb;
				1077
				1078	/*
				1079	* If the file zone is full, it is not open anymore and we only
				1080	* need to decrement the open count.
				1081	*/
				1082	if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
				1083	goto dec;
				1084
				1085	ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
				1086	if (ret) {
				1087	__zonefs_io_error(inode, false);
				1088	/*
				1089	* Leaving zones explicitly open may lead to a state
				1090	* where most zones cannot be written (zone resources
				1091	* exhausted). So take preventive action by remounting
				1092	* read-only.
				1093	*/
				1094	if (zi->i_flags & ZONEFS_ZONE_OPEN &&
				1095	!(sb->s_flags & SB_RDONLY)) {
				1096	zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
				1097	sb->s_flags \|= SB_RDONLY;
				1098	}
				1099	}
				1100	zi->i_flags &= ~ZONEFS_ZONE_OPEN;
				1101	dec:
				1102	atomic_dec(&sbi->s_open_zones);
				1103	}
				1104	mutex_unlock(&zi->i_truncate_mutex);
				1105	}
				1106
				1107	static int zonefs_file_release(struct inode inode, struct file file)
				1108	{
				1109	/*
				1110	* If we explicitly open a zone we must close it again as well, but the
				1111	* zone management operation can fail (either due to an IO error or as
				1112	* the zone has gone offline or read-only). Make sure we don't fail the
				1113	* close(2) for user-space.
				1114	*/
				1115	if (zonefs_file_use_exp_open(inode, file))
				1116	zonefs_close_zone(inode);
				1117
				1118	return 0;
				1119	}
				1120
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1121	static const struct file_operations zonefs_file_operations = {
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1122	.open = zonefs_file_open,
				1123	.release = zonefs_file_release,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1124	.fsync = zonefs_file_fsync,
				1125	.mmap = zonefs_file_mmap,
				1126	.llseek = zonefs_file_llseek,
				1127	.read_iter = zonefs_file_read_iter,
				1128	.write_iter = zonefs_file_write_iter,
				1129	.splice_read = generic_file_splice_read,
				1130	.splice_write = iter_file_splice_write,
Christoph Hellwig	3e08773	2021-10-12 13:12:24 +0200	[diff] [blame]	1131	.iopoll = iocb_bio_iopoll,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1132	};
				1133
				1134	static struct kmem_cache *zonefs_inode_cachep;
				1135
				1136	static struct inode zonefs_alloc_inode(struct super_block sb)
				1137	{
				1138	struct zonefs_inode_info *zi;
				1139
				1140	zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL);
				1141	if (!zi)
				1142	return NULL;
				1143
				1144	inode_init_once(&zi->i_vnode);
				1145	mutex_init(&zi->i_truncate_mutex);
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1146	zi->i_wr_refcnt = 0;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1147
				1148	return &zi->i_vnode;
				1149	}
				1150
				1151	static void zonefs_free_inode(struct inode *inode)
				1152	{
				1153	kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
				1154	}
				1155
				1156	/*
				1157	* File system stat.
				1158	*/
				1159	static int zonefs_statfs(struct dentry dentry, struct kstatfs buf)
				1160	{
				1161	struct super_block *sb = dentry->d_sb;
				1162	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1163	enum zonefs_ztype t;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1164
				1165	buf->f_type = ZONEFS_MAGIC;
				1166	buf->f_bsize = sb->s_blocksize;
				1167	buf->f_namelen = ZONEFS_NAME_MAX;
				1168
				1169	spin_lock(&sbi->s_lock);
				1170
				1171	buf->f_blocks = sbi->s_blocks;
				1172	if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
				1173	buf->f_bfree = 0;
				1174	else
				1175	buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
				1176	buf->f_bavail = buf->f_bfree;
				1177
				1178	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
				1179	if (sbi->s_nr_files[t])
				1180	buf->f_files += sbi->s_nr_files[t] + 1;
				1181	}
				1182	buf->f_ffree = 0;
				1183
				1184	spin_unlock(&sbi->s_lock);
				1185
Amir Goldstein	9591c3a	2021-03-22 19:39:43 +0200	[diff] [blame]	1186	buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1187
				1188	return 0;
				1189	}
				1190
				1191	enum {
				1192	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1193	Opt_explicit_open, Opt_err,
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1194	};
				1195
				1196	static const match_table_t tokens = {
				1197	{ Opt_errors_ro, "errors=remount-ro"},
				1198	{ Opt_errors_zro, "errors=zone-ro"},
				1199	{ Opt_errors_zol, "errors=zone-offline"},
				1200	{ Opt_errors_repair, "errors=repair"},
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1201	{ Opt_explicit_open, "explicit-open" },
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1202	{ Opt_err, NULL}
				1203	};
				1204
				1205	static int zonefs_parse_options(struct super_block sb, char options)
				1206	{
				1207	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1208	substring_t args[MAX_OPT_ARGS];
				1209	char *p;
				1210
				1211	if (!options)
				1212	return 0;
				1213
				1214	while ((p = strsep(&options, ",")) != NULL) {
				1215	int token;
				1216
				1217	if (!*p)
				1218	continue;
				1219
				1220	token = match_token(p, tokens, args);
				1221	switch (token) {
				1222	case Opt_errors_ro:
				1223	sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
				1224	sbi->s_mount_opts \|= ZONEFS_MNTOPT_ERRORS_RO;
				1225	break;
				1226	case Opt_errors_zro:
				1227	sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
				1228	sbi->s_mount_opts \|= ZONEFS_MNTOPT_ERRORS_ZRO;
				1229	break;
				1230	case Opt_errors_zol:
				1231	sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
				1232	sbi->s_mount_opts \|= ZONEFS_MNTOPT_ERRORS_ZOL;
				1233	break;
				1234	case Opt_errors_repair:
				1235	sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
				1236	sbi->s_mount_opts \|= ZONEFS_MNTOPT_ERRORS_REPAIR;
				1237	break;
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1238	case Opt_explicit_open:
				1239	sbi->s_mount_opts \|= ZONEFS_MNTOPT_EXPLICIT_OPEN;
				1240	break;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1241	default:
				1242	return -EINVAL;
				1243	}
				1244	}
				1245
				1246	return 0;
				1247	}
				1248
				1249	static int zonefs_show_options(struct seq_file seq, struct dentry root)
				1250	{
				1251	struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);
				1252
				1253	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
				1254	seq_puts(seq, ",errors=remount-ro");
				1255	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
				1256	seq_puts(seq, ",errors=zone-ro");
				1257	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
				1258	seq_puts(seq, ",errors=zone-offline");
				1259	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
				1260	seq_puts(seq, ",errors=repair");
				1261
				1262	return 0;
				1263	}
				1264
				1265	static int zonefs_remount(struct super_block sb, int flags, char *data)
				1266	{
				1267	sync_filesystem(sb);
				1268
				1269	return zonefs_parse_options(sb, data);
				1270	}
				1271
				1272	static const struct super_operations zonefs_sops = {
				1273	.alloc_inode = zonefs_alloc_inode,
				1274	.free_inode = zonefs_free_inode,
				1275	.statfs = zonefs_statfs,
				1276	.remount_fs = zonefs_remount,
				1277	.show_options = zonefs_show_options,
				1278	};
				1279
				1280	static const struct inode_operations zonefs_dir_inode_operations = {
				1281	.lookup = simple_lookup,
				1282	.setattr = zonefs_inode_setattr,
				1283	};
				1284
				1285	static void zonefs_init_dir_inode(struct inode parent, struct inode inode,
				1286	enum zonefs_ztype type)
				1287	{
				1288	struct super_block *sb = parent->i_sb;
				1289
				1290	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
Christian Brauner	21cb47b	2021-01-21 14:19:25 +0100	[diff] [blame]	1291	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR \| 0555);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1292	inode->i_op = &zonefs_dir_inode_operations;
				1293	inode->i_fop = &simple_dir_operations;
				1294	set_nlink(inode, 2);
				1295	inc_nlink(parent);
				1296	}
				1297
				1298	static void zonefs_init_file_inode(struct inode inode, struct blk_zone zone,
				1299	enum zonefs_ztype type)
				1300	{
				1301	struct super_block *sb = inode->i_sb;
				1302	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1303	struct zonefs_inode_info *zi = ZONEFS_I(inode);
				1304
				1305	inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
				1306	inode->i_mode = S_IFREG \| sbi->s_perm;
				1307
				1308	zi->i_ztype = type;
				1309	zi->i_zsector = zone->start;
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	1310	zi->i_zone_size = zone->len << SECTOR_SHIFT;
				1311
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1312	zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	1313	zone->capacity << SECTOR_SHIFT);
Damien Le Moal	ccf4ad7	2020-03-20 21:36:54 +0900	[diff] [blame]	1314	zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1315
				1316	inode->i_uid = sbi->s_uid;
				1317	inode->i_gid = sbi->s_gid;
				1318	inode->i_size = zi->i_wpoffset;
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	1319	inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1320
				1321	inode->i_op = &zonefs_file_inode_operations;
				1322	inode->i_fop = &zonefs_file_operations;
				1323	inode->i_mapping->a_ops = &zonefs_file_aops;
				1324
				1325	sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
				1326	sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
				1327	sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
				1328	}
				1329
				1330	static struct dentry zonefs_create_inode(struct dentry parent,
				1331	const char name, struct blk_zone zone,
				1332	enum zonefs_ztype type)
				1333	{
				1334	struct inode *dir = d_inode(parent);
				1335	struct dentry *dentry;
				1336	struct inode *inode;
				1337
				1338	dentry = d_alloc_name(parent, name);
				1339	if (!dentry)
				1340	return NULL;
				1341
				1342	inode = new_inode(parent->d_sb);
				1343	if (!inode)
				1344	goto dput;
				1345
				1346	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
				1347	if (zone)
				1348	zonefs_init_file_inode(inode, zone, type);
				1349	else
				1350	zonefs_init_dir_inode(dir, inode, type);
				1351	d_add(dentry, inode);
				1352	dir->i_size++;
				1353
				1354	return dentry;
				1355
				1356	dput:
				1357	dput(dentry);
				1358
				1359	return NULL;
				1360	}
				1361
				1362	struct zonefs_zone_data {
				1363	struct super_block *sb;
				1364	unsigned int nr_zones[ZONEFS_ZTYPE_MAX];
				1365	struct blk_zone *zones;
				1366	};
				1367
				1368	/*
				1369	* Create a zone group and populate it with zone files.
				1370	*/
				1371	static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
				1372	enum zonefs_ztype type)
				1373	{
				1374	struct super_block *sb = zd->sb;
				1375	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1376	struct blk_zone zone, next, *end;
				1377	const char *zgroup_name;
				1378	char *file_name;
				1379	struct dentry *dir;
				1380	unsigned int n = 0;
Damien Le Moal	01b2651c	2020-07-20 17:52:52 +0900	[diff] [blame]	1381	int ret;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1382
				1383	/* If the group is empty, there is nothing to do */
				1384	if (!zd->nr_zones[type])
				1385	return 0;
				1386
				1387	file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
				1388	if (!file_name)
				1389	return -ENOMEM;
				1390
				1391	if (type == ZONEFS_ZTYPE_CNV)
				1392	zgroup_name = "cnv";
				1393	else
				1394	zgroup_name = "seq";
				1395
				1396	dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
Damien Le Moal	01b2651c	2020-07-20 17:52:52 +0900	[diff] [blame]	1397	if (!dir) {
				1398	ret = -ENOMEM;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1399	goto free;
Damien Le Moal	01b2651c	2020-07-20 17:52:52 +0900	[diff] [blame]	1400	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1401
				1402	/*
				1403	* The first zone contains the super block: skip it.
				1404	*/
				1405	end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
				1406	for (zone = &zd->zones[1]; zone < end; zone = next) {
				1407
				1408	next = zone + 1;
				1409	if (zonefs_zone_type(zone) != type)
				1410	continue;
				1411
				1412	/*
				1413	* For conventional zones, contiguous zones can be aggregated
				1414	* together to form larger files. Note that this overwrites the
				1415	* length of the first zone of the set of contiguous zones
				1416	* aggregated together. If one offline or read-only zone is
				1417	* found, assume that all zones aggregated have the same
				1418	* condition.
				1419	*/
				1420	if (type == ZONEFS_ZTYPE_CNV &&
				1421	(sbi->s_features & ZONEFS_F_AGGRCNV)) {
				1422	for (; next < end; next++) {
				1423	if (zonefs_zone_type(next) != type)
				1424	break;
				1425	zone->len += next->len;
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	1426	zone->capacity += next->capacity;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1427	if (next->cond == BLK_ZONE_COND_READONLY &&
				1428	zone->cond != BLK_ZONE_COND_OFFLINE)
				1429	zone->cond = BLK_ZONE_COND_READONLY;
				1430	else if (next->cond == BLK_ZONE_COND_OFFLINE)
				1431	zone->cond = BLK_ZONE_COND_OFFLINE;
				1432	}
Johannes Thumshirn	e3c3155	2020-07-21 21:10:26 +0900	[diff] [blame]	1433	if (zone->capacity != zone->len) {
				1434	zonefs_err(sb, "Invalid conventional zone capacity\n");
				1435	ret = -EINVAL;
				1436	goto free;
				1437	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1438	}
				1439
				1440	/*
				1441	* Use the file number within its group as file name.
				1442	*/
				1443	snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
Damien Le Moal	01b2651c	2020-07-20 17:52:52 +0900	[diff] [blame]	1444	if (!zonefs_create_inode(dir, file_name, zone, type)) {
				1445	ret = -ENOMEM;
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1446	goto free;
Damien Le Moal	01b2651c	2020-07-20 17:52:52 +0900	[diff] [blame]	1447	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1448
				1449	n++;
				1450	}
				1451
				1452	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
				1453	zgroup_name, n, n > 1 ? "s" : "");
				1454
				1455	sbi->s_nr_files[type] = n;
				1456	ret = 0;
				1457
				1458	free:
				1459	kfree(file_name);
				1460
				1461	return ret;
				1462	}
				1463
				1464	static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
				1465	void *data)
				1466	{
				1467	struct zonefs_zone_data *zd = data;
				1468
				1469	/*
				1470	* Count the number of usable zones: the first zone at index 0 contains
				1471	* the super block and is ignored.
				1472	*/
				1473	switch (zone->type) {
				1474	case BLK_ZONE_TYPE_CONVENTIONAL:
				1475	zone->wp = zone->start + zone->len;
				1476	if (idx)
				1477	zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
				1478	break;
				1479	case BLK_ZONE_TYPE_SEQWRITE_REQ:
				1480	case BLK_ZONE_TYPE_SEQWRITE_PREF:
				1481	if (idx)
				1482	zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
				1483	break;
				1484	default:
				1485	zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
				1486	zone->type);
				1487	return -EIO;
				1488	}
				1489
				1490	memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));
				1491
				1492	return 0;
				1493	}
				1494
				1495	static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
				1496	{
				1497	struct block_device *bdev = zd->sb->s_bdev;
				1498	int ret;
				1499
				1500	zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
				1501	sizeof(struct blk_zone), GFP_KERNEL);
				1502	if (!zd->zones)
				1503	return -ENOMEM;
				1504
				1505	/* Get zones information from the device */
				1506	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				1507	zonefs_get_zone_info_cb, zd);
				1508	if (ret < 0) {
				1509	zonefs_err(zd->sb, "Zone report failed %d\n", ret);
				1510	return ret;
				1511	}
				1512
				1513	if (ret != blkdev_nr_zones(bdev->bd_disk)) {
				1514	zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
				1515	ret, blkdev_nr_zones(bdev->bd_disk));
				1516	return -EIO;
				1517	}
				1518
				1519	return 0;
				1520	}
				1521
				1522	static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
				1523	{
				1524	kvfree(zd->zones);
				1525	}
				1526
				1527	/*
				1528	* Read super block information from the device.
				1529	*/
				1530	static int zonefs_read_super(struct super_block *sb)
				1531	{
				1532	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1533	struct zonefs_super *super;
				1534	u32 crc, stored_crc;
				1535	struct page *page;
				1536	struct bio_vec bio_vec;
				1537	struct bio bio;
				1538	int ret;
				1539
				1540	page = alloc_page(GFP_KERNEL);
				1541	if (!page)
				1542	return -ENOMEM;
				1543
				1544	bio_init(&bio, &bio_vec, 1);
				1545	bio.bi_iter.bi_sector = 0;
				1546	bio.bi_opf = REQ_OP_READ;
				1547	bio_set_dev(&bio, sb->s_bdev);
				1548	bio_add_page(&bio, page, PAGE_SIZE, 0);
				1549
				1550	ret = submit_bio_wait(&bio);
				1551	if (ret)
				1552	goto free_page;
				1553
				1554	super = kmap(page);
				1555
				1556	ret = -EINVAL;
				1557	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
				1558	goto unmap;
				1559
				1560	stored_crc = le32_to_cpu(super->s_crc);
				1561	super->s_crc = 0;
				1562	crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super));
				1563	if (crc != stored_crc) {
				1564	zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
				1565	crc, stored_crc);
				1566	goto unmap;
				1567	}
				1568
				1569	sbi->s_features = le64_to_cpu(super->s_features);
				1570	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
				1571	zonefs_err(sb, "Unknown features set 0x%llx\n",
				1572	sbi->s_features);
				1573	goto unmap;
				1574	}
				1575
				1576	if (sbi->s_features & ZONEFS_F_UID) {
				1577	sbi->s_uid = make_kuid(current_user_ns(),
				1578	le32_to_cpu(super->s_uid));
				1579	if (!uid_valid(sbi->s_uid)) {
				1580	zonefs_err(sb, "Invalid UID feature\n");
				1581	goto unmap;
				1582	}
				1583	}
				1584
				1585	if (sbi->s_features & ZONEFS_F_GID) {
				1586	sbi->s_gid = make_kgid(current_user_ns(),
				1587	le32_to_cpu(super->s_gid));
				1588	if (!gid_valid(sbi->s_gid)) {
				1589	zonefs_err(sb, "Invalid GID feature\n");
				1590	goto unmap;
				1591	}
				1592	}
				1593
				1594	if (sbi->s_features & ZONEFS_F_PERM)
				1595	sbi->s_perm = le32_to_cpu(super->s_perm);
				1596
				1597	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
				1598	zonefs_err(sb, "Reserved area is being used\n");
				1599	goto unmap;
				1600	}
				1601
Andy Shevchenko	568776f	2020-04-23 18:32:11 +0300	[diff] [blame]	1602	import_uuid(&sbi->s_uuid, super->s_uuid);
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1603	ret = 0;
				1604
				1605	unmap:
				1606	kunmap(page);
				1607	free_page:
				1608	__free_page(page);
				1609
				1610	return ret;
				1611	}
				1612
				1613	/*
				1614	* Check that the device is zoned. If it is, get the list of zones and create
				1615	* sub-directories and files according to the device zone configuration and
				1616	* format options.
				1617	*/
				1618	static int zonefs_fill_super(struct super_block sb, void data, int silent)
				1619	{
				1620	struct zonefs_zone_data zd;
				1621	struct zonefs_sb_info *sbi;
				1622	struct inode *inode;
				1623	enum zonefs_ztype t;
				1624	int ret;
				1625
				1626	if (!bdev_is_zoned(sb->s_bdev)) {
				1627	zonefs_err(sb, "Not a zoned block device\n");
				1628	return -EINVAL;
				1629	}
				1630
				1631	/*
				1632	* Initialize super block information: the maximum file size is updated
				1633	* when the zone files are created so that the format option
				1634	* ZONEFS_F_AGGRCNV which increases the maximum file size of a file
				1635	* beyond the zone size is taken into account.
				1636	*/
				1637	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
				1638	if (!sbi)
				1639	return -ENOMEM;
				1640
				1641	spin_lock_init(&sbi->s_lock);
				1642	sb->s_fs_info = sbi;
				1643	sb->s_magic = ZONEFS_MAGIC;
				1644	sb->s_maxbytes = 0;
				1645	sb->s_op = &zonefs_sops;
				1646	sb->s_time_gran = 1;
				1647
				1648	/*
Damien Le Moal	0f1ba5f	2021-01-28 13:47:31 +0900	[diff] [blame]	1649	* The block size is set to the device zone write granularity to ensure
				1650	* that write operations are always aligned according to the device
				1651	* interface constraints.
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1652	*/
Damien Le Moal	0f1ba5f	2021-01-28 13:47:31 +0900	[diff] [blame]	1653	sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1654	sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
				1655	sbi->s_uid = GLOBAL_ROOT_UID;
				1656	sbi->s_gid = GLOBAL_ROOT_GID;
				1657	sbi->s_perm = 0640;
				1658	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
Johannes Thumshirn	b5c00e9	2020-09-11 17:56:50 +0900	[diff] [blame]	1659	sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
				1660	atomic_set(&sbi->s_open_zones, 0);
				1661	if (!sbi->s_max_open_zones &&
				1662	sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
				1663	zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
				1664	sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
				1665	}
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1666
				1667	ret = zonefs_read_super(sb);
				1668	if (ret)
				1669	return ret;
				1670
				1671	ret = zonefs_parse_options(sb, data);
				1672	if (ret)
				1673	return ret;
				1674
				1675	memset(&zd, 0, sizeof(struct zonefs_zone_data));
				1676	zd.sb = sb;
				1677	ret = zonefs_get_zone_info(&zd);
				1678	if (ret)
				1679	goto cleanup;
				1680
				1681	zonefs_info(sb, "Mounting %u zones",
				1682	blkdev_nr_zones(sb->s_bdev->bd_disk));
				1683
				1684	/* Create root directory inode */
				1685	ret = -ENOMEM;
				1686	inode = new_inode(sb);
				1687	if (!inode)
				1688	goto cleanup;
				1689
				1690	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
				1691	inode->i_mode = S_IFDIR \| 0555;
				1692	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
				1693	inode->i_op = &zonefs_dir_inode_operations;
				1694	inode->i_fop = &simple_dir_operations;
				1695	set_nlink(inode, 2);
				1696
				1697	sb->s_root = d_make_root(inode);
				1698	if (!sb->s_root)
				1699	goto cleanup;
				1700
				1701	/* Create and populate files in zone groups directories */
				1702	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
				1703	ret = zonefs_create_zgroup(&zd, t);
				1704	if (ret)
				1705	break;
				1706	}
				1707
				1708	cleanup:
				1709	zonefs_cleanup_zone_info(&zd);
				1710
				1711	return ret;
				1712	}
				1713
				1714	static struct dentry zonefs_mount(struct file_system_type fs_type,
				1715	int flags, const char dev_name, void data)
				1716	{
				1717	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
				1718	}
				1719
				1720	static void zonefs_kill_super(struct super_block *sb)
				1721	{
				1722	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
				1723
				1724	if (sb->s_root)
				1725	d_genocide(sb->s_root);
				1726	kill_block_super(sb);
				1727	kfree(sbi);
				1728	}
				1729
				1730	/*
				1731	* File system definition and registration.
				1732	*/
				1733	static struct file_system_type zonefs_type = {
				1734	.owner = THIS_MODULE,
				1735	.name = "zonefs",
				1736	.mount = zonefs_mount,
				1737	.kill_sb = zonefs_kill_super,
				1738	.fs_flags = FS_REQUIRES_DEV,
				1739	};
				1740
				1741	static int __init zonefs_init_inodecache(void)
				1742	{
				1743	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
				1744	sizeof(struct zonefs_inode_info), 0,
				1745	(SLAB_RECLAIM_ACCOUNT \| SLAB_MEM_SPREAD \| SLAB_ACCOUNT),
				1746	NULL);
				1747	if (zonefs_inode_cachep == NULL)
				1748	return -ENOMEM;
				1749	return 0;
				1750	}
				1751
				1752	static void zonefs_destroy_inodecache(void)
				1753	{
				1754	/*
				1755	* Make sure all delayed rcu free inodes are flushed before we
				1756	* destroy the inode cache.
				1757	*/
				1758	rcu_barrier();
				1759	kmem_cache_destroy(zonefs_inode_cachep);
				1760	}
				1761
				1762	static int __init zonefs_init(void)
				1763	{
				1764	int ret;
				1765
				1766	BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);
				1767
				1768	ret = zonefs_init_inodecache();
				1769	if (ret)
				1770	return ret;
				1771
				1772	ret = register_filesystem(&zonefs_type);
				1773	if (ret) {
				1774	zonefs_destroy_inodecache();
				1775	return ret;
				1776	}
				1777
				1778	return 0;
				1779	}
				1780
				1781	static void __exit zonefs_exit(void)
				1782	{
				1783	zonefs_destroy_inodecache();
				1784	unregister_filesystem(&zonefs_type);
				1785	}
				1786
				1787	MODULE_AUTHOR("Damien Le Moal");
				1788	MODULE_DESCRIPTION("Zone file system for zoned block devices");
				1789	MODULE_LICENSE("GPL");
Naohiro Aota	8ffea25	2021-12-17 15:15:45 +0900	[diff] [blame]	1790	MODULE_ALIAS_FS("zonefs");
Damien Le Moal	8dcc1a9	2019-12-25 16:07:44 +0900	[diff] [blame]	1791	module_init(zonefs_init);
				1792	module_exit(zonefs_exit);