Blame - fs/ceph/snap.c - SHIFTPHONES/mainline/linux

blob: 49d0c4c59d81b6cdc919b1b088dd9e8db92c355d [file] [log] [blame]

Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	3	#include <linux/sort.h>
				4
				5	#include "super.h"
				6	#include "decode.h"
				7
				8	/*
				9	* Snapshots in ceph are driven in large part by cooperation from the
				10	* client. In contrast to local file systems or file servers that
				11	* implement snapshots at a single point in the system, ceph's
				12	* distributed access to storage requires clients to help decide
				13	* whether a write logically occurs before or after a recently created
				14	* snapshot.
				15	*
				16	* This provides a perfect instantaneous client-wide snapshot. Between
				17	* clients, however, snapshots may appear to be applied at slightly
				18	* different points in time, depending on delays in delivering the
				19	* snapshot notification.
				20	*
				21	* Snapshots are _not_ file system-wide. Instead, each snapshot
				22	* applies to the subdirectory nested beneath some directory. This
				23	* effectively divides the hierarchy into multiple "realms," where all
				24	* of the files contained by each realm share the same set of
				25	* snapshots. An individual realm's snap set contains snapshots
				26	* explicitly created on that realm, as well as any snaps in its
				27	* parent's snap set _after_ the point at which the parent became its
				28	* parent (due to, say, a rename). Similarly, snaps from prior parents
				29	* during the time intervals during which they were the parent are included.
				30	*
				31	* The client is spared most of this detail, fortunately... it need only
				32	* maintain a hierarchy of realms reflecting the current parent/child
				33	* realm relationship, and for each realm has an explicit list of snaps
				34	* inherited from prior parents.
				35	*
				36	* A snap_realm struct is maintained for realms containing every inode
				37	* with an open cap in the system. (The needed snap realm information is
				38	* provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
				39	* version number is used to ensure that as realm parameters change (new
				40	* snapshot, new parent, etc.) the client's realm hierarchy is updated.
				41	*
				42	* The realm hierarchy drives the generation of a 'snap context' for each
				43	* realm, which simply lists the resulting set of snaps for the realm. This
				44	* is attached to any writes sent to OSDs.
				45	*/
				46	/*
				47	* Unfortunately error handling is a bit mixed here. If we get a snap
				48	* update, but don't have enough memory to update our realm hierarchy,
				49	* it's not clear what we can do about it (besides complaining to the
				50	* console).
				51	*/
				52
				53
				54	/*
				55	* increase ref count for the realm
				56	*
				57	* caller must hold snap_rwsem for write.
				58	*/
				59	void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
				60	struct ceph_snap_realm *realm)
				61	{
				62	dout("get_realm %p %d -> %d\n", realm,
				63	atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
				64	/*
				65	* since we _only_ increment realm refs or empty the empty
				66	* list with snap_rwsem held, adjusting the empty list here is
				67	* safe. we do need to protect against concurrent empty list
				68	* additions, however.
				69	*/
				70	if (atomic_read(&realm->nref) == 0) {
				71	spin_lock(&mdsc->snap_empty_lock);
				72	list_del_init(&realm->empty_item);
				73	spin_unlock(&mdsc->snap_empty_lock);
				74	}
				75
				76	atomic_inc(&realm->nref);
				77	}
				78
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	79	static void __insert_snap_realm(struct rb_root *root,
				80	struct ceph_snap_realm *new)
				81	{
				82	struct rb_node **p = &root->rb_node;
				83	struct rb_node *parent = NULL;
				84	struct ceph_snap_realm *r = NULL;
				85
				86	while (*p) {
				87	parent = *p;
				88	r = rb_entry(parent, struct ceph_snap_realm, node);
				89	if (new->ino < r->ino)
				90	p = &(*p)->rb_left;
				91	else if (new->ino > r->ino)
				92	p = &(*p)->rb_right;
				93	else
				94	BUG();
				95	}
				96
				97	rb_link_node(&new->node, parent, p);
				98	rb_insert_color(&new->node, root);
				99	}
				100
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	101	/*
				102	* create and get the realm rooted at @ino and bump its ref count.
				103	*
				104	* caller must hold snap_rwsem for write.
				105	*/
				106	static struct ceph_snap_realm *ceph_create_snap_realm(
				107	struct ceph_mds_client *mdsc,
				108	u64 ino)
				109	{
				110	struct ceph_snap_realm *realm;
				111
				112	realm = kzalloc(sizeof(*realm), GFP_NOFS);
				113	if (!realm)
				114	return ERR_PTR(-ENOMEM);
				115
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	116	atomic_set(&realm->nref, 0); /* tree does not take a ref */
				117	realm->ino = ino;
				118	INIT_LIST_HEAD(&realm->children);
				119	INIT_LIST_HEAD(&realm->child_item);
				120	INIT_LIST_HEAD(&realm->empty_item);
				121	INIT_LIST_HEAD(&realm->inodes_with_caps);
				122	spin_lock_init(&realm->inodes_with_caps_lock);
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	123	__insert_snap_realm(&mdsc->snap_realms, realm);
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	124	dout("create_snap_realm %llx %p\n", realm->ino, realm);
				125	return realm;
				126	}
				127
				128	/*
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	129	* lookup the realm rooted at @ino.
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	130	*
				131	* caller must hold snap_rwsem for write.
				132	*/
				133	struct ceph_snap_realm ceph_lookup_snap_realm(struct ceph_mds_client mdsc,
				134	u64 ino)
				135	{
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	136	struct rb_node *n = mdsc->snap_realms.rb_node;
				137	struct ceph_snap_realm *r;
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	138
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	139	while (n) {
				140	r = rb_entry(n, struct ceph_snap_realm, node);
				141	if (ino < r->ino)
				142	n = n->rb_left;
				143	else if (ino > r->ino)
				144	n = n->rb_right;
				145	else {
				146	dout("lookup_snap_realm %llx %p\n", r->ino, r);
				147	return r;
				148	}
				149	}
				150	return NULL;
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	151	}
				152
				153	static void __put_snap_realm(struct ceph_mds_client *mdsc,
				154	struct ceph_snap_realm *realm);
				155
				156	/*
				157	* called with snap_rwsem (write)
				158	*/
				159	static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
				160	struct ceph_snap_realm *realm)
				161	{
				162	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
				163
Sage Weil	a105f00	2010-02-15 14:37:55 -0800	[diff] [blame^]	164	rb_erase(&realm->node, &mdsc->snap_realms);
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	165
				166	if (realm->parent) {
				167	list_del_init(&realm->child_item);
				168	__put_snap_realm(mdsc, realm->parent);
				169	}
				170
				171	kfree(realm->prior_parent_snaps);
				172	kfree(realm->snaps);
				173	ceph_put_snap_context(realm->cached_context);
				174	kfree(realm);
				175	}
				176
				177	/*
				178	* caller holds snap_rwsem (write)
				179	*/
				180	static void __put_snap_realm(struct ceph_mds_client *mdsc,
				181	struct ceph_snap_realm *realm)
				182	{
				183	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
				184	atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
				185	if (atomic_dec_and_test(&realm->nref))
				186	__destroy_snap_realm(mdsc, realm);
				187	}
				188
				189	/*
				190	* caller needn't hold any locks
				191	*/
				192	void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
				193	struct ceph_snap_realm *realm)
				194	{
				195	dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
				196	atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
				197	if (!atomic_dec_and_test(&realm->nref))
				198	return;
				199
				200	if (down_write_trylock(&mdsc->snap_rwsem)) {
				201	__destroy_snap_realm(mdsc, realm);
				202	up_write(&mdsc->snap_rwsem);
				203	} else {
				204	spin_lock(&mdsc->snap_empty_lock);
				205	list_add(&mdsc->snap_empty, &realm->empty_item);
				206	spin_unlock(&mdsc->snap_empty_lock);
				207	}
				208	}
				209
				210	/*
				211	* Clean up any realms whose ref counts have dropped to zero. Note
				212	* that this does not include realms who were created but not yet
				213	* used.
				214	*
				215	* Called under snap_rwsem (write)
				216	*/
				217	static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
				218	{
				219	struct ceph_snap_realm *realm;
				220
				221	spin_lock(&mdsc->snap_empty_lock);
				222	while (!list_empty(&mdsc->snap_empty)) {
				223	realm = list_first_entry(&mdsc->snap_empty,
				224	struct ceph_snap_realm, empty_item);
				225	list_del(&realm->empty_item);
				226	spin_unlock(&mdsc->snap_empty_lock);
				227	__destroy_snap_realm(mdsc, realm);
				228	spin_lock(&mdsc->snap_empty_lock);
				229	}
				230	spin_unlock(&mdsc->snap_empty_lock);
				231	}
				232
				233	void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
				234	{
				235	down_write(&mdsc->snap_rwsem);
				236	__cleanup_empty_realms(mdsc);
				237	up_write(&mdsc->snap_rwsem);
				238	}
				239
				240	/*
				241	* adjust the parent realm of a given @realm. adjust child list, and parent
				242	* pointers, and ref counts appropriately.
				243	*
				244	* return true if parent was changed, 0 if unchanged, <0 on error.
				245	*
				246	* caller must hold snap_rwsem for write.
				247	*/
				248	static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
				249	struct ceph_snap_realm *realm,
				250	u64 parentino)
				251	{
				252	struct ceph_snap_realm *parent;
				253
				254	if (realm->parent_ino == parentino)
				255	return 0;
				256
				257	parent = ceph_lookup_snap_realm(mdsc, parentino);
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	258	if (!parent) {
				259	parent = ceph_create_snap_realm(mdsc, parentino);
				260	if (IS_ERR(parent))
				261	return PTR_ERR(parent);
				262	}
				263	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
				264	realm->ino, realm, realm->parent_ino, realm->parent,
				265	parentino, parent);
				266	if (realm->parent) {
				267	list_del_init(&realm->child_item);
				268	ceph_put_snap_realm(mdsc, realm->parent);
				269	}
				270	realm->parent_ino = parentino;
				271	realm->parent = parent;
				272	ceph_get_snap_realm(mdsc, parent);
				273	list_add(&realm->child_item, &parent->children);
				274	return 1;
				275	}
				276
				277
				278	static int cmpu64_rev(const void a, const void b)
				279	{
				280	if ((u64 )a < (u64 )b)
				281	return 1;
				282	if ((u64 )a > (u64 )b)
				283	return -1;
				284	return 0;
				285	}
				286
				287	/*
				288	* build the snap context for a given realm.
				289	*/
				290	static int build_snap_context(struct ceph_snap_realm *realm)
				291	{
				292	struct ceph_snap_realm *parent = realm->parent;
				293	struct ceph_snap_context *snapc;
				294	int err = 0;
				295	int i;
				296	int num = realm->num_prior_parent_snaps + realm->num_snaps;
				297
				298	/*
				299	* build parent context, if it hasn't been built.
				300	* conservatively estimate that all parent snaps might be
				301	* included by us.
				302	*/
				303	if (parent) {
				304	if (!parent->cached_context) {
				305	err = build_snap_context(parent);
				306	if (err)
				307	goto fail;
				308	}
				309	num += parent->cached_context->num_snaps;
				310	}
				311
				312	/* do i actually need to update? not if my context seq
				313	matches realm seq, and my parents' does to. (this works
				314	because we rebuild_snap_realms() works _downward_ in
				315	hierarchy after each update.) */
				316	if (realm->cached_context &&
				317	realm->cached_context->seq <= realm->seq &&
				318	(!parent \|\|
				319	realm->cached_context->seq <= parent->cached_context->seq)) {
				320	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
				321	" (unchanged)\n",
				322	realm->ino, realm, realm->cached_context,
				323	realm->cached_context->seq,
				324	realm->cached_context->num_snaps);
				325	return 0;
				326	}
				327
				328	/* alloc new snap context */
				329	err = -ENOMEM;
				330	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
				331	goto fail;
				332	snapc = kzalloc(sizeof(snapc) + numsizeof(u64), GFP_NOFS);
				333	if (!snapc)
				334	goto fail;
				335	atomic_set(&snapc->nref, 1);
				336
				337	/* build (reverse sorted) snap vector */
				338	num = 0;
				339	snapc->seq = realm->seq;
				340	if (parent) {
				341	/* include any of parent's snaps occuring _after_ my
				342	parent became my parent */
				343	for (i = 0; i < parent->cached_context->num_snaps; i++)
				344	if (parent->cached_context->snaps[i] >=
				345	realm->parent_since)
				346	snapc->snaps[num++] =
				347	parent->cached_context->snaps[i];
				348	if (parent->cached_context->seq > snapc->seq)
				349	snapc->seq = parent->cached_context->seq;
				350	}
				351	memcpy(snapc->snaps + num, realm->snaps,
				352	sizeof(u64)*realm->num_snaps);
				353	num += realm->num_snaps;
				354	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
				355	sizeof(u64)*realm->num_prior_parent_snaps);
				356	num += realm->num_prior_parent_snaps;
				357
				358	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
				359	snapc->num_snaps = num;
				360	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
				361	realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
				362
				363	if (realm->cached_context)
				364	ceph_put_snap_context(realm->cached_context);
				365	realm->cached_context = snapc;
				366	return 0;
				367
				368	fail:
				369	/*
				370	* if we fail, clear old (incorrect) cached_context... hopefully
				371	* we'll have better luck building it later
				372	*/
				373	if (realm->cached_context) {
				374	ceph_put_snap_context(realm->cached_context);
				375	realm->cached_context = NULL;
				376	}
				377	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
				378	realm, err);
				379	return err;
				380	}
				381
				382	/*
				383	* rebuild snap context for the given realm and all of its children.
				384	*/
				385	static void rebuild_snap_realms(struct ceph_snap_realm *realm)
				386	{
				387	struct ceph_snap_realm *child;
				388
				389	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
				390	build_snap_context(realm);
				391
				392	list_for_each_entry(child, &realm->children, child_item)
				393	rebuild_snap_realms(child);
				394	}
				395
				396
				397	/*
				398	* helper to allocate and decode an array of snapids. free prior
				399	* instance, if any.
				400	*/
				401	static int dup_array(u64 *dst, __le64 src, int num)
				402	{
				403	int i;
				404
				405	kfree(*dst);
				406	if (num) {
				407	*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
				408	if (!*dst)
				409	return -ENOMEM;
				410	for (i = 0; i < num; i++)
				411	(*dst)[i] = get_unaligned_le64(src + i);
				412	} else {
				413	*dst = NULL;
				414	}
				415	return 0;
				416	}
				417
				418
				419	/*
				420	* When a snapshot is applied, the size/mtime inode metadata is queued
				421	* in a ceph_cap_snap (one for each snapshot) until writeback
				422	* completes and the metadata can be flushed back to the MDS.
				423	*
				424	* However, if a (sync) write is currently in-progress when we apply
				425	* the snapshot, we have to wait until the write succeeds or fails
				426	* (and a final size/mtime is known). In this case the
				427	* cap_snap->writing = 1, and is said to be "pending." When the write
				428	* finishes, we __ceph_finish_cap_snap().
				429	*
				430	* Caller must hold snap_rwsem for read (i.e., the realm topology won't
				431	* change).
				432	*/
				433	void ceph_queue_cap_snap(struct ceph_inode_info *ci,
				434	struct ceph_snap_context *snapc)
				435	{
				436	struct inode *inode = &ci->vfs_inode;
				437	struct ceph_cap_snap *capsnap;
				438	int used;
				439
				440	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
				441	if (!capsnap) {
				442	pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
				443	return;
				444	}
				445
				446	spin_lock(&inode->i_lock);
				447	used = __ceph_caps_used(ci);
				448	if (__ceph_have_pending_cap_snap(ci)) {
				449	/* there is no point in queuing multiple "pending" cap_snaps,
				450	as no new writes are allowed to start when pending, so any
				451	writes in progress now were started before the previous
				452	cap_snap. lucky us. */
				453	dout("queue_cap_snap %p snapc %p seq %llu used %d"
				454	" already pending\n", inode, snapc, snapc->seq, used);
				455	kfree(capsnap);
				456	} else if (ci->i_wrbuffer_ref_head \|\| (used & CEPH_CAP_FILE_WR)) {
				457	igrab(inode);
				458
				459	atomic_set(&capsnap->nref, 1);
				460	capsnap->ci = ci;
				461	INIT_LIST_HEAD(&capsnap->ci_item);
				462	INIT_LIST_HEAD(&capsnap->flushing_item);
				463
				464	capsnap->follows = snapc->seq - 1;
				465	capsnap->context = ceph_get_snap_context(snapc);
				466	capsnap->issued = __ceph_caps_issued(ci, NULL);
				467	capsnap->dirty = __ceph_caps_dirty(ci);
				468
				469	capsnap->mode = inode->i_mode;
				470	capsnap->uid = inode->i_uid;
				471	capsnap->gid = inode->i_gid;
				472
				473	/* fixme? */
				474	capsnap->xattr_blob = NULL;
				475	capsnap->xattr_len = 0;
				476
				477	/* dirty page count moved from _head to this cap_snap;
				478	all subsequent writes page dirties occur _after_ this
				479	snapshot. */
				480	capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
				481	ci->i_wrbuffer_ref_head = 0;
				482	ceph_put_snap_context(ci->i_head_snapc);
				483	ci->i_head_snapc = NULL;
				484	list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
				485
				486	if (used & CEPH_CAP_FILE_WR) {
				487	dout("queue_cap_snap %p cap_snap %p snapc %p"
				488	" seq %llu used WR, now pending\n", inode,
				489	capsnap, snapc, snapc->seq);
				490	capsnap->writing = 1;
				491	} else {
				492	/* note mtime, size NOW. */
				493	__ceph_finish_cap_snap(ci, capsnap);
				494	}
				495	} else {
				496	dout("queue_cap_snap %p nothing dirty\|writing\n", inode);
				497	kfree(capsnap);
				498	}
				499
				500	spin_unlock(&inode->i_lock);
				501	}
				502
				503	/*
				504	* Finalize the size, mtime for a cap_snap.. that is, settle on final values
				505	* to be used for the snapshot, to be flushed back to the mds.
				506	*
				507	* If capsnap can now be flushed, add to snap_flush list, and return 1.
				508	*
				509	* Caller must hold i_lock.
				510	*/
				511	int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
				512	struct ceph_cap_snap *capsnap)
				513	{
				514	struct inode *inode = &ci->vfs_inode;
				515	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				516
				517	BUG_ON(capsnap->writing);
				518	capsnap->size = inode->i_size;
				519	capsnap->mtime = inode->i_mtime;
				520	capsnap->atime = inode->i_atime;
				521	capsnap->ctime = inode->i_ctime;
				522	capsnap->time_warp_seq = ci->i_time_warp_seq;
				523	if (capsnap->dirty_pages) {
				524	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
				525	"still has %d dirty pages\n", inode, capsnap,
				526	capsnap->context, capsnap->context->seq,
				527	capsnap->size, capsnap->dirty_pages);
				528	return 0;
				529	}
				530	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
				531	inode, capsnap, capsnap->context,
				532	capsnap->context->seq, capsnap->size);
				533
				534	spin_lock(&mdsc->snap_flush_lock);
				535	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
				536	spin_unlock(&mdsc->snap_flush_lock);
				537	return 1; /* caller may want to ceph_flush_snaps */
				538	}
				539
				540
				541	/*
				542	* Parse and apply a snapblob "snap trace" from the MDS. This specifies
				543	* the snap realm parameters from a given realm and all of its ancestors,
				544	* up to the root.
				545	*
				546	* Caller must hold snap_rwsem for write.
				547	*/
				548	int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
				549	void p, void e, bool deletion)
				550	{
				551	struct ceph_mds_snap_realm ri; / encoded */
				552	__le64 snaps; / encoded */
				553	__le64 prior_parent_snaps; / encoded */
				554	struct ceph_snap_realm *realm;
				555	int invalidate = 0;
				556	int err = -ENOMEM;
				557
				558	dout("update_snap_trace deletion=%d\n", deletion);
				559	more:
				560	ceph_decode_need(&p, e, sizeof(*ri), bad);
				561	ri = p;
				562	p += sizeof(*ri);
				563	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
				564	le32_to_cpu(ri->num_prior_parent_snaps)), bad);
				565	snaps = p;
				566	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
				567	prior_parent_snaps = p;
				568	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
				569
				570	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	571	if (!realm) {
				572	realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
				573	if (IS_ERR(realm)) {
				574	err = PTR_ERR(realm);
				575	goto fail;
				576	}
				577	}
				578
				579	if (le64_to_cpu(ri->seq) > realm->seq) {
				580	dout("update_snap_trace updating %llx %p %lld -> %lld\n",
				581	realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
				582	/*
				583	* if the realm seq has changed, queue a cap_snap for every
				584	* inode with open caps. we do this _before_ we update
				585	* the realm info so that we prepare for writeback under the
				586	* _previous_ snap context.
				587	*
				588	* ...unless it's a snap deletion!
				589	*/
				590	if (!deletion) {
				591	struct ceph_inode_info *ci;
				592	struct inode *lastinode = NULL;
				593
				594	spin_lock(&realm->inodes_with_caps_lock);
				595	list_for_each_entry(ci, &realm->inodes_with_caps,
				596	i_snap_realm_item) {
				597	struct inode *inode = igrab(&ci->vfs_inode);
				598	if (!inode)
				599	continue;
				600	spin_unlock(&realm->inodes_with_caps_lock);
				601	if (lastinode)
				602	iput(lastinode);
				603	lastinode = inode;
				604	ceph_queue_cap_snap(ci, realm->cached_context);
				605	spin_lock(&realm->inodes_with_caps_lock);
				606	}
				607	spin_unlock(&realm->inodes_with_caps_lock);
				608	if (lastinode)
				609	iput(lastinode);
				610	dout("update_snap_trace cap_snaps queued\n");
				611	}
				612
				613	} else {
				614	dout("update_snap_trace %llx %p seq %lld unchanged\n",
				615	realm->ino, realm, realm->seq);
				616	}
				617
				618	/* ensure the parent is correct */
				619	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
				620	if (err < 0)
				621	goto fail;
				622	invalidate += err;
				623
				624	if (le64_to_cpu(ri->seq) > realm->seq) {
				625	/* update realm parameters, snap lists */
				626	realm->seq = le64_to_cpu(ri->seq);
				627	realm->created = le64_to_cpu(ri->created);
				628	realm->parent_since = le64_to_cpu(ri->parent_since);
				629
				630	realm->num_snaps = le32_to_cpu(ri->num_snaps);
				631	err = dup_array(&realm->snaps, snaps, realm->num_snaps);
				632	if (err < 0)
				633	goto fail;
				634
				635	realm->num_prior_parent_snaps =
				636	le32_to_cpu(ri->num_prior_parent_snaps);
				637	err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
				638	realm->num_prior_parent_snaps);
				639	if (err < 0)
				640	goto fail;
				641
				642	invalidate = 1;
				643	} else if (!realm->cached_context) {
				644	invalidate = 1;
				645	}
				646
				647	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
				648	realm, invalidate, p, e);
				649
				650	if (p < e)
				651	goto more;
				652
				653	/* invalidate when we reach the _end_ (root) of the trace */
				654	if (invalidate)
				655	rebuild_snap_realms(realm);
				656
				657	__cleanup_empty_realms(mdsc);
				658	return 0;
				659
				660	bad:
				661	err = -EINVAL;
				662	fail:
				663	pr_err("update_snap_trace error %d\n", err);
				664	return err;
				665	}
				666
				667
				668	/*
				669	* Send any cap_snaps that are queued for flush. Try to carry
				670	* s_mutex across multiple snap flushes to avoid locking overhead.
				671	*
				672	* Caller holds no locks.
				673	*/
				674	static void flush_snaps(struct ceph_mds_client *mdsc)
				675	{
				676	struct ceph_inode_info *ci;
				677	struct inode *inode;
				678	struct ceph_mds_session *session = NULL;
				679
				680	dout("flush_snaps\n");
				681	spin_lock(&mdsc->snap_flush_lock);
				682	while (!list_empty(&mdsc->snap_flush_list)) {
				683	ci = list_first_entry(&mdsc->snap_flush_list,
				684	struct ceph_inode_info, i_snap_flush_item);
				685	inode = &ci->vfs_inode;
				686	igrab(inode);
				687	spin_unlock(&mdsc->snap_flush_lock);
				688	spin_lock(&inode->i_lock);
				689	__ceph_flush_snaps(ci, &session);
				690	spin_unlock(&inode->i_lock);
				691	iput(inode);
				692	spin_lock(&mdsc->snap_flush_lock);
				693	}
				694	spin_unlock(&mdsc->snap_flush_lock);
				695
				696	if (session) {
				697	mutex_unlock(&session->s_mutex);
				698	ceph_put_mds_session(session);
				699	}
				700	dout("flush_snaps done\n");
				701	}
				702
				703
				704	/*
				705	* Handle a snap notification from the MDS.
				706	*
				707	* This can take two basic forms: the simplest is just a snap creation
				708	* or deletion notification on an existing realm. This should update the
				709	* realm and its children.
				710	*
				711	* The more difficult case is realm creation, due to snap creation at a
				712	* new point in the file hierarchy, or due to a rename that moves a file or
				713	* directory into another realm.
				714	*/
				715	void ceph_handle_snap(struct ceph_mds_client *mdsc,
				716	struct ceph_msg *msg)
				717	{
				718	struct super_block *sb = mdsc->client->sb;
				719	struct ceph_mds_session *session;
				720	int mds;
				721	u64 split;
				722	int op;
				723	int trace_len;
				724	struct ceph_snap_realm *realm = NULL;
				725	void *p = msg->front.iov_base;
				726	void *e = p + msg->front.iov_len;
				727	struct ceph_mds_snap_head *h;
				728	int num_split_inos, num_split_realms;
				729	__le64 split_inos = NULL, split_realms = NULL;
				730	int i;
				731	int locked_rwsem = 0;
				732
				733	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
				734	return;
				735	mds = le64_to_cpu(msg->hdr.src.name.num);
				736
				737	/* decode */
				738	if (msg->front.iov_len < sizeof(*h))
				739	goto bad;
				740	h = p;
				741	op = le32_to_cpu(h->op);
				742	split = le64_to_cpu(h->split); /* non-zero if we are splitting an
				743	* existing realm */
				744	num_split_inos = le32_to_cpu(h->num_split_inos);
				745	num_split_realms = le32_to_cpu(h->num_split_realms);
				746	trace_len = le32_to_cpu(h->trace_len);
				747	p += sizeof(*h);
				748
				749	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
				750	ceph_snap_op_name(op), split, trace_len);
				751
				752	/* find session */
				753	mutex_lock(&mdsc->mutex);
				754	session = __ceph_lookup_mds_session(mdsc, mds);
				755	mutex_unlock(&mdsc->mutex);
				756	if (!session) {
				757	dout("WTF, got snap but no session for mds%d\n", mds);
				758	return;
				759	}
				760
				761	mutex_lock(&session->s_mutex);
				762	session->s_seq++;
				763	mutex_unlock(&session->s_mutex);
				764
				765	down_write(&mdsc->snap_rwsem);
				766	locked_rwsem = 1;
				767
				768	if (op == CEPH_SNAP_OP_SPLIT) {
				769	struct ceph_mds_snap_realm *ri;
				770
				771	/*
				772	* A "split" breaks part of an existing realm off into
				773	* a new realm. The MDS provides a list of inodes
				774	* (with caps) and child realms that belong to the new
				775	* child.
				776	*/
				777	split_inos = p;
				778	p += sizeof(u64) * num_split_inos;
				779	split_realms = p;
				780	p += sizeof(u64) * num_split_realms;
				781	ceph_decode_need(&p, e, sizeof(*ri), bad);
				782	/* we will peek at realm info here, but will _not_
				783	* advance p, as the realm update will occur below in
				784	* ceph_update_snap_trace. */
				785	ri = p;
				786
				787	realm = ceph_lookup_snap_realm(mdsc, split);
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	788	if (!realm) {
				789	realm = ceph_create_snap_realm(mdsc, split);
				790	if (IS_ERR(realm))
				791	goto out;
				792	}
				793	ceph_get_snap_realm(mdsc, realm);
				794
				795	dout("splitting snap_realm %llx %p\n", realm->ino, realm);
				796	for (i = 0; i < num_split_inos; i++) {
				797	struct ceph_vino vino = {
				798	.ino = le64_to_cpu(split_inos[i]),
				799	.snap = CEPH_NOSNAP,
				800	};
				801	struct inode *inode = ceph_find_inode(sb, vino);
				802	struct ceph_inode_info *ci;
				803
				804	if (!inode)
				805	continue;
				806	ci = ceph_inode(inode);
				807
				808	spin_lock(&inode->i_lock);
				809	if (!ci->i_snap_realm)
				810	goto skip_inode;
				811	/*
				812	* If this inode belongs to a realm that was
				813	* created after our new realm, we experienced
				814	* a race (due to another split notifications
				815	* arriving from a different MDS). So skip
				816	* this inode.
				817	*/
				818	if (ci->i_snap_realm->created >
				819	le64_to_cpu(ri->created)) {
				820	dout(" leaving %p in newer realm %llx %p\n",
				821	inode, ci->i_snap_realm->ino,
				822	ci->i_snap_realm);
				823	goto skip_inode;
				824	}
				825	dout(" will move %p to split realm %llx %p\n",
				826	inode, realm->ino, realm);
				827	/*
				828	* Remove the inode from the realm's inode
				829	* list, but don't add it to the new realm
				830	* yet. We don't want the cap_snap to be
				831	* queued (again) by ceph_update_snap_trace()
				832	* below. Queue it _now_, under the old context.
				833	*/
				834	list_del_init(&ci->i_snap_realm_item);
				835	spin_unlock(&inode->i_lock);
				836
				837	ceph_queue_cap_snap(ci,
				838	ci->i_snap_realm->cached_context);
				839
				840	iput(inode);
				841	continue;
				842
				843	skip_inode:
				844	spin_unlock(&inode->i_lock);
				845	iput(inode);
				846	}
				847
				848	/* we may have taken some of the old realm's children. */
				849	for (i = 0; i < num_split_realms; i++) {
				850	struct ceph_snap_realm *child =
				851	ceph_lookup_snap_realm(mdsc,
				852	le64_to_cpu(split_realms[i]));
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	853	if (!child)
				854	continue;
				855	adjust_snap_realm_parent(mdsc, child, realm->ino);
				856	}
				857	}
				858
				859	/*
				860	* update using the provided snap trace. if we are deleting a
				861	* snap, we can avoid queueing cap_snaps.
				862	*/
				863	ceph_update_snap_trace(mdsc, p, e,
				864	op == CEPH_SNAP_OP_DESTROY);
				865
				866	if (op == CEPH_SNAP_OP_SPLIT) {
				867	/*
				868	* ok, _now_ add the inodes into the new realm.
				869	*/
				870	for (i = 0; i < num_split_inos; i++) {
				871	struct ceph_vino vino = {
				872	.ino = le64_to_cpu(split_inos[i]),
				873	.snap = CEPH_NOSNAP,
				874	};
				875	struct inode *inode = ceph_find_inode(sb, vino);
				876	struct ceph_inode_info *ci;
				877
				878	if (!inode)
				879	continue;
				880	ci = ceph_inode(inode);
				881	spin_lock(&inode->i_lock);
				882	if (!ci->i_snap_realm)
				883	goto split_skip_inode;
				884	ceph_put_snap_realm(mdsc, ci->i_snap_realm);
				885	spin_lock(&realm->inodes_with_caps_lock);
				886	list_add(&ci->i_snap_realm_item,
				887	&realm->inodes_with_caps);
				888	ci->i_snap_realm = realm;
				889	spin_unlock(&realm->inodes_with_caps_lock);
				890	ceph_get_snap_realm(mdsc, realm);
				891	split_skip_inode:
				892	spin_unlock(&inode->i_lock);
				893	iput(inode);
				894	}
				895
				896	/* we took a reference when we created the realm, above */
				897	ceph_put_snap_realm(mdsc, realm);
				898	}
				899
				900	__cleanup_empty_realms(mdsc);
				901
				902	up_write(&mdsc->snap_rwsem);
				903
				904	flush_snaps(mdsc);
				905	return;
				906
				907	bad:
				908	pr_err("corrupt snap message from mds%d\n", mds);
Sage Weil	9ec7cab	2009-12-14 15:13:47 -0800	[diff] [blame]	909	ceph_msg_dump(msg);
Sage Weil	963b61e	2009-10-06 11:31:12 -0700	[diff] [blame]	910	out:
				911	if (locked_rwsem)
				912	up_write(&mdsc->snap_rwsem);
				913	return;
				914	}
				915
				916
				917