fsnotify: avoid spurious EMFILE errors from inotify_init()
Inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance). However marks are
freed only after SRCU period ends which can take some time and thus if
user rapidly creates and frees inotify instances, number of existing
inotify instances can exceed max_user_instances limit although from user
point of view there is always at most one existing instance. Thus
inotify_init() returns EMFILE error which is hard to justify from user
point of view. This problem is exposed by LTP inotify06 testcase on
some machines.
We fix the problem by making sure all group marks are properly freed
while destroying inotify instance. We wait for SRCU period to end in
that path anyway since we have to make sure there is no event being
added to the instance while we are tearing down the instance. So it
takes only some plumbing to allow for marks to be destroyed in that path
as well and not from a dedicated work item.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b44c68a..0a3bc2c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -56,6 +56,13 @@
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
&mnt->mnt_root->d_lock);
}
+/* prepare for freeing all marks associated with given group */
+extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
+/*
+ * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
+ */
+extern void fsnotify_mark_destroy_list(void);
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d16b62c..3e2dd85 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -47,12 +47,21 @@
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
- /* clear all inode marks for this group */
- fsnotify_clear_marks_by_group(group);
+ /* clear all inode marks for this group, attach them to destroy_list */
+ fsnotify_detach_group_marks(group);
- synchronize_srcu(&fsnotify_mark_srcu);
+ /*
+ * Wait for fsnotify_mark_srcu period to end and free all marks in
+ * destroy_list
+ */
+ fsnotify_mark_destroy_list();
- /* clear the notification queue of all events */
+ /*
+ * Since we have waited for fsnotify_mark_srcu in
+ * fsnotify_mark_destroy_list() there can be no outstanding event
+ * notification against this group. So clearing the notification queue
+ * of all events is reliable now.
+ */
fsnotify_flush_notify(group);
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 7115c5d..d3fea0b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -97,8 +97,8 @@
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static void fsnotify_mark_destroy(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
+static void fsnotify_mark_destroy_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -173,11 +173,15 @@
}
/*
- * Free fsnotify mark. The freeing is actually happening from a kthread which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
+ * Prepare mark for freeing and add it to the list of marks prepared for
+ * freeing. The actual freeing must happen after SRCU period ends and the
+ * caller is responsible for this.
+ *
+ * The function returns true if the mark was added to the list of marks for
+ * freeing. The function returns false if someone else has already called
+ * __fsnotify_free_mark() for the mark.
*/
-void fsnotify_free_mark(struct fsnotify_mark *mark)
+static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -185,17 +189,11 @@
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return;
+ return false;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
@@ -203,6 +201,25 @@
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
+
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+
+ return true;
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a workqueue which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ if (__fsnotify_free_mark(mark)) {
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+ }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -468,11 +485,29 @@
}
/*
- * Given a group, destroy all of the marks associated with that group.
+ * Given a group, prepare for freeing all the marks associated with that group.
+ * The marks are attached to the list of marks prepared for destruction, the
+ * caller is responsible for freeing marks in that list after SRCU period has
+ * ended.
*/
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_detach_group_marks(struct fsnotify_group *group)
{
- fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+ struct fsnotify_mark *mark;
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&group->marks_list)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&group->marks_list,
+ struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&group->mark_mutex);
+ __fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
}
void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
@@ -499,7 +534,11 @@
mark->free_mark = free_mark;
}
-static void fsnotify_mark_destroy(struct work_struct *work)
+/*
+ * Destroy all marks in destroy_list, waits for SRCU period to finish before
+ * actually freeing marks.
+ */
+void fsnotify_mark_destroy_list(void)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -516,3 +555,8 @@
fsnotify_put_mark(mark);
}
}
+
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+{
+ fsnotify_mark_destroy_list();
+}