[PATCH] files: files struct with RCU
Patch to eliminate struct files_struct.file_lock spinlock on the reader side
and use rcu refcounting rcuref_xxx api for the f_count refcounter. The
updates to the fdtable are done by allocating a new fdtable structure and
setting files->fdt to point to the new structure. The fdtable structure is
protected by RCU thereby allowing lock-free lookup. For fd arrays/sets that
are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A
global list of fdtable to be freed is not scalable, so we use a per-cpu list.
If keventd is already handling the current cpu's work, we use a timer to defer
queueing of that work.
Since the last publication, this patch has been re-written to avoid using
explicit memory barriers and use rcu_assign_pointer(), rcu_dereference()
premitives instead. This required that the fd information is kept in a
separate structure (fdtable) and updated atomically.
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/linux/file.h b/include/linux/file.h
index db37223..f5bbd4c 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
/*
* The default fd array needs to be at least BITS_PER_LONG,
@@ -23,6 +24,9 @@
struct file ** fd; /* current fd array */
fd_set *close_on_exec;
fd_set *open_fds;
+ struct rcu_head rcu;
+ struct files_struct *free_files;
+ struct fdtable *next;
};
/*
@@ -31,13 +35,14 @@
struct files_struct {
atomic_t count;
spinlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */
+ struct fdtable *fdt;
struct fdtable fdtab;
fd_set close_on_exec_init;
fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
};
-#define files_fdtable(files) (&(files)->fdtab)
+#define files_fdtable(files) (rcu_dereference((files)->fdt))
extern void FASTCALL(__fput(struct file *));
extern void FASTCALL(fput(struct file *));
@@ -65,6 +70,8 @@
extern void free_fdset(fd_set *, int);
extern int expand_files(struct files_struct *, int nr);
+extern void free_fdtable(struct fdtable *fdt);
+extern void __init files_defer_init(void);
static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
{
@@ -72,7 +79,7 @@
struct fdtable *fdt = files_fdtable(files);
if (fd < fdt->max_fds)
- file = fdt->fd[fd];
+ file = rcu_dereference(fdt->fd[fd]);
return file;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd93ab7..7f61227 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -9,6 +9,7 @@
#include <linux/config.h>
#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/rcuref.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -597,12 +598,13 @@
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
+ struct rcu_head f_rcuhead;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
#define file_list_unlock() spin_unlock(&files_lock);
-#define get_file(x) atomic_inc(&(x)->f_count)
+#define get_file(x) rcuref_inc(&(x)->f_count)
#define file_count(x) atomic_read(&(x)->f_count)
#define MAX_NON_LFS ((1UL<<31) - 1)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94aefa5..68ab5f2 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -2,6 +2,7 @@
#define _LINUX__INIT_TASK_H
#include <linux/file.h>
+#include <linux/rcupdate.h>
#define INIT_FDTABLE \
{ \
@@ -11,12 +12,16 @@
.fd = &init_files.fd_array[0], \
.close_on_exec = &init_files.close_on_exec_init, \
.open_fds = &init_files.open_fds_init, \
+ .rcu = RCU_HEAD_INIT, \
+ .free_files = NULL, \
+ .next = NULL, \
}
#define INIT_FILES \
{ \
.count = ATOMIC_INIT(1), \
.file_lock = SPIN_LOCK_UNLOCKED, \
+ .fdt = &init_files.fdtab, \
.fdtab = INIT_FDTABLE, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \