/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/genhd.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>

static int nr_dax = CONFIG_NR_DEV_DAX;
module_param(nr_dax, int, S_IRUGO);
MODULE_PARM_DESC(nr_dax, "max number of dax device instances");

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
static struct hlist_head dax_host_list[DAX_HASH_SIZE];
static DEFINE_SPINLOCK(dax_host_lock);

int dax_read_lock(void)
{
	return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
	srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);

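/*
 * Example (illustrative sketch): the lock is SRCU-based, so the cookie
 * returned by dax_read_lock() must be passed back to the matching
 * dax_read_unlock(). A typical caller brackets device access like this
 * (see __bdev_dax_supported() below for an in-tree instance):
 *
 *	id = dax_read_lock();
 *	len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *	dax_read_unlock(id);
 */
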
int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
		pgoff_t *pgoff)
{
	phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;

	if (pgoff)
		*pgoff = PHYS_PFN(phys_off);
	if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
		return -EINVAL;
	return 0;
}
EXPORT_SYMBOL(bdev_dax_pgoff);

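/*
 * Worked example (hypothetical numbers, 4K PAGE_SIZE): a partition that
 * starts at sector 2048, with @sector == 8, yields
 * phys_off = (2048 + 8) * 512 = 1052672, so *pgoff = 257. A partition
 * starting at sector 2047 would produce a phys_off that is not page
 * aligned, and the helper would return -EINVAL.
 */
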
/**
 * __bdev_dax_supported() - Check if the device supports dax for filesystem
 * @sb: The superblock of the device
 * @blocksize: The block size of the device
 *
 * This is a library function for filesystems to check if the block device
 * can be mounted with the dax option.
 *
 * Return: negative errno if unsupported, 0 if supported.
 */
int __bdev_dax_supported(struct super_block *sb, int blocksize)
{
	struct block_device *bdev = sb->s_bdev;
	struct dax_device *dax_dev;
	pgoff_t pgoff;
	int err, id;
	void *kaddr;
	pfn_t pfn;
	long len;

	if (blocksize != PAGE_SIZE) {
		pr_err("VFS (%s): error: unsupported blocksize for dax\n",
				sb->s_id);
		return -EINVAL;
	}

	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
	if (err) {
		pr_err("VFS (%s): error: unaligned partition for dax\n",
				sb->s_id);
		return err;
	}

	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	if (!dax_dev) {
		pr_err("VFS (%s): error: device does not support dax\n",
				sb->s_id);
		return -EOPNOTSUPP;
	}

	id = dax_read_lock();
	len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
	dax_read_unlock(id);

	put_dax(dax_dev);

	if (len < 1) {
		pr_err("VFS (%s): error: dax access failed (%ld)\n",
				sb->s_id, len);
		return len < 0 ? len : -EIO;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);

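/*
 * Example (illustrative sketch): a filesystem would reach this check from
 * its mount path, via the bdev_dax_supported() wrapper in <linux/dax.h>,
 * before honoring a "-o dax" mount option. The option flag below is
 * hypothetical:
 *
 *	if (test_opt(sbi, DAX)) {
 *		err = bdev_dax_supported(sb, sb->s_blocksize);
 *		if (err)
 *			return err;
 *	}
 */
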
/**
 * struct dax_device - anchor object for dax services
 * @list: node for the dax_host_list lookup hash
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @host: optional name for lookups where the device path is not available
 * @private: dax driver private data
 * @alive: !alive + rcu grace period == no new operations / mappings
 * @ops: operations table supplied by the registering driver
 */
struct dax_device {
	struct hlist_node list;
	struct inode inode;
	struct cdev cdev;
	const char *host;
	void *private;
	bool alive;
	const struct dax_operations *ops;
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device relative to @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn)
{
	long avail;

	/*
	 * The device driver is allowed to sleep, in order to make the
	 * memory directly accessible.
	 */
	might_sleep();

	if (!dax_dev)
		return -EOPNOTSUPP;

	if (!dax_alive(dax_dev))
		return -ENXIO;

	if (nr_pages < 0)
		return nr_pages;

	avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
			kaddr, pfn);
	if (!avail)
		return -ERANGE;
	return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);

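/*
 * Example (illustrative sketch): consuming the returned mapping under the
 * SRCU read lock; dst and the sizing are hypothetical:
 *
 *	id = dax_read_lock();
 *	avail = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, &pfn);
 *	if (avail > 0)
 *		memcpy(dst, kaddr, avail * PAGE_SIZE);
 *	dax_read_unlock(id);
 *
 * Fewer pages than requested may be returned, so callers must be prepared
 * to loop or to cap their copy at the returned count.
 */
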
bool dax_alive(struct dax_device *dax_dev)
{
	lockdep_assert_held(&dax_srcu);
	return dax_dev->alive;
}
EXPORT_SYMBOL_GPL(dax_alive);

static int dax_host_hash(const char *host)
{
	return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

/*
 * Note, rcu is not protecting the liveness of dax_dev; rcu is ensuring
 * that any fault handlers or operations that might have seen
 * dax_alive() have completed. Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;

	dax_dev->alive = false;

	synchronize_srcu(&dax_srcu);

	spin_lock(&dax_host_lock);
	hlist_del_init(&dax_dev->list);
	spin_unlock(&dax_host_lock);

	dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

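/*
 * Example (illustrative sketch): the teardown order a driver is expected
 * to follow on device removal:
 *
 *	kill_dax(dax_dev);	stop new operations, wait out old ones
 *	put_dax(dax_dev);	drop the reference taken at alloc_dax()
 */
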
static struct inode *dax_alloc_inode(struct super_block *sb)
{
	struct dax_device *dax_dev;

	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
	if (!dax_dev)
		return NULL;
	return &dax_dev->inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
	return container_of(inode, struct dax_device, inode);
}

static void dax_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	struct dax_device *dax_dev = to_dax_dev(inode);

	kfree(dax_dev->host);
	dax_dev->host = NULL;
	ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
	kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);

	WARN_ONCE(dax_dev->alive,
			"kill_dax() must be called before final iput()\n");
	call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.drop_inode = generic_delete_inode,
};

static struct dentry *dax_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_fs_type = {
	.name = "dax",
	.mount = dax_mount,
	.kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	inode->i_rdev = devt;
	return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, &devt);

	if (!inode)
		return NULL;

	dax_dev = to_dax_dev(inode);
	if (inode->i_state & I_NEW) {
		dax_dev->alive = true;
		inode->i_cdev = &dax_dev->cdev;
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}

	return dax_dev;
}

static void dax_add_host(struct dax_device *dax_dev, const char *host)
{
	int hash;

	/*
	 * Unconditionally init dax_dev since it's coming from a
	 * non-zeroed slab cache
	 */
	INIT_HLIST_NODE(&dax_dev->list);
	dax_dev->host = host;
	if (!host)
		return;

	hash = dax_host_hash(host);
	spin_lock(&dax_host_lock);
	hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
	spin_unlock(&dax_host_lock);
}

struct dax_device *alloc_dax(void *private, const char *__host,
		const struct dax_operations *ops)
{
	struct dax_device *dax_dev;
	const char *host;
	dev_t devt;
	int minor;

	host = kstrdup(__host, GFP_KERNEL);
	if (__host && !host)
		return NULL;

	minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
	if (minor < 0)
		goto err_minor;

	devt = MKDEV(MAJOR(dax_devt), minor);
	dax_dev = dax_dev_get(devt);
	if (!dax_dev)
		goto err_dev;

	dax_add_host(dax_dev, host);
	dax_dev->ops = ops;
	dax_dev->private = private;
	return dax_dev;

 err_dev:
	ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
	kfree(host);
	return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);

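/*
 * Example (illustrative sketch): a driver publishes dax capability by
 * pairing an operations table with alloc_dax(). The "foo" names are
 * hypothetical; only alloc_dax()/kill_dax()/put_dax() are from this file:
 *
 *	static const struct dax_operations foo_dax_ops = {
 *		.direct_access = foo_dax_direct_access,
 *	};
 *
 *	dax_dev = alloc_dax(foo, disk->disk_name, &foo_dax_ops);
 *	if (!dax_dev)
 *		return -ENOMEM;
 *
 * Passing the gendisk name as @__host is what later allows
 * dax_get_by_host() to resolve the device from a block_device.
 */
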
void put_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;
	iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
 * @host: alternate name for the device registered by a dax driver
 */
struct dax_device *dax_get_by_host(const char *host)
{
	struct dax_device *dax_dev, *found = NULL;
	int hash, id;

	if (!host)
		return NULL;

	hash = dax_host_hash(host);

	id = dax_read_lock();
	spin_lock(&dax_host_lock);
	hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
		if (!dax_alive(dax_dev)
				|| strcmp(host, dax_dev->host) != 0)
			continue;

		if (igrab(&dax_dev->inode))
			found = dax_dev;
		break;
	}
	spin_unlock(&dax_host_lock);
	dax_read_unlock(id);

	return found;
}
EXPORT_SYMBOL_GPL(dax_get_by_host);

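/*
 * Example (illustrative sketch): __bdev_dax_supported() above uses this
 * lookup to resolve a block device to its dax_device. The reference taken
 * via igrab() must be balanced with put_dax():
 *
 *	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 *	if (dax_dev) {
 *		(use dax_dev under dax_read_lock())
 *		put_dax(dax_dev);
 *	}
 */
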
/**
 * inode_dax: convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev() which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
	struct cdev *cdev = inode->i_cdev;

	return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
	return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
	return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
	struct dax_device *dax_dev = _dax_dev;
	struct inode *inode = &dax_dev->inode;

	inode_init_once(inode);
}

static int __dax_fs_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	rc = register_filesystem(&dax_fs_type);
	if (rc)
		goto err_register_fs;

	dax_mnt = kern_mount(&dax_fs_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

 err_mount:
	unregister_filesystem(&dax_fs_type);
 err_register_fs:
	kmem_cache_destroy(dax_cache);

	return rc;
}

static void __dax_fs_exit(void)
{
	kern_unmount(dax_mnt);
	unregister_filesystem(&dax_fs_type);
	kmem_cache_destroy(dax_cache);
}

static int __init dax_fs_init(void)
{
	int rc;

	rc = __dax_fs_init();
	if (rc)
		return rc;

	nr_dax = max(nr_dax, 256);
	rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
	if (rc)
		__dax_fs_exit();
	return rc;
}

static void __exit dax_fs_exit(void)
{
	unregister_chrdev_region(dax_devt, nr_dax);
	ida_destroy(&dax_minor_ida);
	__dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);