Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
| 2 | * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta. |
| 3 | * |
| 4 | * (C) Chad Page, Theodore Ts'o, et al., 1995. |
| 5 | * |
| 6 | * This RAM disk is designed to have filesystems created on it and mounted |
| 7 | * just like a regular floppy disk. |
| 8 | * |
| 9 | * It also does something suggested by Linus: use the buffer cache as the |
| 10 | * RAM disk data. This makes it possible to dynamically allocate the RAM disk |
| 11 | * buffer - with some consequences I have to deal with as I write this. |
| 12 | * |
| 13 | * This code is based on the original ramdisk.c, written mostly by |
| 14 | * Theodore Ts'o (TYT) in 1991. The code was largely rewritten by |
| 15 | * Chad Page to use the buffer cache to store the RAM disk data in |
| 16 | * 1995; Theodore then took over the driver again, and cleaned it up |
| 17 | * for inclusion in the mainline kernel. |
| 18 | * |
| 19 | * The original CRAMDISK code was written by Richard Lyons, and |
| 20 | * adapted by Chad Page to use the new RAM disk interface. Theodore |
| 21 | * Ts'o rewrote it so that both the compressed RAM disk loader and the |
| 22 | * kernel decompressor use the same inflate.c codebase. The RAM disk |
| 23 | * loader now also loads into a dynamic (buffer cache based) RAM disk, |
| 24 | * not the old static RAM disk. Support for the old static RAM disk has |
| 25 | * been completely removed. |
| 26 | * |
| 27 | * Loadable module support added by Tom Dyas. |
| 28 | * |
| 29 | * Further cleanups by Chad Page (page0588@sundance.sjsu.edu): |
| 30 | * Cosmetic changes in #ifdef MODULE, code movement, etc. |
| 31 | * When the RAM disk module is removed, free the protected buffers |
| 32 | * Default RAM disk size changed to 2.88 MB |
| 33 | * |
| 34 | * Added initrd: Werner Almesberger & Hans Lermen, Feb '96 |
| 35 | * |
| 36 | * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB) |
| 37 | * - Chad Page |
| 38 | * |
| 39 | * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98 |
| 40 | * |
| 41 | * Make block size and block size shift for RAM disks a global macro |
| 42 | * and set blk_size for -ENOSPC, Werner Fink <werner@suse.de>, Apr '99 |
| 43 | */ |
| 44 | |
| 45 | #include <linux/config.h> |
| 46 | #include <linux/string.h> |
| 47 | #include <linux/slab.h> |
| 48 | #include <asm/atomic.h> |
| 49 | #include <linux/bio.h> |
| 50 | #include <linux/module.h> |
| 51 | #include <linux/moduleparam.h> |
| 52 | #include <linux/init.h> |
| 53 | #include <linux/devfs_fs_kernel.h> |
| 54 | #include <linux/pagemap.h> |
| 55 | #include <linux/blkdev.h> |
| 56 | #include <linux/genhd.h> |
| 57 | #include <linux/buffer_head.h> /* for invalidate_bdev() */ |
| 58 | #include <linux/backing-dev.h> |
| 59 | #include <linux/blkpg.h> |
| 60 | #include <linux/writeback.h> |
| 61 | |
| 62 | #include <asm/uaccess.h> |
| 63 | |
| 64 | /* Various static variables go here. Most are used only in the RAM disk code. |
| 65 | */ |
| 66 | |
| 67 | static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT]; |
| 68 | static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */ |
| 69 | static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT]; |
| 70 | |
| 71 | /* |
| 72 | * Parameters for the boot-loading of the RAM disk. These are set by |
| 73 | * init/main.c (from arguments to the kernel command line) or from the |
| 74 | * architecture-specific setup routine (from the stored boot sector |
| 75 | * information). |
| 76 | */ |
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;		/* Size of each RAM disk, in kB */
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 78 | /* |
| 79 | * It would be very desirable to have a soft-blocksize (that in the case |
| 80 | * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because |
| 81 | * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of |
| 82 | * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages |
| 83 | * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only |
| 84 | * 1 page will be protected. Depending on the size of the ramdisk you |
| 85 | * may want to change the ramdisk blocksize to achieve a better or worse MM |
| 86 | * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that |
| 87 | * supposes the filesystem in the image uses a BLOCK_SIZE blocksize). |
| 88 | */ |
static int rd_blocksize = BLOCK_SIZE;		/* blocksize of the RAM disks, in bytes; sanity-checked in rd_init() */
| 90 | |
| 91 | /* |
| 92 | * Copyright (C) 2000 Linus Torvalds. |
| 93 | * 2000 Transmeta Corp. |
| 94 | * aops copied from ramfs. |
| 95 | */ |
| 96 | |
| 97 | /* |
| 98 | * If a ramdisk page has buffers, some may be uptodate and some may be not. |
| 99 | * To bring the page uptodate we zero out the non-uptodate buffers. The |
| 100 | * page must be locked. |
| 101 | */ |
| 102 | static void make_page_uptodate(struct page *page) |
| 103 | { |
| 104 | if (page_has_buffers(page)) { |
| 105 | struct buffer_head *bh = page_buffers(page); |
| 106 | struct buffer_head *head = bh; |
| 107 | |
| 108 | do { |
| 109 | if (!buffer_uptodate(bh)) { |
| 110 | memset(bh->b_data, 0, bh->b_size); |
| 111 | /* |
| 112 | * akpm: I'm totally undecided about this. The |
| 113 | * buffer has just been magically brought "up to |
| 114 | * date", but nobody should want to be reading |
| 115 | * it anyway, because it hasn't been used for |
| 116 | * anything yet. It is still in a "not read |
| 117 | * from disk yet" state. |
| 118 | * |
| 119 | * But non-uptodate buffers against an uptodate |
| 120 | * page are against the rules. So do it anyway. |
| 121 | */ |
| 122 | set_buffer_uptodate(bh); |
| 123 | } |
| 124 | } while ((bh = bh->b_this_page) != head); |
| 125 | } else { |
| 126 | memset(page_address(page), 0, PAGE_CACHE_SIZE); |
| 127 | } |
| 128 | flush_dcache_page(page); |
| 129 | SetPageUptodate(page); |
| 130 | } |
| 131 | |
/*
 * ->readpage: there is nothing to read from; a page that is not yet
 * uptodate is simply zero-filled.  The page is unlocked before returning.
 * Always returns 0.
 */
static int ramdisk_readpage(struct file *file, struct page *page)
{
	int stale = !PageUptodate(page);

	if (stale)
		make_page_uptodate(page);

	unlock_page(page);
	return 0;
}
| 139 | |
/*
 * ->prepare_write: make sure the whole page is uptodate before a write
 * lands in it.  The offset/to range is not consulted.  Always returns 0.
 */
static int ramdisk_prepare_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	if (PageUptodate(page))
		return 0;

	make_page_uptodate(page);
	return 0;
}
| 147 | |
/*
 * ->commit_write: the data is already in the page, so all that is left
 * to do is to mark the page dirty.  The offset/to range is not consulted.
 * Always returns 0.
 */
static int ramdisk_commit_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	set_page_dirty(page);

	return 0;
}
| 154 | |
| 155 | /* |
| 156 | * ->writepage to the the blockdev's mapping has to redirty the page so that the |
| 157 | * VM doesn't go and steal it. We return WRITEPAGE_ACTIVATE so that the VM |
| 158 | * won't try to (pointlessly) write the page again for a while. |
| 159 | * |
| 160 | * Really, these pages should not be on the LRU at all. |
| 161 | */ |
| 162 | static int ramdisk_writepage(struct page *page, struct writeback_control *wbc) |
| 163 | { |
| 164 | if (!PageUptodate(page)) |
| 165 | make_page_uptodate(page); |
| 166 | SetPageDirty(page); |
| 167 | if (wbc->for_reclaim) |
| 168 | return WRITEPAGE_ACTIVATE; |
| 169 | unlock_page(page); |
| 170 | return 0; |
| 171 | } |
| 172 | |
/*
 * ->writepages: a small speedup.  The ramdisk blockdev inode has no
 * backing store to write back to, so attempts to do so are
 * short-circuited: nothing is written and 0 is returned.
 */
static int ramdisk_writepages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	/* Neither argument is looked at. */
	return 0;
}
| 182 | |
/*
 * ->set_page_dirty: ramdisk blockdev pages get their own implementation
 * because they must not contribute to dirty memory accounting.  Only the
 * page flag is set.  Always returns 0.
 */
static int ramdisk_set_page_dirty(struct page *page)
{
	SetPageDirty(page);

	return 0;
}
| 192 | |
| 193 | static struct address_space_operations ramdisk_aops = { |
| 194 | .readpage = ramdisk_readpage, |
| 195 | .prepare_write = ramdisk_prepare_write, |
| 196 | .commit_write = ramdisk_commit_write, |
| 197 | .writepage = ramdisk_writepage, |
| 198 | .set_page_dirty = ramdisk_set_page_dirty, |
| 199 | .writepages = ramdisk_writepages, |
| 200 | }; |
| 201 | |
| 202 | static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector, |
| 203 | struct address_space *mapping) |
| 204 | { |
| 205 | pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9); |
| 206 | unsigned int vec_offset = vec->bv_offset; |
| 207 | int offset = (sector << 9) & ~PAGE_CACHE_MASK; |
| 208 | int size = vec->bv_len; |
| 209 | int err = 0; |
| 210 | |
| 211 | do { |
| 212 | int count; |
| 213 | struct page *page; |
| 214 | char *src; |
| 215 | char *dst; |
| 216 | |
| 217 | count = PAGE_CACHE_SIZE - offset; |
| 218 | if (count > size) |
| 219 | count = size; |
| 220 | size -= count; |
| 221 | |
| 222 | page = grab_cache_page(mapping, index); |
| 223 | if (!page) { |
| 224 | err = -ENOMEM; |
| 225 | goto out; |
| 226 | } |
| 227 | |
| 228 | if (!PageUptodate(page)) |
| 229 | make_page_uptodate(page); |
| 230 | |
| 231 | index++; |
| 232 | |
| 233 | if (rw == READ) { |
| 234 | src = kmap_atomic(page, KM_USER0) + offset; |
| 235 | dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset; |
| 236 | } else { |
| 237 | src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset; |
| 238 | dst = kmap_atomic(page, KM_USER1) + offset; |
| 239 | } |
| 240 | offset = 0; |
| 241 | vec_offset += count; |
| 242 | |
| 243 | memcpy(dst, src, count); |
| 244 | |
| 245 | kunmap_atomic(src, KM_USER0); |
| 246 | kunmap_atomic(dst, KM_USER1); |
| 247 | |
| 248 | if (rw == READ) |
| 249 | flush_dcache_page(vec->bv_page); |
| 250 | else |
| 251 | set_page_dirty(page); |
| 252 | unlock_page(page); |
| 253 | put_page(page); |
| 254 | } while (size); |
| 255 | |
| 256 | out: |
| 257 | return err; |
| 258 | } |
| 259 | |
| 260 | /* |
| 261 | * Basically, my strategy here is to set up a buffer-head which can't be |
| 262 | * deleted, and make that my Ramdisk. If the request is outside of the |
| 263 | * allocated size, we must get rid of it... |
| 264 | * |
| 265 | * 19-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Added devfs support |
| 266 | * |
| 267 | */ |
| 268 | static int rd_make_request(request_queue_t *q, struct bio *bio) |
| 269 | { |
| 270 | struct block_device *bdev = bio->bi_bdev; |
| 271 | struct address_space * mapping = bdev->bd_inode->i_mapping; |
| 272 | sector_t sector = bio->bi_sector; |
| 273 | unsigned long len = bio->bi_size >> 9; |
| 274 | int rw = bio_data_dir(bio); |
| 275 | struct bio_vec *bvec; |
| 276 | int ret = 0, i; |
| 277 | |
| 278 | if (sector + len > get_capacity(bdev->bd_disk)) |
| 279 | goto fail; |
| 280 | |
| 281 | if (rw==READA) |
| 282 | rw=READ; |
| 283 | |
| 284 | bio_for_each_segment(bvec, bio, i) { |
| 285 | ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping); |
| 286 | sector += bvec->bv_len >> 9; |
| 287 | } |
| 288 | if (ret) |
| 289 | goto fail; |
| 290 | |
| 291 | bio_endio(bio, bio->bi_size, 0); |
| 292 | return 0; |
| 293 | fail: |
| 294 | bio_io_error(bio, bio->bi_size); |
| 295 | return 0; |
| 296 | } |
| 297 | |
| 298 | static int rd_ioctl(struct inode *inode, struct file *file, |
| 299 | unsigned int cmd, unsigned long arg) |
| 300 | { |
| 301 | int error; |
| 302 | struct block_device *bdev = inode->i_bdev; |
| 303 | |
| 304 | if (cmd != BLKFLSBUF) |
| 305 | return -ENOTTY; |
| 306 | |
| 307 | /* |
| 308 | * special: we want to release the ramdisk memory, it's not like with |
| 309 | * the other blockdevices where this ioctl only flushes away the buffer |
| 310 | * cache |
| 311 | */ |
| 312 | error = -EBUSY; |
| 313 | down(&bdev->bd_sem); |
| 314 | if (bdev->bd_openers <= 2) { |
| 315 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); |
| 316 | error = 0; |
| 317 | } |
| 318 | up(&bdev->bd_sem); |
| 319 | return error; |
| 320 | } |
| 321 | |
| 322 | /* |
| 323 | * This is the backing_dev_info for the blockdev inode itself. It doesn't need |
| 324 | * writeback and it does not contribute to dirty memory accounting. |
| 325 | */ |
| 326 | static struct backing_dev_info rd_backing_dev_info = { |
| 327 | .ra_pages = 0, /* No readahead */ |
| 328 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY, |
| 329 | .unplug_io_fn = default_unplug_io_fn, |
| 330 | }; |
| 331 | |
| 332 | /* |
| 333 | * This is the backing_dev_info for the files which live atop the ramdisk |
| 334 | * "device". These files do need writeback and they do contribute to dirty |
| 335 | * memory accounting. |
| 336 | */ |
| 337 | static struct backing_dev_info rd_file_backing_dev_info = { |
| 338 | .ra_pages = 0, /* No readahead */ |
| 339 | .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */ |
| 340 | .unplug_io_fn = default_unplug_io_fn, |
| 341 | }; |
| 342 | |
| 343 | static int rd_open(struct inode *inode, struct file *filp) |
| 344 | { |
| 345 | unsigned unit = iminor(inode); |
| 346 | |
| 347 | if (rd_bdev[unit] == NULL) { |
| 348 | struct block_device *bdev = inode->i_bdev; |
| 349 | struct address_space *mapping; |
| 350 | unsigned bsize; |
Al Viro | b4e3ca1 | 2005-10-21 03:22:34 -0400 | [diff] [blame] | 351 | gfp_t gfp_mask; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 352 | |
| 353 | inode = igrab(bdev->bd_inode); |
| 354 | rd_bdev[unit] = bdev; |
| 355 | bdev->bd_openers++; |
| 356 | bsize = bdev_hardsect_size(bdev); |
| 357 | bdev->bd_block_size = bsize; |
| 358 | inode->i_blkbits = blksize_bits(bsize); |
| 359 | inode->i_size = get_capacity(bdev->bd_disk)<<9; |
| 360 | |
| 361 | mapping = inode->i_mapping; |
| 362 | mapping->a_ops = &ramdisk_aops; |
| 363 | mapping->backing_dev_info = &rd_backing_dev_info; |
| 364 | bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info; |
| 365 | |
| 366 | /* |
| 367 | * Deep badness. rd_blkdev_pagecache_IO() needs to allocate |
| 368 | * pagecache pages within a request_fn. We cannot recur back |
| 369 | * into the filesytem which is mounted atop the ramdisk, because |
| 370 | * that would deadlock on fs locks. And we really don't want |
| 371 | * to reenter rd_blkdev_pagecache_IO when we're already within |
| 372 | * that function. |
| 373 | * |
| 374 | * So we turn off __GFP_FS and __GFP_IO. |
| 375 | * |
| 376 | * And to give this thing a hope of working, turn on __GFP_HIGH. |
| 377 | * Hopefully, there's enough regular memory allocation going on |
| 378 | * for the page allocator emergency pools to keep the ramdisk |
| 379 | * driver happy. |
| 380 | */ |
| 381 | gfp_mask = mapping_gfp_mask(mapping); |
| 382 | gfp_mask &= ~(__GFP_FS|__GFP_IO); |
| 383 | gfp_mask |= __GFP_HIGH; |
| 384 | mapping_set_gfp_mask(mapping, gfp_mask); |
| 385 | } |
| 386 | |
| 387 | return 0; |
| 388 | } |
| 389 | |
| 390 | static struct block_device_operations rd_bd_op = { |
| 391 | .owner = THIS_MODULE, |
| 392 | .open = rd_open, |
| 393 | .ioctl = rd_ioctl, |
| 394 | }; |
| 395 | |
| 396 | /* |
| 397 | * Before freeing the module, invalidate all of the protected buffers! |
| 398 | */ |
| 399 | static void __exit rd_cleanup(void) |
| 400 | { |
| 401 | int i; |
| 402 | |
| 403 | for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) { |
| 404 | struct block_device *bdev = rd_bdev[i]; |
| 405 | rd_bdev[i] = NULL; |
| 406 | if (bdev) { |
| 407 | invalidate_bdev(bdev, 1); |
| 408 | blkdev_put(bdev); |
| 409 | } |
| 410 | del_gendisk(rd_disks[i]); |
| 411 | put_disk(rd_disks[i]); |
| 412 | blk_cleanup_queue(rd_queue[i]); |
| 413 | } |
| 414 | devfs_remove("rd"); |
| 415 | unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); |
| 416 | } |
| 417 | |
| 418 | /* |
| 419 | * This is the registration and initialization section of the RAM disk driver |
| 420 | */ |
| 421 | static int __init rd_init(void) |
| 422 | { |
| 423 | int i; |
| 424 | int err = -ENOMEM; |
| 425 | |
| 426 | if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 || |
| 427 | (rd_blocksize & (rd_blocksize-1))) { |
| 428 | printk("RAMDISK: wrong blocksize %d, reverting to defaults\n", |
| 429 | rd_blocksize); |
| 430 | rd_blocksize = BLOCK_SIZE; |
| 431 | } |
| 432 | |
| 433 | for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) { |
| 434 | rd_disks[i] = alloc_disk(1); |
| 435 | if (!rd_disks[i]) |
| 436 | goto out; |
| 437 | } |
| 438 | |
| 439 | if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) { |
| 440 | err = -EIO; |
| 441 | goto out; |
| 442 | } |
| 443 | |
| 444 | devfs_mk_dir("rd"); |
| 445 | |
| 446 | for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) { |
| 447 | struct gendisk *disk = rd_disks[i]; |
| 448 | |
| 449 | rd_queue[i] = blk_alloc_queue(GFP_KERNEL); |
| 450 | if (!rd_queue[i]) |
| 451 | goto out_queue; |
| 452 | |
| 453 | blk_queue_make_request(rd_queue[i], &rd_make_request); |
| 454 | blk_queue_hardsect_size(rd_queue[i], rd_blocksize); |
| 455 | |
| 456 | /* rd_size is given in kB */ |
| 457 | disk->major = RAMDISK_MAJOR; |
| 458 | disk->first_minor = i; |
| 459 | disk->fops = &rd_bd_op; |
| 460 | disk->queue = rd_queue[i]; |
| 461 | disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; |
| 462 | sprintf(disk->disk_name, "ram%d", i); |
| 463 | sprintf(disk->devfs_name, "rd/%d", i); |
| 464 | set_capacity(disk, rd_size * 2); |
| 465 | add_disk(rd_disks[i]); |
| 466 | } |
| 467 | |
| 468 | /* rd_size is given in kB */ |
| 469 | printk("RAMDISK driver initialized: " |
| 470 | "%d RAM disks of %dK size %d blocksize\n", |
| 471 | CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize); |
| 472 | |
| 473 | return 0; |
| 474 | out_queue: |
| 475 | unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); |
| 476 | out: |
| 477 | while (i--) { |
| 478 | put_disk(rd_disks[i]); |
| 479 | blk_cleanup_queue(rd_queue[i]); |
| 480 | } |
| 481 | return err; |
| 482 | } |
| 483 | |
/* Entry points: rd_init() sets the driver up, rd_cleanup() tears it down. */
module_init(rd_init);
module_exit(rd_cleanup);
| 486 | |
| 487 | /* options - nonmodular */ |
| 488 | #ifndef MODULE |
| 489 | static int __init ramdisk_size(char *str) |
| 490 | { |
| 491 | rd_size = simple_strtol(str,NULL,0); |
| 492 | return 1; |
| 493 | } |
| 494 | static int __init ramdisk_size2(char *str) /* kludge */ |
| 495 | { |
| 496 | return ramdisk_size(str); |
| 497 | } |
| 498 | static int __init ramdisk_blocksize(char *str) |
| 499 | { |
| 500 | rd_blocksize = simple_strtol(str,NULL,0); |
| 501 | return 1; |
| 502 | } |
| 503 | __setup("ramdisk=", ramdisk_size); |
| 504 | __setup("ramdisk_size=", ramdisk_size2); |
| 505 | __setup("ramdisk_blocksize=", ramdisk_blocksize); |
| 506 | #endif |
| 507 | |
/* options - modular */
/* The same two tunables as the boot options above, as module parameters. */
module_param(rd_size, int, 0);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
module_param(rd_blocksize, int, 0);
MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
/* Module alias for the ramdisk block major. */
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);

MODULE_LICENSE("GPL");