blob: 59da3327a6b5eaa4250ba1dabf63bdb9678c34ce [file] [log] [blame]
David Chinnerfe4fa4b2008-10-30 17:06:08 +11001/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h"
37#include "xfs_dinode.h"
38#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h"
41#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46
David Chinnera167b172008-10-30 17:06:18 +110047#include <linux/kthread.h>
48#include <linux/freezer.h>
49
David Chinnerfe4fa4b2008-10-30 17:06:08 +110050/*
51 * xfs_sync flushes any pending I/O to file system vfsp.
52 *
53 * This routine is called by vfs_sync() to make sure that things make it
54 * out to disk eventually, on sync() system calls to flush out everything,
55 * and when the file system is unmounted. For the vfs_sync() case, all
56 * we really need to do is sync out the log to make all of our meta-data
57 * updates permanent (except for timestamps). For calls from pflushd(),
58 * dirty pages are kept moving by calling pdflush() on the inodes
59 * containing them. We also flush the inodes that we can lock without
60 * sleeping and the superblock if we can lock it without sleeping from
61 * vfs_sync() so that items at the tail of the log are always moving out.
62 *
63 * Flags:
64 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
65 * to sleep if we can help it. All we really need
66 * to do is ensure that the log is synced at least
67 * periodically. We also push the inodes and
68 * superblock if we can lock them without sleeping
69 * and they are not pinned.
70 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
71 * set, then we really want to lock each inode and flush
72 * it.
73 * SYNC_WAIT - All the flushes that take place in this call should
74 * be synchronous.
75 * SYNC_DELWRI - This tells us to push dirty pages associated with
76 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
77 * determine if they should be flushed sync, async, or
78 * delwri.
79 * SYNC_CLOSE - This flag is passed when the system is being
80 * unmounted. We should sync and invalidate everything.
81 * SYNC_FSDATA - This indicates that the caller would like to make
82 * sure the superblock is safe on disk. We can ensure
83 * this by simply making sure the log gets flushed
84 * if SYNC_BDFLUSH is set, and by actually writing it
85 * out otherwise.
86 * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
87 * before we return (including direct I/O). Forms the drain
88 * side of the write barrier needed to safely quiesce the
89 * filesystem.
90 *
91 */
92int
93xfs_sync(
94 xfs_mount_t *mp,
95 int flags)
96{
97 int error;
98
99 /*
100 * Get the Quota Manager to flush the dquots.
101 *
102 * If XFS quota support is not enabled or this filesystem
103 * instance does not use quotas XFS_QM_DQSYNC will always
104 * return zero.
105 */
106 error = XFS_QM_DQSYNC(mp, flags);
107 if (error) {
108 /*
109 * If we got an IO error, we will be shutting down.
110 * So, there's nothing more for us to do here.
111 */
112 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
113 if (XFS_FORCED_SHUTDOWN(mp))
114 return XFS_ERROR(error);
115 }
116
117 if (flags & SYNC_IOWAIT)
118 xfs_filestream_flush(mp);
119
120 return xfs_syncsub(mp, flags, NULL);
121}
122
123/*
David Chinner683a8972008-10-30 17:07:29 +1100124 * Sync all the inodes in the given AG according to the
125 * direction given by the flags.
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100126 */
David Chinner683a8972008-10-30 17:07:29 +1100127STATIC int
128xfs_sync_inodes_ag(
129 xfs_mount_t *mp,
130 int ag,
131 int flags,
132 int *bypassed)
133{
134 xfs_inode_t *ip = NULL;
135 struct inode *vp = NULL;
136 xfs_perag_t *pag = &mp->m_perag[ag];
137 boolean_t vnode_refed = B_FALSE;
138 int nr_found;
139 int first_index = 0;
140 int error = 0;
141 int last_error = 0;
142 int fflag = XFS_B_ASYNC;
143 int lock_flags = XFS_ILOCK_SHARED;
144
145 if (flags & SYNC_DELWRI)
146 fflag = XFS_B_DELWRI;
147 if (flags & SYNC_WAIT)
148 fflag = 0; /* synchronous overrides all */
149
150 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
151 /*
152 * We need the I/O lock if we're going to call any of
153 * the flush/inval routines.
154 */
155 lock_flags |= XFS_IOLOCK_SHARED;
156 }
157
158 do {
159 /*
160 * use a gang lookup to find the next inode in the tree
161 * as the tree is sparse and a gang lookup walks to find
162 * the number of objects requested.
163 */
164 read_lock(&pag->pag_ici_lock);
165 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
166 (void**)&ip, first_index, 1);
167
168 if (!nr_found) {
169 read_unlock(&pag->pag_ici_lock);
170 break;
171 }
172
173 /* update the index for the next lookup */
174 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
175
176 /*
177 * skip inodes in reclaim. Let xfs_syncsub do that for
178 * us so we don't need to worry.
179 */
180 vp = VFS_I(ip);
181 if (!vp) {
182 read_unlock(&pag->pag_ici_lock);
183 continue;
184 }
185
186 /* bad inodes are dealt with elsewhere */
187 if (VN_BAD(vp)) {
188 read_unlock(&pag->pag_ici_lock);
189 continue;
190 }
191
192 /* nothing to sync during shutdown */
193 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
194 read_unlock(&pag->pag_ici_lock);
195 return 0;
196 }
197
198 /*
199 * The inode lock here actually coordinates with the almost
200 * spurious inode lock in xfs_ireclaim() to prevent the vnode
201 * we handle here without a reference from being freed while we
202 * reference it. If we lock the inode while it's on the mount
203 * list here, then the spurious inode lock in xfs_ireclaim()
204 * after the inode is pulled from the mount list will sleep
205 * until we release it here. This keeps the vnode from being
206 * freed while we reference it.
207 */
208 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
209 vp = vn_grab(vp);
210 read_unlock(&pag->pag_ici_lock);
211 if (!vp)
212 continue;
213 xfs_ilock(ip, lock_flags);
214
215 ASSERT(vp == VFS_I(ip));
216 ASSERT(ip->i_mount == mp);
217
218 vnode_refed = B_TRUE;
219 } else {
220 /* safe to unlock here as we have a reference */
221 read_unlock(&pag->pag_ici_lock);
222 }
223 /*
224 * If we have to flush data or wait for I/O completion
225 * we need to drop the ilock that we currently hold.
226 * If we need to drop the lock, insert a marker if we
227 * have not already done so.
228 */
229 if (flags & SYNC_CLOSE) {
230 xfs_iunlock(ip, XFS_ILOCK_SHARED);
231 if (XFS_FORCED_SHUTDOWN(mp))
232 xfs_tosspages(ip, 0, -1, FI_REMAPF);
233 else
234 error = xfs_flushinval_pages(ip, 0, -1,
235 FI_REMAPF);
236 /* wait for I/O on freeze */
237 if (flags & SYNC_IOWAIT)
238 vn_iowait(ip);
239
240 xfs_ilock(ip, XFS_ILOCK_SHARED);
241 }
242
243 if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
244 xfs_iunlock(ip, XFS_ILOCK_SHARED);
245 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
246 if (flags & SYNC_IOWAIT)
247 vn_iowait(ip);
248 xfs_ilock(ip, XFS_ILOCK_SHARED);
249 }
250
251 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
252 if (flags & SYNC_WAIT) {
253 xfs_iflock(ip);
254 if (!xfs_inode_clean(ip))
255 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
256 else
257 xfs_ifunlock(ip);
258 } else if (xfs_iflock_nowait(ip)) {
259 if (!xfs_inode_clean(ip))
260 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
261 else
262 xfs_ifunlock(ip);
263 } else if (bypassed) {
264 (*bypassed)++;
265 }
266 }
267
268 if (lock_flags)
269 xfs_iunlock(ip, lock_flags);
270
271 if (vnode_refed) {
272 IRELE(ip);
273 vnode_refed = B_FALSE;
274 }
275
276 if (error)
277 last_error = error;
278 /*
279 * bail out if the filesystem is corrupted.
280 */
281 if (error == EFSCORRUPTED)
282 return XFS_ERROR(error);
283
284 } while (nr_found);
285
286 return last_error;
287}
288
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100289int
290xfs_sync_inodes(
291 xfs_mount_t *mp,
292 int flags,
293 int *bypassed)
294{
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100295 int error;
296 int last_error;
David Chinner683a8972008-10-30 17:07:29 +1100297 int i;
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100298
299 if (bypassed)
300 *bypassed = 0;
301 if (mp->m_flags & XFS_MOUNT_RDONLY)
302 return 0;
303 error = 0;
304 last_error = 0;
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100305
David Chinner683a8972008-10-30 17:07:29 +1100306 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
307 if (!mp->m_perag[i].pag_ici_init)
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100308 continue;
David Chinner683a8972008-10-30 17:07:29 +1100309 error = xfs_sync_inodes_ag(mp, i, flags, bypassed);
310 if (error)
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100311 last_error = error;
David Chinner683a8972008-10-30 17:07:29 +1100312 if (error == EFSCORRUPTED)
313 break;
314 }
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100315 return XFS_ERROR(last_error);
316}
317
Christoph Hellwig2af75df2008-10-30 17:14:53 +1100318STATIC int
319xfs_commit_dummy_trans(
320 struct xfs_mount *mp,
321 uint log_flags)
322{
323 struct xfs_inode *ip = mp->m_rootip;
324 struct xfs_trans *tp;
325 int error;
326
327 /*
328 * Put a dummy transaction in the log to tell recovery
329 * that all others are OK.
330 */
331 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
332 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
333 if (error) {
334 xfs_trans_cancel(tp, 0);
335 return error;
336 }
337
338 xfs_ilock(ip, XFS_ILOCK_EXCL);
339
340 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
341 xfs_trans_ihold(tp, ip);
342 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
343 /* XXX(hch): ignoring the error here.. */
344 error = xfs_trans_commit(tp, 0);
345
346 xfs_iunlock(ip, XFS_ILOCK_EXCL);
347
348 xfs_log_force(mp, 0, log_flags);
349 return 0;
350}
351
352STATIC int
353xfs_sync_fsdata(
354 struct xfs_mount *mp,
355 int flags)
356{
357 struct xfs_buf *bp;
358 struct xfs_buf_log_item *bip;
359 int error = 0;
360
361 /*
362 * If this is xfssyncd() then only sync the superblock if we can
363 * lock it without sleeping and it is not pinned.
364 */
365 if (flags & SYNC_BDFLUSH) {
366 ASSERT(!(flags & SYNC_WAIT));
367
368 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
369 if (!bp)
370 goto out;
371
372 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
373 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
374 goto out_brelse;
375 } else {
376 bp = xfs_getsb(mp, 0);
377
378 /*
379 * If the buffer is pinned then push on the log so we won't
380 * get stuck waiting in the write for someone, maybe
381 * ourselves, to flush the log.
382 *
383 * Even though we just pushed the log above, we did not have
384 * the superblock buffer locked at that point so it can
385 * become pinned in between there and here.
386 */
387 if (XFS_BUF_ISPINNED(bp))
388 xfs_log_force(mp, 0, XFS_LOG_FORCE);
389 }
390
391
392 if (flags & SYNC_WAIT)
393 XFS_BUF_UNASYNC(bp);
394 else
395 XFS_BUF_ASYNC(bp);
396
397 return xfs_bwrite(mp, bp);
398
399 out_brelse:
400 xfs_buf_relse(bp);
401 out:
402 return error;
403}
404
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100405/*
406 * xfs sync routine for internal use
407 *
408 * This routine supports all of the flags defined for the generic vfs_sync
409 * interface as explained above under xfs_sync.
410 *
411 */
412int
413xfs_syncsub(
414 xfs_mount_t *mp,
415 int flags,
416 int *bypassed)
417{
418 int error = 0;
419 int last_error = 0;
420 uint log_flags = XFS_LOG_FORCE;
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100421
422 /*
423 * Sync out the log. This ensures that the log is periodically
424 * flushed even if there is not enough activity to fill it up.
425 */
426 if (flags & SYNC_WAIT)
427 log_flags |= XFS_LOG_SYNC;
428
429 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
430
431 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
432 if (flags & SYNC_BDFLUSH)
David Chinner75c68f42008-10-30 17:06:28 +1100433 xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100434 else
435 error = xfs_sync_inodes(mp, flags, bypassed);
436 }
437
438 /*
439 * Flushing out dirty data above probably generated more
440 * log activity, so if this isn't vfs_sync() then flush
441 * the log again.
442 */
Christoph Hellwig2af75df2008-10-30 17:14:53 +1100443 if (flags & SYNC_DELWRI)
444 xfs_log_force(mp, 0, log_flags);
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100445
446 if (flags & SYNC_FSDATA) {
Christoph Hellwig2af75df2008-10-30 17:14:53 +1100447 error = xfs_sync_fsdata(mp, flags);
448 if (error)
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100449 last_error = error;
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100450 }
451
452 /*
453 * Now check to see if the log needs a "dummy" transaction.
454 */
455 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
Christoph Hellwig2af75df2008-10-30 17:14:53 +1100456 error = xfs_commit_dummy_trans(mp, log_flags);
457 if (error)
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100458 return error;
David Chinnerfe4fa4b2008-10-30 17:06:08 +1100459 }
460
461 /*
462 * When shutting down, we need to insure that the AIL is pushed
463 * to disk or the filesystem can appear corrupt from the PROM.
464 */
465 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
466 XFS_bflush(mp->m_ddev_targp);
467 if (mp->m_rtdev_targp) {
468 XFS_bflush(mp->m_rtdev_targp);
469 }
470 }
471
472 return XFS_ERROR(last_error);
473}
David Chinnera167b172008-10-30 17:06:18 +1100474
475/*
476 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
477 * Doing this has two advantages:
478 * - It saves on stack space, which is tight in certain situations
479 * - It can be used (with care) as a mechanism to avoid deadlocks.
480 * Flushing while allocating in a full filesystem requires both.
481 */
482STATIC void
483xfs_syncd_queue_work(
484 struct xfs_mount *mp,
485 void *data,
486 void (*syncer)(struct xfs_mount *, void *))
487{
488 struct bhv_vfs_sync_work *work;
489
490 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
491 INIT_LIST_HEAD(&work->w_list);
492 work->w_syncer = syncer;
493 work->w_data = data;
494 work->w_mount = mp;
495 spin_lock(&mp->m_sync_lock);
496 list_add_tail(&work->w_list, &mp->m_sync_list);
497 spin_unlock(&mp->m_sync_lock);
498 wake_up_process(mp->m_sync_task);
499}
500
501/*
502 * Flush delayed allocate data, attempting to free up reserved space
503 * from existing allocations. At this point a new allocation attempt
504 * has failed with ENOSPC and we are in the process of scratching our
505 * heads, looking about for more room...
506 */
507STATIC void
508xfs_flush_inode_work(
509 struct xfs_mount *mp,
510 void *arg)
511{
512 struct inode *inode = arg;
513 filemap_flush(inode->i_mapping);
514 iput(inode);
515}
516
517void
518xfs_flush_inode(
519 xfs_inode_t *ip)
520{
521 struct inode *inode = VFS_I(ip);
522
523 igrab(inode);
524 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
525 delay(msecs_to_jiffies(500));
526}
527
528/*
529 * This is the "bigger hammer" version of xfs_flush_inode_work...
530 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
531 */
532STATIC void
533xfs_flush_device_work(
534 struct xfs_mount *mp,
535 void *arg)
536{
537 struct inode *inode = arg;
538 sync_blockdev(mp->m_super->s_bdev);
539 iput(inode);
540}
541
542void
543xfs_flush_device(
544 xfs_inode_t *ip)
545{
546 struct inode *inode = VFS_I(ip);
547
548 igrab(inode);
549 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
550 delay(msecs_to_jiffies(500));
551 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
552}
553
554STATIC void
555xfs_sync_worker(
556 struct xfs_mount *mp,
557 void *unused)
558{
559 int error;
560
561 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
562 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
563 mp->m_sync_seq++;
564 wake_up(&mp->m_wait_single_sync_task);
565}
566
567STATIC int
568xfssyncd(
569 void *arg)
570{
571 struct xfs_mount *mp = arg;
572 long timeleft;
573 bhv_vfs_sync_work_t *work, *n;
574 LIST_HEAD (tmp);
575
576 set_freezable();
577 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
578 for (;;) {
579 timeleft = schedule_timeout_interruptible(timeleft);
580 /* swsusp */
581 try_to_freeze();
582 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
583 break;
584
585 spin_lock(&mp->m_sync_lock);
586 /*
587 * We can get woken by laptop mode, to do a sync -
588 * that's the (only!) case where the list would be
589 * empty with time remaining.
590 */
591 if (!timeleft || list_empty(&mp->m_sync_list)) {
592 if (!timeleft)
593 timeleft = xfs_syncd_centisecs *
594 msecs_to_jiffies(10);
595 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
596 list_add_tail(&mp->m_sync_work.w_list,
597 &mp->m_sync_list);
598 }
599 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
600 list_move(&work->w_list, &tmp);
601 spin_unlock(&mp->m_sync_lock);
602
603 list_for_each_entry_safe(work, n, &tmp, w_list) {
604 (*work->w_syncer)(mp, work->w_data);
605 list_del(&work->w_list);
606 if (work == &mp->m_sync_work)
607 continue;
608 kmem_free(work);
609 }
610 }
611
612 return 0;
613}
614
615int
616xfs_syncd_init(
617 struct xfs_mount *mp)
618{
619 mp->m_sync_work.w_syncer = xfs_sync_worker;
620 mp->m_sync_work.w_mount = mp;
621 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
622 if (IS_ERR(mp->m_sync_task))
623 return -PTR_ERR(mp->m_sync_task);
624 return 0;
625}
626
627void
628xfs_syncd_stop(
629 struct xfs_mount *mp)
630{
631 kthread_stop(mp->m_sync_task);
632}
633