Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) International Business Machines Corp., 2000-2004 |
| 3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 |
| 4 | * |
| 5 | * This program is free software; you can redistribute it and/or modify |
| 6 | * it under the terms of the GNU General Public License as published by |
| 7 | * the Free Software Foundation; either version 2 of the License, or |
| 8 | * (at your option) any later version. |
| 9 | * |
| 10 | * This program is distributed in the hope that it will be useful, |
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See |
| 13 | * the GNU General Public License for more details. |
| 14 | * |
| 15 | * You should have received a copy of the GNU General Public License |
| 16 | * along with this program; if not, write to the Free Software |
| 17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 18 | */ |
| 19 | #ifndef _H_JFS_LOGMGR |
| 20 | #define _H_JFS_LOGMGR |
| 21 | |
| 22 | #include "jfs_filsys.h" |
| 23 | #include "jfs_lock.h" |
| 24 | |
| 25 | /* |
| 26 | * log manager configuration parameters |
| 27 | */ |
| 28 | |
/*
 * Log page geometry.  L2LOGPSIZE is log2 of the page size in bytes, so
 * LOGPSIZE is derived from it instead of being spelled out a second time.
 */
#define L2LOGPSIZE	12
#define LOGPSIZE	(1 << L2LOGPSIZE)	/* 4096 bytes */

#define LOGPAGES	16	/* Log pages per mounted file system */
| 34 | |
| 35 | /* |
| 36 | * log logical volume |
| 37 | * |
| 38 | * a log is used to make the commit operation on journalled |
| 39 | * files within the same logical volume group atomic. |
| 40 | * a log is implemented with a logical volume. |
| 41 | * there is one log per logical volume group. |
| 42 | * |
| 43 | * block 0 of the log logical volume is not used (ipl etc). |
| 44 | * block 1 contains a log "superblock" and is used by logFormat(), |
| 45 | * lmLogInit(), lmLogShutdown(), and logRedo() to record status |
| 46 | * of the log but is not otherwise used during normal processing. |
| 47 | * blocks 2 - (N-1) are used to contain log records. |
| 48 | * |
| 49 | * when a volume group is varied-on-line, logRedo() must have |
| 50 | * been executed before the file systems (logical volumes) in |
| 51 | * the volume group can be mounted. |
| 52 | */ |
| 53 | /* |
| 54 | * log superblock (block 1 of logical volume) |
| 55 | */ |
/* block numbers within the log logical volume */
#define LOGSUPER_B	1	/* the log superblock */
#define LOGSTART_B	2	/* log records start at this block */

/* log identification */
#define LOGMAGIC	0x87654321
#define LOGVERSION	1

/* Max active file systems sharing log; sizes logsuper.active[] */
#define MAX_ACTIVE	128
| 63 | |
| 64 | struct logsuper { |
| 65 | __le32 magic; /* 4: log lv identifier */ |
| 66 | __le32 version; /* 4: version number */ |
| 67 | __le32 serial; /* 4: log open/mount counter */ |
| 68 | __le32 size; /* 4: size in number of LOGPSIZE blocks */ |
| 69 | __le32 bsize; /* 4: logical block size in byte */ |
| 70 | __le32 l2bsize; /* 4: log2 of bsize */ |
| 71 | |
| 72 | __le32 flag; /* 4: option */ |
| 73 | __le32 state; /* 4: state - see below */ |
| 74 | |
| 75 | __le32 end; /* 4: addr of last log record set by logredo */ |
| 76 | char uuid[16]; /* 16: 128-bit journal uuid */ |
| 77 | char label[16]; /* 16: journal label */ |
| 78 | struct { |
| 79 | char uuid[16]; |
| 80 | } active[MAX_ACTIVE]; /* 2048: active file systems list */ |
| 81 | }; |
| 82 | |
/* sixteen zero bytes: the all-zero uuid */
#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"

/* log flag: commit option (see jfs_filsys.h) */

/*
 * log state
 *
 * LOGMOUNT   - log mounted by lmLogInit()
 * LOGREDONE  - log shutdown by lmLogShutdown();
 *              log redo completed by logredo()
 * LOGWRAP    - log wrapped
 * LOGREADERR - log read error detected in logredo()
 */
#define LOGMOUNT	0
#define LOGREDONE	1
#define LOGWRAP		2
#define LOGREADERR	3
| 94 | |
| 95 | |
| 96 | /* |
| 97 | * log logical page |
| 98 | * |
| 99 | * (this comment should be rewritten !) |
| 100 | * the header and trailer structures (h,t) will normally have |
| 101 | * the same page and eor value. |
| 102 | * An exception to this occurs when a complete page write is not |
| 103 | * accomplished on a power failure. Since the hardware may "split write" |
| 104 | * sectors in the page, any out of order sequence may occur during powerfail |
| 105 | * and needs to be recognized during log replay. The xor value is |
| 106 | * an "exclusive or" of all log words in the page up to eor. This |
| 107 | * 32 bit eor is stored with the top 16 bits in the header and the |
| 108 | * bottom 16 bits in the trailer. logredo can easily recognize pages |
| 109 | * that were not completed by reconstructing this eor and checking |
| 110 | * the log page. |
| 111 | * |
| 112 | * Previous versions of the operating system did not allow split |
| 113 | * writes and detected partially written records in logredo by |
| 114 | * ordering the updates to the header, trailer, and the move of data |
| 115 | * into the logdata area. The order: (1) data is moved (2) header |
| 116 | * is updated (3) trailer is updated. In logredo, when the header |
| 117 | * differed from the trailer, the header and trailer were reconciled |
| 118 | * as follows: if h.page != t.page they were set to the smaller of |
| 119 | * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) |
| 120 | * h.eor != t.eor they were set to the smaller of their two values. |
| 121 | */ |
| 122 | struct logpage { |
| 123 | struct { /* header */ |
| 124 | __le32 page; /* 4: log sequence page number */ |
| 125 | __le16 rsrvd; /* 2: */ |
| 126 | __le16 eor; /* 2: end-of-log offset of lasrt record write */ |
| 127 | } h; |
| 128 | |
| 129 | __le32 data[LOGPSIZE / 4 - 4]; /* log record area */ |
| 130 | |
| 131 | struct { /* trailer */ |
| 132 | __le32 page; /* 4: normally the same as h.page */ |
| 133 | __le16 rsrvd; /* 2: */ |
| 134 | __le16 eor; /* 2: normally the same as h.eor */ |
| 135 | } t; |
| 136 | }; |
| 137 | |
/* sizes in bytes of the log page header and trailer */
#define LOGPHDRSIZE	8
#define LOGPTLRSIZE	8
| 140 | |
| 141 | |
| 142 | /* |
| 143 | * log record |
| 144 | * |
| 145 | * (this comment should be rewritten !) |
| 146 | * jfs uses only "after" log records (only a single writer is allowed |
| 147 | * in a page, pages are written to temporary paging space |
| 148 | * if they must be written to disk before commit, and i/o is |
| 149 | * scheduled for modified pages to their home location after |
| 150 | * the log records containing the after values and the commit |
| 151 | * record is written to the log on disk, undo discards the copy |
| 152 | * in main-memory.) |
| 153 | * |
| 154 | * a log record consists of a data area of variable length followed by |
| 155 | * a descriptor of fixed size LOGRDSIZE bytes. |
| 156 | * the data area is rounded up to an integral number of 4-bytes and |
| 157 | * must be no longer than LOGPSIZE. |
| 158 | * the descriptor is of size of multiple of 4-bytes and aligned on a |
| 159 | * 4-byte boundary. |
| 160 | * records are packed one after the other in the data area of log pages. |
| 161 | * (sometimes a DUMMY record is inserted so that at least one record ends |
| 162 | * on every page or the longest record is placed on at most two pages). |
| 163 | * the field eor in page header/trailer points to the byte following |
| 164 | * the last record on a page. |
| 165 | */ |
| 166 | |
/*
 * log record types
 */
#define LOG_COMMIT		0x8000
#define LOG_SYNCPT		0x4000
#define LOG_MOUNT		0x2000
#define LOG_REDOPAGE		0x0800
#define LOG_NOREDOPAGE		0x0080
#define LOG_NOREDOINOEXT	0x0040
#define LOG_UPDATEMAP		0x0008
#define LOG_NOREDOFILE		0x0001

/*
 * REDOPAGE/NOREDOPAGE log record data type
 */
#define LOG_INODE		0x0001
#define LOG_XTREE		0x0002
#define LOG_DTREE		0x0004
#define LOG_BTROOT		0x0010
#define LOG_EA			0x0020
#define LOG_ACL			0x0040
#define LOG_DATA		0x0080
#define LOG_NEW			0x0100
#define LOG_EXTEND		0x0200
#define LOG_RELOCATE		0x0400
#define LOG_DIR_XTREE		0x0800	/* Xtree is in directory inode */

/*
 * UPDATEMAP log record descriptor type
 */
#define LOG_ALLOCXADLIST	0x0080
#define LOG_ALLOCPXDLIST	0x0040
#define LOG_ALLOCXAD		0x0020
#define LOG_ALLOCPXD		0x0010
#define LOG_FREEXADLIST		0x0008
#define LOG_FREEPXDLIST		0x0004
#define LOG_FREEXAD		0x0002
#define LOG_FREEPXD		0x0001
| 199 | |
| 200 | |
| 201 | struct lrd { |
| 202 | /* |
| 203 | * type independent area |
| 204 | */ |
| 205 | __le32 logtid; /* 4: log transaction identifier */ |
| 206 | __le32 backchain; /* 4: ptr to prev record of same transaction */ |
| 207 | __le16 type; /* 2: record type */ |
| 208 | __le16 length; /* 2: length of data in record (in byte) */ |
| 209 | __le32 aggregate; /* 4: file system lv/aggregate */ |
| 210 | /* (16) */ |
| 211 | |
| 212 | /* |
| 213 | * type dependent area (20) |
| 214 | */ |
| 215 | union { |
| 216 | |
| 217 | /* |
| 218 | * COMMIT: commit |
| 219 | * |
| 220 | * transaction commit: no type-dependent information; |
| 221 | */ |
| 222 | |
| 223 | /* |
| 224 | * REDOPAGE: after-image |
| 225 | * |
| 226 | * apply after-image; |
| 227 | * |
| 228 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; |
| 229 | */ |
| 230 | struct { |
| 231 | __le32 fileset; /* 4: fileset number */ |
| 232 | __le32 inode; /* 4: inode number */ |
| 233 | __le16 type; /* 2: REDOPAGE record type */ |
| 234 | __le16 l2linesize; /* 2: log2 of line size */ |
| 235 | pxd_t pxd; /* 8: on-disk page pxd */ |
| 236 | } redopage; /* (20) */ |
| 237 | |
| 238 | /* |
| 239 | * NOREDOPAGE: the page is freed |
| 240 | * |
| 241 | * do not apply after-image records which precede this record |
| 242 | * in the log with the same page block number to this page. |
| 243 | * |
| 244 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; |
| 245 | */ |
| 246 | struct { |
| 247 | __le32 fileset; /* 4: fileset number */ |
| 248 | __le32 inode; /* 4: inode number */ |
| 249 | __le16 type; /* 2: NOREDOPAGE record type */ |
| 250 | __le16 rsrvd; /* 2: reserved */ |
| 251 | pxd_t pxd; /* 8: on-disk page pxd */ |
| 252 | } noredopage; /* (20) */ |
| 253 | |
| 254 | /* |
| 255 | * UPDATEMAP: update block allocation map |
| 256 | * |
| 257 | * either in-line PXD, |
| 258 | * or out-of-line XADLIST; |
| 259 | * |
| 260 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; |
| 261 | */ |
| 262 | struct { |
| 263 | __le32 fileset; /* 4: fileset number */ |
| 264 | __le32 inode; /* 4: inode number */ |
| 265 | __le16 type; /* 2: UPDATEMAP record type */ |
| 266 | __le16 nxd; /* 2: number of extents */ |
| 267 | pxd_t pxd; /* 8: pxd */ |
| 268 | } updatemap; /* (20) */ |
| 269 | |
| 270 | /* |
| 271 | * NOREDOINOEXT: the inode extent is freed |
| 272 | * |
| 273 | * do not apply after-image records which precede this |
| 274 | * record in the log with the any of the 4 page block |
| 275 | * numbers in this inode extent. |
| 276 | * |
| 277 | * NOTE: The fileset and pxd fields MUST remain in |
| 278 | * the same fields in the REDOPAGE record format. |
| 279 | * |
| 280 | */ |
| 281 | struct { |
| 282 | __le32 fileset; /* 4: fileset number */ |
| 283 | __le32 iagnum; /* 4: IAG number */ |
| 284 | __le32 inoext_idx; /* 4: inode extent index */ |
| 285 | pxd_t pxd; /* 8: on-disk page pxd */ |
| 286 | } noredoinoext; /* (20) */ |
| 287 | |
| 288 | /* |
| 289 | * SYNCPT: log sync point |
| 290 | * |
| 291 | * replay log upto syncpt address specified; |
| 292 | */ |
| 293 | struct { |
| 294 | __le32 sync; /* 4: syncpt address (0 = here) */ |
| 295 | } syncpt; |
| 296 | |
| 297 | /* |
| 298 | * MOUNT: file system mount |
| 299 | * |
| 300 | * file system mount: no type-dependent information; |
| 301 | */ |
| 302 | |
| 303 | /* |
| 304 | * ? FREEXTENT: free specified extent(s) |
| 305 | * |
| 306 | * free specified extent(s) from block allocation map |
| 307 | * N.B.: nextents should be length of data/sizeof(xad_t) |
| 308 | */ |
| 309 | struct { |
| 310 | __le32 type; /* 4: FREEXTENT record type */ |
| 311 | __le32 nextent; /* 4: number of extents */ |
| 312 | |
| 313 | /* data: PXD or XAD list */ |
| 314 | } freextent; |
| 315 | |
| 316 | /* |
| 317 | * ? NOREDOFILE: this file is freed |
| 318 | * |
| 319 | * do not apply records which precede this record in the log |
| 320 | * with the same inode number. |
| 321 | * |
| 322 | * NOREDILE must be the first to be written at commit |
| 323 | * (last to be read in logredo()) - it prevents |
| 324 | * replay of preceding updates of all preceding generations |
| 325 | * of the inumber esp. the on-disk inode itself, |
| 326 | * but does NOT prevent |
| 327 | * replay of the |
| 328 | */ |
| 329 | struct { |
| 330 | __le32 fileset; /* 4: fileset number */ |
| 331 | __le32 inode; /* 4: inode number */ |
| 332 | } noredofile; |
| 333 | |
| 334 | /* |
| 335 | * ? NEWPAGE: |
| 336 | * |
| 337 | * metadata type dependent |
| 338 | */ |
| 339 | struct { |
| 340 | __le32 fileset; /* 4: fileset number */ |
| 341 | __le32 inode; /* 4: inode number */ |
| 342 | __le32 type; /* 4: NEWPAGE record type */ |
| 343 | pxd_t pxd; /* 8: on-disk page pxd */ |
| 344 | } newpage; |
| 345 | |
| 346 | /* |
| 347 | * ? DUMMY: filler |
| 348 | * |
| 349 | * no type-dependent information |
| 350 | */ |
| 351 | } log; |
| 352 | }; /* (36) */ |
| 353 | |
| 354 | #define LOGRDSIZE (sizeof(struct lrd)) |
| 355 | |
| 356 | /* |
| 357 | * line vector descriptor |
| 358 | */ |
| 359 | struct lvd { |
| 360 | __le16 offset; |
| 361 | __le16 length; |
| 362 | }; |
| 363 | |
| 364 | |
| 365 | /* |
| 366 | * log logical volume |
| 367 | */ |
| 368 | struct jfs_log { |
| 369 | |
| 370 | struct list_head sb_list;/* This is used to sync metadata |
| 371 | * before writing syncpt. |
| 372 | */ |
| 373 | struct list_head journal_list; /* Global list */ |
| 374 | struct block_device *bdev; /* 4: log lv pointer */ |
| 375 | int serial; /* 4: log mount serial number */ |
| 376 | |
| 377 | s64 base; /* @8: log extent address (inline log ) */ |
| 378 | int size; /* 4: log size in log page (in page) */ |
| 379 | int l2bsize; /* 4: log2 of bsize */ |
| 380 | |
| 381 | long flag; /* 4: flag */ |
| 382 | |
| 383 | struct lbuf *lbuf_free; /* 4: free lbufs */ |
| 384 | wait_queue_head_t free_wait; /* 4: */ |
| 385 | |
| 386 | /* log write */ |
| 387 | int logtid; /* 4: log tid */ |
| 388 | int page; /* 4: page number of eol page */ |
| 389 | int eor; /* 4: eor of last record in eol page */ |
| 390 | struct lbuf *bp; /* 4: current log page buffer */ |
| 391 | |
| 392 | struct semaphore loglock; /* 4: log write serialization lock */ |
| 393 | |
| 394 | /* syncpt */ |
| 395 | int nextsync; /* 4: bytes to write before next syncpt */ |
| 396 | int active; /* 4: */ |
| 397 | wait_queue_head_t syncwait; /* 4: */ |
| 398 | |
| 399 | /* commit */ |
| 400 | uint cflag; /* 4: */ |
| 401 | struct list_head cqueue; /* FIFO commit queue */ |
| 402 | struct tblock *flush_tblk; /* tblk we're waiting on for flush */ |
| 403 | int gcrtc; /* 4: GC_READY transaction count */ |
| 404 | struct tblock *gclrt; /* 4: latest GC_READY transaction */ |
| 405 | spinlock_t gclock; /* 4: group commit lock */ |
| 406 | int logsize; /* 4: log data area size in byte */ |
| 407 | int lsn; /* 4: end-of-log */ |
| 408 | int clsn; /* 4: clsn */ |
| 409 | int syncpt; /* 4: addr of last syncpt record */ |
| 410 | int sync; /* 4: addr from last logsync() */ |
| 411 | struct list_head synclist; /* 8: logsynclist anchor */ |
| 412 | spinlock_t synclock; /* 4: synclist lock */ |
| 413 | struct lbuf *wqueue; /* 4: log pageout queue */ |
| 414 | int count; /* 4: count */ |
| 415 | char uuid[16]; /* 16: 128-bit uuid of log device */ |
| 416 | |
| 417 | int no_integrity; /* 3: flag to disable journaling to disk */ |
| 418 | }; |
| 419 | |
/*
 * Log flag
 *
 * NOTE(review): these are consecutive small integers rather than masks -
 * presumably bit numbers within jfs_log.flag; confirm against the users.
 */
#define log_INLINELOG	1
#define log_SYNCBARRIER	2
#define log_QUIESCE	3
#define log_FLUSH	4

/*
 * group commit flag
 */

/* jfs_log */
#define logGC_PAGEOUT	0x00000001

/* tblock/lbuf */
#define tblkGC_QUEUE		0x0001
#define tblkGC_READY		0x0002
#define tblkGC_COMMIT		0x0004
#define tblkGC_COMMITTED	0x0008
#define tblkGC_EOP		0x0010
#define tblkGC_FREE		0x0020
#define tblkGC_LEADER		0x0040
#define tblkGC_ERROR		0x0080
#define tblkGC_LAZY		0x0100	/* D230860 */
#define tblkGC_UNLOCKED		0x0200	/* D230860 */
| 445 | |
| 446 | /* |
| 447 | * log cache buffer header |
| 448 | */ |
| 449 | struct lbuf { |
| 450 | struct jfs_log *l_log; /* 4: log associated with buffer */ |
| 451 | |
| 452 | /* |
| 453 | * data buffer base area |
| 454 | */ |
| 455 | uint l_flag; /* 4: pageout control flags */ |
| 456 | |
| 457 | struct lbuf *l_wqnext; /* 4: write queue link */ |
| 458 | struct lbuf *l_freelist; /* 4: freelistlink */ |
| 459 | |
| 460 | int l_pn; /* 4: log page number */ |
| 461 | int l_eor; /* 4: log record eor */ |
| 462 | int l_ceor; /* 4: committed log record eor */ |
| 463 | |
| 464 | s64 l_blkno; /* 8: log page block number */ |
| 465 | caddr_t l_ldata; /* 4: data page */ |
Dave Kleikamp | dc5798d | 2005-05-02 12:24:57 -0600 | [diff] [blame] | 466 | struct page *l_page; /* The page itself */ |
| 467 | uint l_offset; /* Offset of l_ldata within the page */ |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 468 | |
| 469 | wait_queue_head_t l_ioevent; /* 4: i/o done event */ |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 470 | }; |
| 471 | |
| 472 | /* Reuse l_freelist for redrive list */ |
| 473 | #define l_redrive_next l_freelist |
| 474 | |
| 475 | /* |
| 476 | * logsynclist block |
| 477 | * |
| 478 | * common logsyncblk prefix for jbuf_t and tblock |
| 479 | */ |
| 480 | struct logsyncblk { |
| 481 | u16 xflag; /* flags */ |
| 482 | u16 flag; /* only meaninful in tblock */ |
| 483 | lid_t lid; /* lock id */ |
| 484 | s32 lsn; /* log sequence number */ |
| 485 | struct list_head synclist; /* log sync list link */ |
| 486 | }; |
| 487 | |
| 488 | /* |
| 489 | * logsynclist serialization (per log) |
| 490 | */ |
| 491 | |
/*
 * Wrappers around the spinlock primitives for a log's synclock.
 * LOGSYNC_LOCK hands irqflags to spin_lock_irqsave(); pass the same
 * variable to LOGSYNC_UNLOCK so spin_unlock_irqrestore() receives it.
 */
#define LOGSYNC_LOCK_INIT(lg) \
	spin_lock_init(&(lg)->synclock)
#define LOGSYNC_LOCK(lg, irqflags) \
	spin_lock_irqsave(&(lg)->synclock, irqflags)
#define LOGSYNC_UNLOCK(lg, irqflags) \
	spin_unlock_irqrestore(&(lg)->synclock, irqflags)
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 496 | |
/*
 * logdiff - compute the difference in bytes of lsn from the sync point
 * @diff: signed integer lvalue that receives the result
 * @lsn:  log address to measure
 * @log:  pointer to the log; its syncpt and logsize fields are read
 *
 * If lsn - syncpt is negative, logsize is added once so the result is
 * expressed as a non-negative distance within the log.
 *
 * The body is wrapped in do { } while (0) so that "logdiff(...);" is
 * exactly one statement and is safe as the unbraced body of an if/else.
 * The invocation must therefore be followed by a semicolon.
 * @diff and @log are evaluated more than once: pass expressions without
 * side effects.
 */
#define logdiff(diff, lsn, log)\
do {\
	(diff) = (lsn) - (log)->syncpt;\
	if ((diff) < 0)\
		(diff) += (log)->logsize;\
} while (0)
| 504 | |
| 505 | extern int lmLogOpen(struct super_block *sb); |
| 506 | extern int lmLogClose(struct super_block *sb); |
| 507 | extern int lmLogShutdown(struct jfs_log * log); |
| 508 | extern int lmLogInit(struct jfs_log * log); |
| 509 | extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); |
| 510 | extern void jfs_flush_journal(struct jfs_log * log, int wait); |
| 511 | |
| 512 | #endif /* _H_JFS_LOGMGR */ |