1c6278295d
This patch adds jfs_syncpt, which calls lmLogSync to write sync points to the journal both in jfs_sync_fs and when sync barrier processing completes. lmLogSync accomplishes two things: 1) it pushes logged-but-dirty metadata pages to disk, and 2) it writes a sync record to the journal so that jfs_fsck doesn't need to replay more transactions than is necessary. Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
513 lines
15 KiB
C
513 lines
15 KiB
C
/*
|
|
* Copyright (C) International Business Machines Corp., 2000-2004
|
|
* Portions Copyright (C) Christoph Hellwig, 2001-2002
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
|
|
* the GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
#ifndef _H_JFS_LOGMGR
|
|
#define _H_JFS_LOGMGR
|
|
|
|
#include "jfs_filsys.h"
|
|
#include "jfs_lock.h"
|
|
|
|
/*
|
|
* log manager configuration parameters
|
|
*/
|
|
|
|
/*
 * log page size: LOGPSIZE bytes per log page; L2LOGPSIZE is its
 * base-2 logarithm (1 << 12 == 4096).
 */
#define	LOGPSIZE	4096
#define	L2LOGPSIZE	12

#define LOGPAGES	16	/* Log pages per mounted file system */
|
|
|
|
/*
 *	log logical volume
 *
 * a log is used to make the commit operation on journalled
 * files within the same logical volume group atomic.
 * a log is implemented with a logical volume.
 * there is one log per logical volume group.
 *
 * block 0 of the log logical volume is not used (ipl etc).
 * block 1 contains a log "superblock" and is used by logFormat(),
 * lmLogInit(), lmLogShutdown(), and logRedo() to record status
 * of the log but is not otherwise used during normal processing.
 * blocks 2 - (N-1) are used to contain log records.
 *
 * when a volume group is varied-on-line, logRedo() must have
 * been executed before the file systems (logical volumes) in
 * the volume group can be mounted.
 */
|
|
/*
 *	log superblock (block 1 of logical volume)
 */
#define	LOGSUPER_B	1	/* block number of the log superblock */
#define	LOGSTART_B	2	/* block number following the superblock */

#define	LOGMAGIC	0x87654321
#define	LOGVERSION	1

#define MAX_ACTIVE	128	/* Max active file systems sharing log */
|
|
|
|
struct logsuper {
|
|
__le32 magic; /* 4: log lv identifier */
|
|
__le32 version; /* 4: version number */
|
|
__le32 serial; /* 4: log open/mount counter */
|
|
__le32 size; /* 4: size in number of LOGPSIZE blocks */
|
|
__le32 bsize; /* 4: logical block size in byte */
|
|
__le32 l2bsize; /* 4: log2 of bsize */
|
|
|
|
__le32 flag; /* 4: option */
|
|
__le32 state; /* 4: state - see below */
|
|
|
|
__le32 end; /* 4: addr of last log record set by logredo */
|
|
char uuid[16]; /* 16: 128-bit journal uuid */
|
|
char label[16]; /* 16: journal label */
|
|
struct {
|
|
char uuid[16];
|
|
} active[MAX_ACTIVE]; /* 2048: active file systems list */
|
|
};
|
|
|
|
/* sixteen explicit NUL bytes (plus the literal's implicit terminator) */
#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"

/* log flag: commit option (see jfs_filsys.h) */

/* log state */
#define	LOGMOUNT	0	/* log mounted by lmLogInit() */
#define LOGREDONE	1	/* log shutdown by lmLogShutdown().
				 * log redo completed by logredo().
				 */
#define LOGWRAP		2	/* log wrapped */
#define LOGREADERR	3	/* log read error detected in logredo() */
|
|
|
|
|
|
/*
 *	log logical page
 *
 * (this comment should be rewritten !)
 * the header and trailer structures (h,t) will normally have
 * the same page and eor value.
 * An exception to this occurs when a complete page write is not
 * accomplished on a power failure. Since the hardware may "split write"
 * sectors in the page, any out of order sequence may occur during a
 * power failure and needs to be recognized during log replay. The xor
 * value is an "exclusive or" of all log words in the page up to eor. This
 * 32 bit eor is stored with the top 16 bits in the header and the
 * bottom 16 bits in the trailer. logredo can easily recognize pages
 * that were not completed by reconstructing this eor and checking
 * the log page.
 *
 * Previous versions of the operating system did not allow split
 * writes and detected partially written records in logredo by
 * ordering the updates to the header, trailer, and the move of data
 * into the logdata area. The order: (1) data is moved (2) header
 * is updated (3) trailer is updated. In logredo, when the header
 * differed from the trailer, the header and trailer were reconciled
 * as follows: if h.page != t.page they were set to the smaller of
 * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
 * h.eor != t.eor they were set to the smaller of their two values.
 */
|
|
struct logpage {
|
|
struct { /* header */
|
|
__le32 page; /* 4: log sequence page number */
|
|
__le16 rsrvd; /* 2: */
|
|
__le16 eor; /* 2: end-of-log offset of lasrt record write */
|
|
} h;
|
|
|
|
__le32 data[LOGPSIZE / 4 - 4]; /* log record area */
|
|
|
|
struct { /* trailer */
|
|
__le32 page; /* 4: normally the same as h.page */
|
|
__le16 rsrvd; /* 2: */
|
|
__le16 eor; /* 2: normally the same as h.eor */
|
|
} t;
|
|
};
|
|
|
|
/* sizes, in bytes, of the fixed areas at either end of a log page */
#define LOGPHDRSIZE	8	/* log page header size */
#define LOGPTLRSIZE	8	/* log page trailer size */
|
|
|
|
|
|
/*
 *	log record
 *
 * (this comment should be rewritten !)
 * jfs uses only "after" log records (only a single writer is allowed
 * in a page, pages are written to temporary paging space
 * if they must be written to disk before commit, and i/o is
 * scheduled for modified pages to their home location after
 * the log records containing the after values and the commit
 * record is written to the log on disk, undo discards the copy
 * in main-memory.)
 *
 * a log record consists of a data area of variable length followed by
 * a descriptor of fixed size LOGRDSIZE bytes.
 * the data area is rounded up to an integral number of 4-bytes and
 * must be no longer than LOGPSIZE.
 * the descriptor is of size of multiple of 4-bytes and aligned on a
 * 4-byte boundary.
 * records are packed one after the other in the data area of log pages.
 * (sometimes a DUMMY record is inserted so that at least one record ends
 * on every page or the longest record is placed on at most two pages).
 * the field eor in page header/trailer points to the byte following
 * the last record on a page.
 */
|
|
|
|
/* log record types (each value is a distinct single bit) */
#define LOG_COMMIT		0x8000
#define LOG_SYNCPT		0x4000
#define LOG_MOUNT		0x2000
#define LOG_REDOPAGE		0x0800
#define LOG_NOREDOPAGE		0x0080
#define LOG_NOREDOINOEXT	0x0040
#define LOG_UPDATEMAP		0x0008
#define LOG_NOREDOFILE		0x0001
|
|
|
|
/* REDOPAGE/NOREDOPAGE log record data type */
#define	LOG_INODE		0x0001
#define	LOG_XTREE		0x0002
#define	LOG_DTREE		0x0004
#define	LOG_BTROOT		0x0010
#define	LOG_EA			0x0020
#define	LOG_ACL			0x0040
#define	LOG_DATA		0x0080
#define	LOG_NEW			0x0100
#define	LOG_EXTEND		0x0200
#define LOG_RELOCATE		0x0400
#define LOG_DIR_XTREE		0x0800	/* Xtree is in directory inode */
|
|
|
|
/* UPDATEMAP log record descriptor type */
#define	LOG_ALLOCXADLIST	0x0080
#define	LOG_ALLOCPXDLIST	0x0040
#define	LOG_ALLOCXAD		0x0020
#define	LOG_ALLOCPXD		0x0010
#define	LOG_FREEXADLIST		0x0008
#define	LOG_FREEPXDLIST		0x0004
#define	LOG_FREEXAD		0x0002
#define	LOG_FREEPXD		0x0001
|
|
|
|
|
|
struct lrd {
|
|
/*
|
|
* type independent area
|
|
*/
|
|
__le32 logtid; /* 4: log transaction identifier */
|
|
__le32 backchain; /* 4: ptr to prev record of same transaction */
|
|
__le16 type; /* 2: record type */
|
|
__le16 length; /* 2: length of data in record (in byte) */
|
|
__le32 aggregate; /* 4: file system lv/aggregate */
|
|
/* (16) */
|
|
|
|
/*
|
|
* type dependent area (20)
|
|
*/
|
|
union {
|
|
|
|
/*
|
|
* COMMIT: commit
|
|
*
|
|
* transaction commit: no type-dependent information;
|
|
*/
|
|
|
|
/*
|
|
* REDOPAGE: after-image
|
|
*
|
|
* apply after-image;
|
|
*
|
|
* N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 inode; /* 4: inode number */
|
|
__le16 type; /* 2: REDOPAGE record type */
|
|
__le16 l2linesize; /* 2: log2 of line size */
|
|
pxd_t pxd; /* 8: on-disk page pxd */
|
|
} redopage; /* (20) */
|
|
|
|
/*
|
|
* NOREDOPAGE: the page is freed
|
|
*
|
|
* do not apply after-image records which precede this record
|
|
* in the log with the same page block number to this page.
|
|
*
|
|
* N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 inode; /* 4: inode number */
|
|
__le16 type; /* 2: NOREDOPAGE record type */
|
|
__le16 rsrvd; /* 2: reserved */
|
|
pxd_t pxd; /* 8: on-disk page pxd */
|
|
} noredopage; /* (20) */
|
|
|
|
/*
|
|
* UPDATEMAP: update block allocation map
|
|
*
|
|
* either in-line PXD,
|
|
* or out-of-line XADLIST;
|
|
*
|
|
* N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 inode; /* 4: inode number */
|
|
__le16 type; /* 2: UPDATEMAP record type */
|
|
__le16 nxd; /* 2: number of extents */
|
|
pxd_t pxd; /* 8: pxd */
|
|
} updatemap; /* (20) */
|
|
|
|
/*
|
|
* NOREDOINOEXT: the inode extent is freed
|
|
*
|
|
* do not apply after-image records which precede this
|
|
* record in the log with the any of the 4 page block
|
|
* numbers in this inode extent.
|
|
*
|
|
* NOTE: The fileset and pxd fields MUST remain in
|
|
* the same fields in the REDOPAGE record format.
|
|
*
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 iagnum; /* 4: IAG number */
|
|
__le32 inoext_idx; /* 4: inode extent index */
|
|
pxd_t pxd; /* 8: on-disk page pxd */
|
|
} noredoinoext; /* (20) */
|
|
|
|
/*
|
|
* SYNCPT: log sync point
|
|
*
|
|
* replay log upto syncpt address specified;
|
|
*/
|
|
struct {
|
|
__le32 sync; /* 4: syncpt address (0 = here) */
|
|
} syncpt;
|
|
|
|
/*
|
|
* MOUNT: file system mount
|
|
*
|
|
* file system mount: no type-dependent information;
|
|
*/
|
|
|
|
/*
|
|
* ? FREEXTENT: free specified extent(s)
|
|
*
|
|
* free specified extent(s) from block allocation map
|
|
* N.B.: nextents should be length of data/sizeof(xad_t)
|
|
*/
|
|
struct {
|
|
__le32 type; /* 4: FREEXTENT record type */
|
|
__le32 nextent; /* 4: number of extents */
|
|
|
|
/* data: PXD or XAD list */
|
|
} freextent;
|
|
|
|
/*
|
|
* ? NOREDOFILE: this file is freed
|
|
*
|
|
* do not apply records which precede this record in the log
|
|
* with the same inode number.
|
|
*
|
|
* NOREDILE must be the first to be written at commit
|
|
* (last to be read in logredo()) - it prevents
|
|
* replay of preceding updates of all preceding generations
|
|
* of the inumber esp. the on-disk inode itself,
|
|
* but does NOT prevent
|
|
* replay of the
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 inode; /* 4: inode number */
|
|
} noredofile;
|
|
|
|
/*
|
|
* ? NEWPAGE:
|
|
*
|
|
* metadata type dependent
|
|
*/
|
|
struct {
|
|
__le32 fileset; /* 4: fileset number */
|
|
__le32 inode; /* 4: inode number */
|
|
__le32 type; /* 4: NEWPAGE record type */
|
|
pxd_t pxd; /* 8: on-disk page pxd */
|
|
} newpage;
|
|
|
|
/*
|
|
* ? DUMMY: filler
|
|
*
|
|
* no type-dependent information
|
|
*/
|
|
} log;
|
|
}; /* (36) */
|
|
|
|
#define LOGRDSIZE (sizeof(struct lrd))
|
|
|
|
/*
|
|
* line vector descriptor
|
|
*/
|
|
struct lvd {
|
|
__le16 offset;
|
|
__le16 length;
|
|
};
|
|
|
|
|
|
/*
|
|
* log logical volume
|
|
*/
|
|
struct jfs_log {
|
|
|
|
struct list_head sb_list;/* This is used to sync metadata
|
|
* before writing syncpt.
|
|
*/
|
|
struct list_head journal_list; /* Global list */
|
|
struct block_device *bdev; /* 4: log lv pointer */
|
|
int serial; /* 4: log mount serial number */
|
|
|
|
s64 base; /* @8: log extent address (inline log ) */
|
|
int size; /* 4: log size in log page (in page) */
|
|
int l2bsize; /* 4: log2 of bsize */
|
|
|
|
long flag; /* 4: flag */
|
|
|
|
struct lbuf *lbuf_free; /* 4: free lbufs */
|
|
wait_queue_head_t free_wait; /* 4: */
|
|
|
|
/* log write */
|
|
int logtid; /* 4: log tid */
|
|
int page; /* 4: page number of eol page */
|
|
int eor; /* 4: eor of last record in eol page */
|
|
struct lbuf *bp; /* 4: current log page buffer */
|
|
|
|
struct semaphore loglock; /* 4: log write serialization lock */
|
|
|
|
/* syncpt */
|
|
int nextsync; /* 4: bytes to write before next syncpt */
|
|
int active; /* 4: */
|
|
wait_queue_head_t syncwait; /* 4: */
|
|
|
|
/* commit */
|
|
uint cflag; /* 4: */
|
|
struct list_head cqueue; /* FIFO commit queue */
|
|
struct tblock *flush_tblk; /* tblk we're waiting on for flush */
|
|
int gcrtc; /* 4: GC_READY transaction count */
|
|
struct tblock *gclrt; /* 4: latest GC_READY transaction */
|
|
spinlock_t gclock; /* 4: group commit lock */
|
|
int logsize; /* 4: log data area size in byte */
|
|
int lsn; /* 4: end-of-log */
|
|
int clsn; /* 4: clsn */
|
|
int syncpt; /* 4: addr of last syncpt record */
|
|
int sync; /* 4: addr from last logsync() */
|
|
struct list_head synclist; /* 8: logsynclist anchor */
|
|
spinlock_t synclock; /* 4: synclist lock */
|
|
struct lbuf *wqueue; /* 4: log pageout queue */
|
|
int count; /* 4: count */
|
|
char uuid[16]; /* 16: 128-bit uuid of log device */
|
|
|
|
int no_integrity; /* 3: flag to disable journaling to disk */
|
|
};
|
|
|
|
/*
 * Log flag
 *
 * NOTE(review): the values are consecutive integers rather than bit
 * masks; presumably they are bit numbers within jfs_log.flag - confirm
 * against the users of that field.
 */
#define log_INLINELOG	1
#define log_SYNCBARRIER	2
#define log_QUIESCE	3
#define log_FLUSH	4
|
|
|
|
/*
 * group commit flag
 */
/* jfs_log */
#define logGC_PAGEOUT	0x00000001

/* tblock/lbuf (each value is a distinct single bit) */
#define tblkGC_QUEUE		0x0001
#define tblkGC_READY		0x0002
#define tblkGC_COMMIT		0x0004
#define tblkGC_COMMITTED	0x0008
#define tblkGC_EOP		0x0010
#define tblkGC_FREE		0x0020
#define tblkGC_LEADER		0x0040
#define tblkGC_ERROR		0x0080
#define tblkGC_LAZY		0x0100	/* D230860 */
#define tblkGC_UNLOCKED		0x0200	/* D230860 */
|
|
|
|
/*
|
|
* log cache buffer header
|
|
*/
|
|
struct lbuf {
|
|
struct jfs_log *l_log; /* 4: log associated with buffer */
|
|
|
|
/*
|
|
* data buffer base area
|
|
*/
|
|
uint l_flag; /* 4: pageout control flags */
|
|
|
|
struct lbuf *l_wqnext; /* 4: write queue link */
|
|
struct lbuf *l_freelist; /* 4: freelistlink */
|
|
|
|
int l_pn; /* 4: log page number */
|
|
int l_eor; /* 4: log record eor */
|
|
int l_ceor; /* 4: committed log record eor */
|
|
|
|
s64 l_blkno; /* 8: log page block number */
|
|
caddr_t l_ldata; /* 4: data page */
|
|
struct page *l_page; /* The page itself */
|
|
uint l_offset; /* Offset of l_ldata within the page */
|
|
|
|
wait_queue_head_t l_ioevent; /* 4: i/o done event */
|
|
};
|
|
|
|
/* Reuse l_freelist for redrive list */
|
|
#define l_redrive_next l_freelist
|
|
|
|
/*
|
|
* logsynclist block
|
|
*
|
|
* common logsyncblk prefix for jbuf_t and tblock
|
|
*/
|
|
struct logsyncblk {
|
|
u16 xflag; /* flags */
|
|
u16 flag; /* only meaninful in tblock */
|
|
lid_t lid; /* lock id */
|
|
s32 lsn; /* log sequence number */
|
|
struct list_head synclist; /* log sync list link */
|
|
};
|
|
|
|
/*
 *	logsynclist serialization (per log)
 *
 * thin wrappers around the spinlock API operating on (log)->synclock;
 * 'flags' is handed straight through to spin_lock_irqsave() /
 * spin_unlock_irqrestore().
 */

#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags)
#define LOGSYNC_UNLOCK(log, flags) \
	spin_unlock_irqrestore(&(log)->synclock, flags)
|
|
|
|
/*
 * compute the difference in bytes of lsn from sync point
 *
 * diff: lvalue receiving (lsn) - (log)->syncpt; when that is negative,
 *       (log)->logsize is added to it once
 * lsn:  log sequence number to measure
 * log:  pointer to an object with 'syncpt' and 'logsize' members
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and can be used safely in an unbraced if/else.  'diff' is
 * evaluated more than once, so it must not have side effects.
 */
#define logdiff(diff, lsn, log)\
do {\
	(diff) = (lsn) - (log)->syncpt;\
	if ((diff) < 0)\
		(diff) += (log)->logsize;\
} while (0)
|
|
|
|
extern int lmLogOpen(struct super_block *sb);
|
|
extern int lmLogClose(struct super_block *sb);
|
|
extern int lmLogShutdown(struct jfs_log * log);
|
|
extern int lmLogInit(struct jfs_log * log);
|
|
extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
|
|
extern void jfs_flush_journal(struct jfs_log * log, int wait);
|
|
extern void jfs_syncpt(struct jfs_log *log);
|
|
|
|
#endif /* _H_JFS_LOGMGR */
|