Home | History | Annotate | Download | only in sys
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #ifndef	_SYS_ZIL_H
     27 #define	_SYS_ZIL_H
     28 
     29 #pragma ident	"@(#)zil.h	1.14	07/12/12 SMI"
     30 
     31 #include <sys/types.h>
     32 #include <sys/spa.h>
     33 #include <sys/zio.h>
     34 #include <sys/dmu.h>
     35 
     36 #ifdef	__cplusplus
     37 extern "C" {
     38 #endif
     39 
     40 /*
     41  * Intent log format:
     42  *
     43  * Each objset has its own intent log.  The log header (zil_header_t)
     44  * for objset N's intent log is kept in the Nth object of the SPA's
     45  * intent_log objset.  The log header points to a chain of log blocks,
     46  * each of which contains log records (i.e., transactions) followed by
     47  * a log block trailer (zil_trailer_t).  The format of a log record
     48  * depends on the record (or transaction) type, but all records begin
     49  * with a common structure that defines the type, length, and txg.
     50  */
     51 
     52 /*
     53  * Intent log header - this on disk structure holds fields to manage
     54  * the log.  All fields are 64 bit to easily handle cross architectures.
     55  */
     56 typedef struct zil_header {
     57 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
     58 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
     59 	blkptr_t zh_log;	/* log chain */
     60 	uint64_t zh_claim_seq;	/* highest claimed sequence number */
     61 	uint64_t zh_pad[5];
     62 } zil_header_t;
     63 
     64 /*
     65  * Log block trailer - structure at the end of the header and each log block
     66  *
     67  * The zit_bt contains a zbt_cksum which for the intent log is
     68  * the sequence number of this log block. A seq of 0 is invalid.
     69  * The zbt_cksum is checked by the SPA against the sequence
     70  * number passed in the blk_cksum field of the blkptr_t
     71  */
     72 typedef struct zil_trailer {
     73 	uint64_t zit_pad;
     74 	blkptr_t zit_next_blk;	/* next block in chain */
     75 	uint64_t zit_nused;	/* bytes in log block used */
     76 	zio_block_tail_t zit_bt; /* block trailer */
     77 } zil_trailer_t;
     78 
     79 #define	ZIL_MIN_BLKSZ	4096ULL
     80 #define	ZIL_MAX_BLKSZ	SPA_MAXBLOCKSIZE
     81 #define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))
     82 
     83 /*
     84  * The words of a log block checksum.
     85  */
     86 #define	ZIL_ZC_GUID_0	0
     87 #define	ZIL_ZC_GUID_1	1
     88 #define	ZIL_ZC_OBJSET	2
     89 #define	ZIL_ZC_SEQ	3
     90 
     91 typedef enum zil_create {
     92 	Z_FILE,
     93 	Z_DIR,
     94 	Z_XATTRDIR,
     95 } zil_create_t;
     96 
     97 /*
     98  * size of xvattr log section.
     99  * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
    100  * for create time and a single 64 bit integer for all of the attributes,
    101  * and 4 64 bit integers (32 bytes) for the scanstamp.
    102  *
    103  */
    104 
    105 #define	ZIL_XVAT_SIZE(mapsize) \
    106 	sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
    107 	(sizeof (uint64_t) * 7)
    108 
    109 /*
    110  * Size of ACL in log.  The ACE data is padded out to properly align
    111  * on 8 byte boundary.
    112  */
    113 
    114 #define	ZIL_ACE_LENGTH(x)	(roundup(x, sizeof (uint64_t)))
    115 
    116 /*
    117  * Intent log transaction types and record structures
    118  */
    119 #define	TX_CREATE		1	/* Create file */
    120 #define	TX_MKDIR		2	/* Make directory */
    121 #define	TX_MKXATTR		3	/* Make XATTR directory */
    122 #define	TX_SYMLINK		4	/* Create symbolic link to a file */
    123 #define	TX_REMOVE		5	/* Remove file */
    124 #define	TX_RMDIR		6	/* Remove directory */
    125 #define	TX_LINK			7	/* Create hard link to a file */
    126 #define	TX_RENAME		8	/* Rename a file */
    127 #define	TX_WRITE		9	/* File write */
    128 #define	TX_TRUNCATE		10	/* Truncate a file */
    129 #define	TX_SETATTR		11	/* Set file attributes */
    130 #define	TX_ACL_V0		12	/* Set old formatted ACL */
    131 #define	TX_ACL			13	/* Set ACL */
    132 #define	TX_CREATE_ACL		14	/* create with ACL */
    133 #define	TX_CREATE_ATTR		15	/* create + attrs */
    134 #define	TX_CREATE_ACL_ATTR 	16	/* create with ACL + attrs */
    135 #define	TX_MKDIR_ACL		17	/* mkdir with ACL */
    136 #define	TX_MKDIR_ATTR		18	/* mkdir with attr */
    137 #define	TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
    138 #define	TX_MAX_TYPE		20	/* Max transaction type */
    139 
    140 /*
    141  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
    142  * may have the following bit set, indicating the original request
    143  * specified case-insensitive handling of names.
    144  */
    145 #define	TX_CI	((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
    146 
    147 /*
    148  * Format of log records.
    149  * The fields are carefully defined to allow them to be aligned
    150  * and sized the same on sparc & intel architectures.
    151  * Each log record has a common structure at the beginning.
    152  *
    153  * Note, lrc_seq holds two different sequence numbers. Whilst in memory
    154  * it contains the transaction sequence number.  The log record on
    155  * disk holds the sequence number of all log records which is used to
    156  * ensure we don't replay the same record.  The two sequence numbers are
    157  * different because the transactions can now be pushed out of order.
    158  */
    159 typedef struct {			/* common log record header */
    160 	uint64_t	lrc_txtype;	/* intent log transaction type */
    161 	uint64_t	lrc_reclen;	/* transaction record length */
    162 	uint64_t	lrc_txg;	/* dmu transaction group number */
    163 	uint64_t	lrc_seq;	/* see comment above */
    164 } lr_t;
    165 
    166 /*
    167  * Handle option extended vattr attributes.
    168  *
    169  * Whenever new attributes are added the version number
    170  * will need to be updated as will code in
    171  * zfs_log.c and zfs_replay.c
    172  */
    173 typedef struct {
    174 	uint32_t	lr_attr_masksize; /* number of elements in array */
    175 	uint32_t	lr_attr_bitmap; /* First entry of array */
    176 	/* remainder of array and any additional fields */
    177 } lr_attr_t;
    178 
    179 /*
    180  * log record for creates without optional ACL.
    181  * This log record does support optional xvattr_t attributes.
    182  */
    183 typedef struct {
    184 	lr_t		lr_common;	/* common portion of log record */
    185 	uint64_t	lr_doid;	/* object id of directory */
    186 	uint64_t	lr_foid;	/* object id of created file object */
    187 	uint64_t	lr_mode;	/* mode of object */
    188 	uint64_t	lr_uid;		/* uid of object */
    189 	uint64_t	lr_gid;		/* gid of object */
    190 	uint64_t	lr_gen;		/* generation (txg of creation) */
    191 	uint64_t	lr_crtime[2];	/* creation time */
    192 	uint64_t	lr_rdev;	/* rdev of object to create */
    193 	/* name of object to create follows this */
    194 	/* for symlinks, link content follows name */
    195 	/* for creates with xvattr data, the name follows the xvattr info */
    196 } lr_create_t;
    197 
    198 /*
    199  * FUID ACL record will be an array of ACEs from the original ACL.
    200  * If this array includes ephemeral IDs, the record will also include
    201  * an array of log-specific FUIDs to replace the ephemeral IDs.
    202  * Only one copy of each unique domain will be present, so the log-specific
    203  * FUIDs will use an index into a compressed domain table.  On replay this
    204  * information will be used to construct real FUIDs (and bypass idmap,
    205  * since it may not be available).
    206  */
    207 
    208 /*
    209  * Log record for creates with optional ACL
    210  * This log record is also used for recording any FUID
    211  * information needed for replaying the create.  If the
    212  * file doesn't have any actual ACEs then the lr_aclcnt
    213  * would be zero.
    214  */
    215 typedef struct {
    216 	lr_create_t	lr_create;	/* common create portion */
    217 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
    218 	uint64_t	lr_domcnt;	/* number of unique domains */
    219 	uint64_t	lr_fuidcnt;	/* number of real fuids */
    220 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
    221 	uint64_t	lr_acl_flags;	/* ACL flags */
    222 	/* lr_acl_bytes number of variable sized ace's follows */
    223 	/* if create is also setting xvattr's, then acl data follows xvattr */
    224 	/* if ACE FUIDs are needed then they will follow the xvattr_t */
    225 	/* Following the FUIDs will be the domain table information. */
    226 	/* The FUIDs for the owner and group will be in the lr_create */
    227 	/* portion of the record. */
    228 	/* name follows ACL data */
    229 } lr_acl_create_t;
    230 
    231 typedef struct {
    232 	lr_t		lr_common;	/* common portion of log record */
    233 	uint64_t	lr_doid;	/* obj id of directory */
    234 	/* name of object to remove follows this */
    235 } lr_remove_t;
    236 
    237 typedef struct {
    238 	lr_t		lr_common;	/* common portion of log record */
    239 	uint64_t	lr_doid;	/* obj id of directory */
    240 	uint64_t	lr_link_obj;	/* obj id of link */
    241 	/* name of object to link follows this */
    242 } lr_link_t;
    243 
    244 typedef struct {
    245 	lr_t		lr_common;	/* common portion of log record */
    246 	uint64_t	lr_sdoid;	/* obj id of source directory */
    247 	uint64_t	lr_tdoid;	/* obj id of target directory */
    248 	/* 2 strings: names of source and destination follow this */
    249 } lr_rename_t;
    250 
    251 typedef struct {
    252 	lr_t		lr_common;	/* common portion of log record */
    253 	uint64_t	lr_foid;	/* file object to write */
    254 	uint64_t	lr_offset;	/* offset to write to */
    255 	uint64_t	lr_length;	/* user data length to write */
    256 	uint64_t	lr_blkoff;	/* offset represented by lr_blkptr */
    257 	blkptr_t	lr_blkptr;	/* spa block pointer for replay */
    258 	/* write data will follow for small writes */
    259 } lr_write_t;
    260 
    261 typedef struct {
    262 	lr_t		lr_common;	/* common portion of log record */
    263 	uint64_t	lr_foid;	/* object id of file to truncate */
    264 	uint64_t	lr_offset;	/* offset to truncate from */
    265 	uint64_t	lr_length;	/* length to truncate */
    266 } lr_truncate_t;
    267 
    268 typedef struct {
    269 	lr_t		lr_common;	/* common portion of log record */
    270 	uint64_t	lr_foid;	/* file object to change attributes */
    271 	uint64_t	lr_mask;	/* mask of attributes to set */
    272 	uint64_t	lr_mode;	/* mode to set */
    273 	uint64_t	lr_uid;		/* uid to set */
    274 	uint64_t	lr_gid;		/* gid to set */
    275 	uint64_t	lr_size;	/* size to set */
    276 	uint64_t	lr_atime[2];	/* access time */
    277 	uint64_t	lr_mtime[2];	/* modification time */
    278 	/* optional attribute lr_attr_t may be here */
    279 } lr_setattr_t;
    280 
    281 typedef struct {
    282 	lr_t		lr_common;	/* common portion of log record */
    283 	uint64_t	lr_foid;	/* obj id of file */
    284 	uint64_t	lr_aclcnt;	/* number of acl entries */
    285 	/* lr_aclcnt number of ace_t entries follow this */
    286 } lr_acl_v0_t;
    287 
    288 typedef struct {
    289 	lr_t		lr_common;	/* common portion of log record */
    290 	uint64_t	lr_foid;	/* obj id of file */
    291 	uint64_t	lr_aclcnt;	/* number of ACEs in ACL */
    292 	uint64_t	lr_domcnt;	/* number of unique domains */
    293 	uint64_t	lr_fuidcnt;	/* number of real fuids */
    294 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
    295 	uint64_t	lr_acl_flags;	/* ACL flags */
    296 	/* lr_acl_bytes number of variable sized ace's follows */
    297 } lr_acl_t;
    298 
    299 /*
    300  * ZIL structure definitions, interface function prototype and globals.
    301  */
    302 
    303 /*
    304  * ZFS intent log transaction structure
    305  */
    306 typedef enum {
    307 	WR_INDIRECT,	/* indirect - a large write (dmu_sync() data */
    308 			/* and put blkptr in log, rather than actual data) */
    309 	WR_COPIED,	/* immediate - data is copied into lr_write_t */
    310 	WR_NEED_COPY,	/* immediate - data needs to be copied if pushed */
    311 } itx_wr_state_t;
    312 
    313 typedef struct itx {
    314 	list_node_t	itx_node;	/* linkage on zl_itx_list */
    315 	void		*itx_private;	/* type-specific opaque data */
    316 	itx_wr_state_t	itx_wr_state;	/* write state */
    317 	uint8_t		itx_sync;	/* synchronous transaction */
    318 	lr_t		itx_lr;		/* common part of log record */
    319 	/* followed by type-specific part of lr_xx_t and its immediate data */
    320 } itx_t;
    321 
    322 
    323 /*
    324  * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
    325  * to handle the cleanup of the dmu_sync() buffer write
    326  */
    327 typedef struct {
    328 	zilog_t		*zgd_zilog;	/* zilog */
    329 	blkptr_t	*zgd_bp;	/* block pointer */
    330 	struct rl	*zgd_rl;	/* range lock */
    331 } zgd_t;
    332 
    333 
    334 typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
    335     uint64_t txg);
    336 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
    337     uint64_t txg);
    338 typedef int zil_replay_func_t();
    339 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
    340 
    341 extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    342     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
    343 
    344 extern void	zil_init(void);
    345 extern void	zil_fini(void);
    346 
    347 extern zilog_t	*zil_alloc(objset_t *os, zil_header_t *zh_phys);
    348 extern void	zil_free(zilog_t *zilog);
    349 
    350 extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
    351 extern void	zil_close(zilog_t *zilog);
    352 
    353 extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
    354     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
    355 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
    356 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
    357 
    358 extern itx_t	*zil_itx_create(uint64_t txtype, size_t lrsize);
    359 extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
    360 
    361 extern void	zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
    362 
    363 extern int	zil_claim(char *osname, void *txarg);
    364 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
    365 extern void	zil_clean(zilog_t *zilog);
    366 extern int	zil_is_committed(zilog_t *zilog);
    367 
    368 extern int	zil_suspend(zilog_t *zilog);
    369 extern void	zil_resume(zilog_t *zilog);
    370 
    371 extern void	zil_add_block(zilog_t *zilog, blkptr_t *bp);
    372 
    373 extern int zil_disable;
    374 
    375 #ifdef	__cplusplus
    376 }
    377 #endif
    378 
    379 #endif	/* _SYS_ZIL_H */
    380