Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)vdev_raidz.c	1.10	07/11/27 SMI"
     28 
     29 #include <sys/zfs_context.h>
     30 #include <sys/spa.h>
     31 #include <sys/vdev_impl.h>
     32 #include <sys/zio.h>
     33 #include <sys/zio_checksum.h>
     34 #include <sys/fs/zfs.h>
     35 #include <sys/fm/fs/zfs.h>
     36 
     37 /*
     38  * Virtual device vector for RAID-Z.
     39  *
     40  * This vdev supports both single and double parity. For single parity, we
     41  * use a simple XOR of all the data columns. For double parity, we use both
     42  * the simple XOR as well as a technique described in "The mathematics of
     43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
     44  * over the integers expressable in a single byte. Briefly, the operations on
     45  * the field are defined as follows:
     46  *
     47  *   o addition (+) is represented by a bitwise XOR
     48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     49  *   o multiplication of A by 2 is defined by the following bitwise expression:
     50  *	(A * 2)_7 = A_6
     51  *	(A * 2)_6 = A_5
     52  *	(A * 2)_5 = A_4
     53  *	(A * 2)_4 = A_3 + A_7
     54  *	(A * 2)_3 = A_2 + A_7
     55  *	(A * 2)_2 = A_1 + A_7
     56  *	(A * 2)_1 = A_0
     57  *	(A * 2)_0 = A_7
     58  *
     59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     60  *
     61  * Observe that any number in the field (except for 0) can be expressed as a
     62  * power of 2 -- a generator for the field. We store a table of the powers of
     63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     65  * than field addition). The inverse of a field element A (A^-1) is A^254.
     66  *
     67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
     68  * can be expressed by field operations:
     69  *
     70  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     71  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     72  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
     73  *
     74  * See the reconstruction code below for how P and Q can used individually or
     75  * in concert to recover missing data columns.
     76  */
     77 
     78 typedef struct raidz_col {
     79 	uint64_t rc_devidx;		/* child device index for I/O */
     80 	uint64_t rc_offset;		/* device offset */
     81 	uint64_t rc_size;		/* I/O size */
     82 	void *rc_data;			/* I/O data */
     83 	int rc_error;			/* I/O error for this device */
     84 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
     85 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
     86 } raidz_col_t;
     87 
     88 typedef struct raidz_map {
     89 	uint64_t rm_cols;		/* Column count */
     90 	uint64_t rm_bigcols;		/* Number of oversized columns */
     91 	uint64_t rm_asize;		/* Actual total I/O size */
     92 	uint64_t rm_missingdata;	/* Count of missing data devices */
     93 	uint64_t rm_missingparity;	/* Count of missing parity devices */
     94 	uint64_t rm_firstdatacol;	/* First data column/parity count */
     95 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
     96 } raidz_map_t;
     97 
     98 #define	VDEV_RAIDZ_P		0
     99 #define	VDEV_RAIDZ_Q		1
    100 
    101 #define	VDEV_RAIDZ_MAXPARITY	2
    102 
    103 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
    104 
    105 /*
    106  * These two tables represent powers and logs of 2 in the Galois field defined
    107  * above. These values were computed by repeatedly multiplying by 2 as above.
    108  */
    109 static const uint8_t vdev_raidz_pow2[256] = {
    110 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    111 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    112 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    113 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    114 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    115 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    116 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    117 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    118 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    119 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    120 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    121 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    122 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    123 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    124 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    125 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    126 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    127 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    128 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    129 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    130 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    131 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    132 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    133 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    134 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    135 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    136 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    137 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    138 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    139 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    140 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    141 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
    142 };
    143 static const uint8_t vdev_raidz_log2[256] = {
    144 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    145 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    146 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    147 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    148 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    149 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    150 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    151 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    152 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    153 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    154 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    155 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    156 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    157 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    158 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    159 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    160 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    161 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    162 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    163 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    164 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    165 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    166 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    167 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    168 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    169 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    170 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    171 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    172 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    173 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    174 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    175 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
    176 };
    177 
    178 /*
    179  * Multiply a given number by 2 raised to the given power.
    180  */
    181 static uint8_t
    182 vdev_raidz_exp2(uint_t a, int exp)
    183 {
    184 	if (a == 0)
    185 		return (0);
    186 
    187 	ASSERT(exp >= 0);
    188 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    189 
    190 	exp += vdev_raidz_log2[a];
    191 	if (exp > 255)
    192 		exp -= 255;
    193 
    194 	return (vdev_raidz_pow2[exp]);
    195 }
    196 
    197 static raidz_map_t *
    198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    199     uint64_t nparity)
    200 {
    201 	raidz_map_t *rm;
    202 	uint64_t b = zio->io_offset >> unit_shift;
    203 	uint64_t s = zio->io_size >> unit_shift;
    204 	uint64_t f = b % dcols;
    205 	uint64_t o = (b / dcols) << unit_shift;
    206 	uint64_t q, r, c, bc, col, acols, coff, devidx;
    207 
    208 	q = s / (dcols - nparity);
    209 	r = s - q * (dcols - nparity);
    210 	bc = (r == 0 ? 0 : r + nparity);
    211 
    212 	acols = (q == 0 ? bc : dcols);
    213 
    214 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
    215 
    216 	rm->rm_cols = acols;
    217 	rm->rm_bigcols = bc;
    218 	rm->rm_asize = 0;
    219 	rm->rm_missingdata = 0;
    220 	rm->rm_missingparity = 0;
    221 	rm->rm_firstdatacol = nparity;
    222 
    223 	for (c = 0; c < acols; c++) {
    224 		col = f + c;
    225 		coff = o;
    226 		if (col >= dcols) {
    227 			col -= dcols;
    228 			coff += 1ULL << unit_shift;
    229 		}
    230 		rm->rm_col[c].rc_devidx = col;
    231 		rm->rm_col[c].rc_offset = coff;
    232 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
    233 		rm->rm_col[c].rc_data = NULL;
    234 		rm->rm_col[c].rc_error = 0;
    235 		rm->rm_col[c].rc_tried = 0;
    236 		rm->rm_col[c].rc_skipped = 0;
    237 		rm->rm_asize += rm->rm_col[c].rc_size;
    238 	}
    239 
    240 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
    241 
    242 	for (c = 0; c < rm->rm_firstdatacol; c++)
    243 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
    244 
    245 	rm->rm_col[c].rc_data = zio->io_data;
    246 
    247 	for (c = c + 1; c < acols; c++)
    248 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
    249 		    rm->rm_col[c - 1].rc_size;
    250 
    251 	/*
    252 	 * If all data stored spans all columns, there's a danger that parity
    253 	 * will always be on the same device and, since parity isn't read
    254 	 * during normal operation, that that device's I/O bandwidth won't be
    255 	 * used effectively. We therefore switch the parity every 1MB.
    256 	 *
    257 	 * ... at least that was, ostensibly, the theory. As a practical
    258 	 * matter unless we juggle the parity between all devices evenly, we
    259 	 * won't see any benefit. Further, occasional writes that aren't a
    260 	 * multiple of the LCM of the number of children and the minimum
    261 	 * stripe width are sufficient to avoid pessimal behavior.
    262 	 * Unfortunately, this decision created an implicit on-disk format
    263 	 * requirement that we need to support for all eternity, but only
    264 	 * for single-parity RAID-Z.
    265 	 */
    266 	ASSERT(rm->rm_cols >= 2);
    267 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    268 
    269 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
    270 		devidx = rm->rm_col[0].rc_devidx;
    271 		o = rm->rm_col[0].rc_offset;
    272 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    273 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    274 		rm->rm_col[1].rc_devidx = devidx;
    275 		rm->rm_col[1].rc_offset = o;
    276 	}
    277 
    278 	zio->io_vsd = rm;
    279 	return (rm);
    280 }
    281 
    282 static void
    283 vdev_raidz_map_free(zio_t *zio)
    284 {
    285 	raidz_map_t *rm = zio->io_vsd;
    286 	int c;
    287 
    288 	for (c = 0; c < rm->rm_firstdatacol; c++)
    289 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
    290 
    291 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
    292 	zio->io_vsd = NULL;
    293 }
    294 
    295 static void
    296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
    297 {
    298 	uint64_t *p, *src, pcount, ccount, i;
    299 	int c;
    300 
    301 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    302 
    303 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    304 		src = rm->rm_col[c].rc_data;
    305 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    306 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    307 
    308 		if (c == rm->rm_firstdatacol) {
    309 			ASSERT(ccount == pcount);
    310 			for (i = 0; i < ccount; i++, p++, src++) {
    311 				*p = *src;
    312 			}
    313 		} else {
    314 			ASSERT(ccount <= pcount);
    315 			for (i = 0; i < ccount; i++, p++, src++) {
    316 				*p ^= *src;
    317 			}
    318 		}
    319 	}
    320 }
    321 
    322 static void
    323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    324 {
    325 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
    326 	int c;
    327 
    328 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    329 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    330 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    331 
    332 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    333 		src = rm->rm_col[c].rc_data;
    334 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    335 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    336 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    337 
    338 		if (c == rm->rm_firstdatacol) {
    339 			ASSERT(ccount == pcount || ccount == 0);
    340 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    341 				*q = *src;
    342 				*p = *src;
    343 			}
    344 			for (; i < pcount; i++, p++, q++, src++) {
    345 				*q = 0;
    346 				*p = 0;
    347 			}
    348 		} else {
    349 			ASSERT(ccount <= pcount);
    350 
    351 			/*
    352 			 * Rather than multiplying each byte individually (as
    353 			 * described above), we are able to handle 8 at once
    354 			 * by generating a mask based on the high bit in each
    355 			 * byte and using that to conditionally XOR in 0x1d.
    356 			 */
    357 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    358 				mask = *q & 0x8080808080808080ULL;
    359 				mask = (mask << 1) - (mask >> 7);
    360 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    361 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    362 				*q ^= *src;
    363 				*p ^= *src;
    364 			}
    365 
    366 			/*
    367 			 * Treat short columns as though they are full of 0s.
    368 			 */
    369 			for (; i < pcount; i++, q++) {
    370 				mask = *q & 0x8080808080808080ULL;
    371 				mask = (mask << 1) - (mask >> 7);
    372 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    373 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    374 			}
    375 		}
    376 	}
    377 }
    378 
    379 static void
    380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
    381 {
    382 	uint64_t *dst, *src, xcount, ccount, count, i;
    383 	int c;
    384 
    385 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    386 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    387 	ASSERT(xcount > 0);
    388 
    389 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    390 	dst = rm->rm_col[x].rc_data;
    391 	for (i = 0; i < xcount; i++, dst++, src++) {
    392 		*dst = *src;
    393 	}
    394 
    395 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    396 		src = rm->rm_col[c].rc_data;
    397 		dst = rm->rm_col[x].rc_data;
    398 
    399 		if (c == x)
    400 			continue;
    401 
    402 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    403 		count = MIN(ccount, xcount);
    404 
    405 		for (i = 0; i < count; i++, dst++, src++) {
    406 			*dst ^= *src;
    407 		}
    408 	}
    409 }
    410 
    411 static void
    412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
    413 {
    414 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    415 	uint8_t *b;
    416 	int c, j, exp;
    417 
    418 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    419 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    420 
    421 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    422 		src = rm->rm_col[c].rc_data;
    423 		dst = rm->rm_col[x].rc_data;
    424 
    425 		if (c == x)
    426 			ccount = 0;
    427 		else
    428 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    429 
    430 		count = MIN(ccount, xcount);
    431 
    432 		if (c == rm->rm_firstdatacol) {
    433 			for (i = 0; i < count; i++, dst++, src++) {
    434 				*dst = *src;
    435 			}
    436 			for (; i < xcount; i++, dst++) {
    437 				*dst = 0;
    438 			}
    439 
    440 		} else {
    441 			/*
    442 			 * For an explanation of this, see the comment in
    443 			 * vdev_raidz_generate_parity_pq() above.
    444 			 */
    445 			for (i = 0; i < count; i++, dst++, src++) {
    446 				mask = *dst & 0x8080808080808080ULL;
    447 				mask = (mask << 1) - (mask >> 7);
    448 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    449 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    450 				*dst ^= *src;
    451 			}
    452 
    453 			for (; i < xcount; i++, dst++) {
    454 				mask = *dst & 0x8080808080808080ULL;
    455 				mask = (mask << 1) - (mask >> 7);
    456 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    457 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    458 			}
    459 		}
    460 	}
    461 
    462 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    463 	dst = rm->rm_col[x].rc_data;
    464 	exp = 255 - (rm->rm_cols - 1 - x);
    465 
    466 	for (i = 0; i < xcount; i++, dst++, src++) {
    467 		*dst ^= *src;
    468 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    469 			*b = vdev_raidz_exp2(*b, exp);
    470 		}
    471 	}
    472 }
    473 
    474 static void
    475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
    476 {
    477 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    478 	void *pdata, *qdata;
    479 	uint64_t xsize, ysize, i;
    480 
    481 	ASSERT(x < y);
    482 	ASSERT(x >= rm->rm_firstdatacol);
    483 	ASSERT(y < rm->rm_cols);
    484 
    485 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    486 
    487 	/*
    488 	 * Move the parity data aside -- we're going to compute parity as
    489 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    490 	 * reuse the parity generation mechanism without trashing the actual
    491 	 * parity so we make those columns appear to be full of zeros by
    492 	 * setting their lengths to zero.
    493 	 */
    494 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    495 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    496 	xsize = rm->rm_col[x].rc_size;
    497 	ysize = rm->rm_col[y].rc_size;
    498 
    499 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    501 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    502 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    503 	rm->rm_col[x].rc_size = 0;
    504 	rm->rm_col[y].rc_size = 0;
    505 
    506 	vdev_raidz_generate_parity_pq(rm);
    507 
    508 	rm->rm_col[x].rc_size = xsize;
    509 	rm->rm_col[y].rc_size = ysize;
    510 
    511 	p = pdata;
    512 	q = qdata;
    513 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    514 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    515 	xd = rm->rm_col[x].rc_data;
    516 	yd = rm->rm_col[y].rc_data;
    517 
    518 	/*
    519 	 * We now have:
    520 	 *	Pxy = P + D_x + D_y
    521 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    522 	 *
    523 	 * We can then solve for D_x:
    524 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    525 	 * where
    526 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    527 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    528 	 *
    529 	 * With D_x in hand, we can easily solve for D_y:
    530 	 *	D_y = P + Pxy + D_x
    531 	 */
    532 
    533 	a = vdev_raidz_pow2[255 + x - y];
    534 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    535 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    536 
    537 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    538 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    539 
    540 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    541 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    542 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    543 
    544 		if (i < ysize)
    545 			*yd = *p ^ *pxy ^ *xd;
    546 	}
    547 
    548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    549 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    550 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    551 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    552 
    553 	/*
    554 	 * Restore the saved parity data.
    555 	 */
    556 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    557 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    558 }
    559 
    560 
    561 static int
    562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
    563 {
    564 	vdev_t *cvd;
    565 	uint64_t nparity = vd->vdev_nparity;
    566 	int c, error;
    567 	int lasterror = 0;
    568 	int numerrors = 0;
    569 
    570 	ASSERT(nparity > 0);
    571 
    572 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
    573 	    vd->vdev_children < nparity + 1) {
    574 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
    575 		return (EINVAL);
    576 	}
    577 
    578 	for (c = 0; c < vd->vdev_children; c++) {
    579 		cvd = vd->vdev_child[c];
    580 
    581 		if ((error = vdev_open(cvd)) != 0) {
    582 			lasterror = error;
    583 			numerrors++;
    584 			continue;
    585 		}
    586 
    587 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
    588 		*ashift = MAX(*ashift, cvd->vdev_ashift);
    589 	}
    590 
    591 	*asize *= vd->vdev_children;
    592 
    593 	if (numerrors > nparity) {
    594 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
    595 		return (lasterror);
    596 	}
    597 
    598 	return (0);
    599 }
    600 
    601 static void
    602 vdev_raidz_close(vdev_t *vd)
    603 {
    604 	int c;
    605 
    606 	for (c = 0; c < vd->vdev_children; c++)
    607 		vdev_close(vd->vdev_child[c]);
    608 }
    609 
    610 static uint64_t
    611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
    612 {
    613 	uint64_t asize;
    614 	uint64_t ashift = vd->vdev_top->vdev_ashift;
    615 	uint64_t cols = vd->vdev_children;
    616 	uint64_t nparity = vd->vdev_nparity;
    617 
    618 	asize = ((psize - 1) >> ashift) + 1;
    619 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
    620 	asize = roundup(asize, nparity + 1) << ashift;
    621 
    622 	return (asize);
    623 }
    624 
    625 static void
    626 vdev_raidz_child_done(zio_t *zio)
    627 {
    628 	raidz_col_t *rc = zio->io_private;
    629 
    630 	rc->rc_error = zio->io_error;
    631 	rc->rc_tried = 1;
    632 	rc->rc_skipped = 0;
    633 }
    634 
    635 static void
    636 vdev_raidz_repair_done(zio_t *zio)
    637 {
    638 	ASSERT(zio->io_private == zio->io_parent);
    639 	vdev_raidz_map_free(zio->io_private);
    640 }
    641 
    642 static int
    643 vdev_raidz_io_start(zio_t *zio)
    644 {
    645 	vdev_t *vd = zio->io_vd;
    646 	vdev_t *tvd = vd->vdev_top;
    647 	vdev_t *cvd;
    648 	blkptr_t *bp = zio->io_bp;
    649 	raidz_map_t *rm;
    650 	raidz_col_t *rc;
    651 	int c;
    652 
    653 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
    654 	    vd->vdev_nparity);
    655 
    656 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
    657 
    658 	if (zio->io_type == ZIO_TYPE_WRITE) {
    659 		/*
    660 		 * Generate RAID parity in the first virtual columns.
    661 		 */
    662 		if (rm->rm_firstdatacol == 1)
    663 			vdev_raidz_generate_parity_p(rm);
    664 		else
    665 			vdev_raidz_generate_parity_pq(rm);
    666 
    667 		for (c = 0; c < rm->rm_cols; c++) {
    668 			rc = &rm->rm_col[c];
    669 			cvd = vd->vdev_child[rc->rc_devidx];
    670 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    671 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    672 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
    673 			    vdev_raidz_child_done, rc));
    674 		}
    675 
    676 		return (zio_wait_for_children_done(zio));
    677 	}
    678 
    679 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    680 
    681 	/*
    682 	 * Iterate over the columns in reverse order so that we hit the parity
    683 	 * last -- any errors along the way will force us to read the parity
    684 	 * data.
    685 	 */
    686 	for (c = rm->rm_cols - 1; c >= 0; c--) {
    687 		rc = &rm->rm_col[c];
    688 		cvd = vd->vdev_child[rc->rc_devidx];
    689 		if (!vdev_readable(cvd)) {
    690 			if (c >= rm->rm_firstdatacol)
    691 				rm->rm_missingdata++;
    692 			else
    693 				rm->rm_missingparity++;
    694 			rc->rc_error = ENXIO;
    695 			rc->rc_tried = 1;	/* don't even try */
    696 			rc->rc_skipped = 1;
    697 			continue;
    698 		}
    699 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
    700 			if (c >= rm->rm_firstdatacol)
    701 				rm->rm_missingdata++;
    702 			else
    703 				rm->rm_missingparity++;
    704 			rc->rc_error = ESTALE;
    705 			rc->rc_skipped = 1;
    706 			continue;
    707 		}
    708 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
    709 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
    710 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    711 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    712 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
    713 			    vdev_raidz_child_done, rc));
    714 		}
    715 	}
    716 
    717 	return (zio_wait_for_children_done(zio));
    718 }
    719 
    720 /*
    721  * Report a checksum error for a child of a RAID-Z device.
    722  */
    723 static void
    724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
    725 {
    726 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
    727 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
    728 	    vdev_description(vd));
    729 
    730 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
    731 		mutex_enter(&vd->vdev_stat_lock);
    732 		vd->vdev_stat.vs_checksum_errors++;
    733 		mutex_exit(&vd->vdev_stat_lock);
    734 	}
    735 
    736 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
    737 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
    738 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
    739 }
    740 
    741 /*
    742  * Generate the parity from the data columns. If we tried and were able to
    743  * read the parity without error, verify that the generated parity matches the
    744  * data we read. If it doesn't, we fire off a checksum error. Return the
    745  * number such failures.
    746  */
    747 static int
    748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
    749 {
    750 	void *orig[VDEV_RAIDZ_MAXPARITY];
    751 	int c, ret = 0;
    752 	raidz_col_t *rc;
    753 
    754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    755 		rc = &rm->rm_col[c];
    756 		if (!rc->rc_tried || rc->rc_error != 0)
    757 			continue;
    758 		orig[c] = zio_buf_alloc(rc->rc_size);
    759 		bcopy(rc->rc_data, orig[c], rc->rc_size);
    760 	}
    761 
    762 	if (rm->rm_firstdatacol == 1)
    763 		vdev_raidz_generate_parity_p(rm);
    764 	else
    765 		vdev_raidz_generate_parity_pq(rm);
    766 
    767 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    768 		rc = &rm->rm_col[c];
    769 		if (!rc->rc_tried || rc->rc_error != 0)
    770 			continue;
    771 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
    772 			raidz_checksum_error(zio, rc);
    773 			rc->rc_error = ECKSUM;
    774 			ret++;
    775 		}
    776 		zio_buf_free(orig[c], rc->rc_size);
    777 	}
    778 
    779 	return (ret);
    780 }
    781 
    782 static uint64_t raidz_corrected_p;
    783 static uint64_t raidz_corrected_q;
    784 static uint64_t raidz_corrected_pq;
    785 
    786 static int
    787 vdev_raidz_io_done(zio_t *zio)
    788 {
    789 	vdev_t *vd = zio->io_vd;
    790 	vdev_t *cvd;
    791 	raidz_map_t *rm = zio->io_vsd;
    792 	raidz_col_t *rc, *rc1;
    793 	int unexpected_errors = 0;
    794 	int parity_errors = 0;
    795 	int parity_untried = 0;
    796 	int data_errors = 0;
    797 	int n, c, c1;
    798 
    799 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
    800 
    801 	zio->io_error = 0;
    802 	zio->io_numerrors = 0;
    803 
    804 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
    805 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
    806 
    807 	for (c = 0; c < rm->rm_cols; c++) {
    808 		rc = &rm->rm_col[c];
    809 
    810 		/*
    811 		 * We preserve any EIOs because those may be worth retrying;
    812 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
    813 		 */
    814 		if (rc->rc_error) {
    815 			if (zio->io_error != EIO)
    816 				zio->io_error = rc->rc_error;
    817 
    818 			if (c < rm->rm_firstdatacol)
    819 				parity_errors++;
    820 			else
    821 				data_errors++;
    822 
    823 			if (!rc->rc_skipped)
    824 				unexpected_errors++;
    825 
    826 			zio->io_numerrors++;
    827 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
    828 			parity_untried++;
    829 		}
    830 	}
    831 
    832 	if (zio->io_type == ZIO_TYPE_WRITE) {
    833 		/*
    834 		 * If this is not a failfast write, and we were able to
    835 		 * write enough columns to reconstruct the data, good enough.
    836 		 */
    837 		/* XXPOLICY */
    838 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
    839 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
    840 			zio->io_error = 0;
    841 
    842 		vdev_raidz_map_free(zio);
    843 
    844 		return (ZIO_PIPELINE_CONTINUE);
    845 	}
    846 
    847 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    848 	/*
    849 	 * There are three potential phases for a read:
    850 	 *	1. produce valid data from the columns read
    851 	 *	2. read all disks and try again
    852 	 *	3. perform combinatorial reconstruction
    853 	 *
    854 	 * Each phase is progressively both more expensive and less likely to
    855 	 * occur. If we encounter more errors than we can repair or all phases
    856 	 * fail, we have no choice but to return an error.
    857 	 */
    858 
    859 	/*
    860 	 * If the number of errors we saw was correctable -- less than or equal
    861 	 * to the number of parity disks read -- attempt to produce data that
    862 	 * has a valid checksum. Naturally, this case applies in the absence of
    863 	 * any errors.
    864 	 */
    865 	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
    866 		switch (data_errors) {
    867 		case 0:
    868 			if (zio_checksum_error(zio) == 0) {
    869 				zio->io_error = 0;
    870 
    871 				/*
    872 				 * If we read parity information (unnecessarily
    873 				 * as it happens since no reconstruction was
    874 				 * needed) regenerate and verify the parity.
    875 				 * We also regenerate parity when resilvering
    876 				 * so we can write it out to the failed device
    877 				 * later.
    878 				 */
    879 				if (parity_errors + parity_untried <
    880 				    rm->rm_firstdatacol ||
    881 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    882 					n = raidz_parity_verify(zio, rm);
    883 					unexpected_errors += n;
    884 					ASSERT(parity_errors + n <=
    885 					    rm->rm_firstdatacol);
    886 				}
    887 				goto done;
    888 			}
    889 			break;
    890 
    891 		case 1:
    892 			/*
    893 			 * We either attempt to read all the parity columns or
    894 			 * none of them. If we didn't try to read parity, we
    895 			 * wouldn't be here in the correctable case. There must
    896 			 * also have been fewer parity errors than parity
    897 			 * columns or, again, we wouldn't be in this code path.
    898 			 */
    899 			ASSERT(parity_untried == 0);
    900 			ASSERT(parity_errors < rm->rm_firstdatacol);
    901 
    902 			/*
    903 			 * Find the column that reported the error.
    904 			 */
    905 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    906 				rc = &rm->rm_col[c];
    907 				if (rc->rc_error != 0)
    908 					break;
    909 			}
    910 			ASSERT(c != rm->rm_cols);
    911 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    912 			    rc->rc_error == ESTALE);
    913 
    914 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
    915 				vdev_raidz_reconstruct_p(rm, c);
    916 			} else {
    917 				ASSERT(rm->rm_firstdatacol > 1);
    918 				vdev_raidz_reconstruct_q(rm, c);
    919 			}
    920 
    921 			if (zio_checksum_error(zio) == 0) {
    922 				zio->io_error = 0;
    923 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
    924 					atomic_inc_64(&raidz_corrected_p);
    925 				else
    926 					atomic_inc_64(&raidz_corrected_q);
    927 
    928 				/*
    929 				 * If there's more than one parity disk that
    930 				 * was successfully read, confirm that the
    931 				 * other parity disk produced the correct data.
    932 				 * This routine is suboptimal in that it
    933 				 * regenerates both the parity we wish to test
    934 				 * as well as the parity we just used to
    935 				 * perform the reconstruction, but this should
    936 				 * be a relatively uncommon case, and can be
    937 				 * optimized if it becomes a problem.
    938 				 * We also regenerate parity when resilvering
    939 				 * so we can write it out to the failed device
    940 				 * later.
    941 				 */
    942 				if (parity_errors < rm->rm_firstdatacol - 1 ||
    943 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    944 					n = raidz_parity_verify(zio, rm);
    945 					unexpected_errors += n;
    946 					ASSERT(parity_errors + n <=
    947 					    rm->rm_firstdatacol);
    948 				}
    949 
    950 				goto done;
    951 			}
    952 			break;
    953