Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)vdev_raidz.c	1.10	07/11/27 SMI"
     28 
     29 #include <sys/zfs_context.h>
     30 #include <sys/spa.h>
     31 #include <sys/vdev_impl.h>
     32 #include <sys/zio.h>
     33 #include <sys/zio_checksum.h>
     34 #include <sys/fs/zfs.h>
     35 #include <sys/fm/fs/zfs.h>
     36 
     37 /*
     38  * Virtual device vector for RAID-Z.
     39  *
     40  * This vdev supports both single and double parity. For single parity, we
     41  * use a simple XOR of all the data columns. For double parity, we use both
     42  * the simple XOR as well as a technique described in "The mathematics of
     43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
     45  * the field are defined as follows:
     46  *
     47  *   o addition (+) is represented by a bitwise XOR
     48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
     49  *   o multiplication of A by 2 is defined by the following bitwise expression:
     50  *	(A * 2)_7 = A_6
     51  *	(A * 2)_6 = A_5
     52  *	(A * 2)_5 = A_4
     53  *	(A * 2)_4 = A_3 + A_7
     54  *	(A * 2)_3 = A_2 + A_7
     55  *	(A * 2)_2 = A_1 + A_7
     56  *	(A * 2)_1 = A_0
     57  *	(A * 2)_0 = A_7
     58  *
     59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
     60  *
     61  * Observe that any number in the field (except for 0) can be expressed as a
     62  * power of 2 -- a generator for the field. We store a table of the powers of
     63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
     64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
     65  * than field addition). The inverse of a field element A (A^-1) is A^254.
     66  *
     67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
     68  * can be expressed by field operations:
     69  *
     70  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
     71  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
     72  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
     73  *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
     76  */
     77 
     78 typedef struct raidz_col {
     79 	uint64_t rc_devidx;		/* child device index for I/O */
     80 	uint64_t rc_offset;		/* device offset */
     81 	uint64_t rc_size;		/* I/O size */
     82 	void *rc_data;			/* I/O data */
     83 	int rc_error;			/* I/O error for this device */
     84 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
     85 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
     86 } raidz_col_t;
     87 
     88 typedef struct raidz_map {
     89 	uint64_t rm_cols;		/* Column count */
     90 	uint64_t rm_bigcols;		/* Number of oversized columns */
     91 	uint64_t rm_asize;		/* Actual total I/O size */
     92 	uint64_t rm_missingdata;	/* Count of missing data devices */
     93 	uint64_t rm_missingparity;	/* Count of missing parity devices */
     94 	uint64_t rm_firstdatacol;	/* First data column/parity count */
     95 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
     96 } raidz_map_t;
     97 
     98 #define	VDEV_RAIDZ_P		0
     99 #define	VDEV_RAIDZ_Q		1
    100 
    101 #define	VDEV_RAIDZ_MAXPARITY	2
    102 
    103 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
    104 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field. The table is
 * read eight entries per row, so row r starts at exponent 8 * r. The final
 * entry (exponent 255) is 0x01, the same value as entry 0.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[v] is the log base 2, in the field, of the byte value v.
 * The table is read eight entries per row, so row r starts at value 8 * r.
 * Entries 0 and 1 both hold 0x00; zero has no logarithm, so entry 0 is only
 * a placeholder.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
    177 
    178 /*
    179  * Multiply a given number by 2 raised to the given power.
    180  */
    181 static uint8_t
    182 vdev_raidz_exp2(uint_t a, int exp)
    183 {
    184 	if (a == 0)
    185 		return (0);
    186 
    187 	ASSERT(exp >= 0);
    188 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
    189 
    190 	exp += vdev_raidz_log2[a];
    191 	if (exp > 255)
    192 		exp -= 255;
    193 
    194 	return (vdev_raidz_pow2[exp]);
    195 }
    196 
    197 static raidz_map_t *
    198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    199     uint64_t nparity)
    200 {
    201 	raidz_map_t *rm;
    202 	uint64_t b = zio->io_offset >> unit_shift;
    203 	uint64_t s = zio->io_size >> unit_shift;
    204 	uint64_t f = b % dcols;
    205 	uint64_t o = (b / dcols) << unit_shift;
    206 	uint64_t q, r, c, bc, col, acols, coff, devidx;
    207 
    208 	q = s / (dcols - nparity);
    209 	r = s - q * (dcols - nparity);
    210 	bc = (r == 0 ? 0 : r + nparity);
    211 
    212 	acols = (q == 0 ? bc : dcols);
    213 
    214 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
    215 
    216 	rm->rm_cols = acols;
    217 	rm->rm_bigcols = bc;
    218 	rm->rm_asize = 0;
    219 	rm->rm_missingdata = 0;
    220 	rm->rm_missingparity = 0;
    221 	rm->rm_firstdatacol = nparity;
    222 
    223 	for (c = 0; c < acols; c++) {
    224 		col = f + c;
    225 		coff = o;
    226 		if (col >= dcols) {
    227 			col -= dcols;
    228 			coff += 1ULL << unit_shift;
    229 		}
    230 		rm->rm_col[c].rc_devidx = col;
    231 		rm->rm_col[c].rc_offset = coff;
    232 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
    233 		rm->rm_col[c].rc_data = NULL;
    234 		rm->rm_col[c].rc_error = 0;
    235 		rm->rm_col[c].rc_tried = 0;
    236 		rm->rm_col[c].rc_skipped = 0;
    237 		rm->rm_asize += rm->rm_col[c].rc_size;
    238 	}
    239 
    240 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
    241 
    242 	for (c = 0; c < rm->rm_firstdatacol; c++)
    243 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
    244 
    245 	rm->rm_col[c].rc_data = zio->io_data;
    246 
    247 	for (c = c + 1; c < acols; c++)
    248 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
    249 		    rm->rm_col[c - 1].rc_size;
    250 
    251 	/*
    252 	 * If all data stored spans all columns, there's a danger that parity
    253 	 * will always be on the same device and, since parity isn't read
    254 	 * during normal operation, that that device's I/O bandwidth won't be
    255 	 * used effectively. We therefore switch the parity every 1MB.
    256 	 *
    257 	 * ... at least that was, ostensibly, the theory. As a practical
    258 	 * matter unless we juggle the parity between all devices evenly, we
    259 	 * won't see any benefit. Further, occasional writes that aren't a
    260 	 * multiple of the LCM of the number of children and the minimum
    261 	 * stripe width are sufficient to avoid pessimal behavior.
    262 	 * Unfortunately, this decision created an implicit on-disk format
    263 	 * requirement that we need to support for all eternity, but only
    264 	 * for single-parity RAID-Z.
    265 	 */
    266 	ASSERT(rm->rm_cols >= 2);
    267 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
    268 
    269 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
    270 		devidx = rm->rm_col[0].rc_devidx;
    271 		o = rm->rm_col[0].rc_offset;
    272 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
    273 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
    274 		rm->rm_col[1].rc_devidx = devidx;
    275 		rm->rm_col[1].rc_offset = o;
    276 	}
    277 
    278 	zio->io_vsd = rm;
    279 	return (rm);
    280 }
    281 
    282 static void
    283 vdev_raidz_map_free(zio_t *zio)
    284 {
    285 	raidz_map_t *rm = zio->io_vsd;
    286 	int c;
    287 
    288 	for (c = 0; c < rm->rm_firstdatacol; c++)
    289 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
    290 
    291 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
    292 	zio->io_vsd = NULL;
    293 }
    294 
    295 static void
    296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
    297 {
    298 	uint64_t *p, *src, pcount, ccount, i;
    299 	int c;
    300 
    301 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    302 
    303 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    304 		src = rm->rm_col[c].rc_data;
    305 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    306 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    307 
    308 		if (c == rm->rm_firstdatacol) {
    309 			ASSERT(ccount == pcount);
    310 			for (i = 0; i < ccount; i++, p++, src++) {
    311 				*p = *src;
    312 			}
    313 		} else {
    314 			ASSERT(ccount <= pcount);
    315 			for (i = 0; i < ccount; i++, p++, src++) {
    316 				*p ^= *src;
    317 			}
    318 		}
    319 	}
    320 }
    321 
    322 static void
    323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
    324 {
    325 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
    326 	int c;
    327 
    328 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    329 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
    330 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    331 
    332 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    333 		src = rm->rm_col[c].rc_data;
    334 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    335 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    336 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    337 
    338 		if (c == rm->rm_firstdatacol) {
    339 			ASSERT(ccount == pcount || ccount == 0);
    340 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    341 				*q = *src;
    342 				*p = *src;
    343 			}
    344 			for (; i < pcount; i++, p++, q++, src++) {
    345 				*q = 0;
    346 				*p = 0;
    347 			}
    348 		} else {
    349 			ASSERT(ccount <= pcount);
    350 
    351 			/*
    352 			 * Rather than multiplying each byte individually (as
    353 			 * described above), we are able to handle 8 at once
    354 			 * by generating a mask based on the high bit in each
    355 			 * byte and using that to conditionally XOR in 0x1d.
    356 			 */
    357 			for (i = 0; i < ccount; i++, p++, q++, src++) {
    358 				mask = *q & 0x8080808080808080ULL;
    359 				mask = (mask << 1) - (mask >> 7);
    360 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    361 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    362 				*q ^= *src;
    363 				*p ^= *src;
    364 			}
    365 
    366 			/*
    367 			 * Treat short columns as though they are full of 0s.
    368 			 */
    369 			for (; i < pcount; i++, q++) {
    370 				mask = *q & 0x8080808080808080ULL;
    371 				mask = (mask << 1) - (mask >> 7);
    372 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
    373 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    374 			}
    375 		}
    376 	}
    377 }
    378 
    379 static void
    380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
    381 {
    382 	uint64_t *dst, *src, xcount, ccount, count, i;
    383 	int c;
    384 
    385 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    386 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
    387 	ASSERT(xcount > 0);
    388 
    389 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    390 	dst = rm->rm_col[x].rc_data;
    391 	for (i = 0; i < xcount; i++, dst++, src++) {
    392 		*dst = *src;
    393 	}
    394 
    395 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    396 		src = rm->rm_col[c].rc_data;
    397 		dst = rm->rm_col[x].rc_data;
    398 
    399 		if (c == x)
    400 			continue;
    401 
    402 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    403 		count = MIN(ccount, xcount);
    404 
    405 		for (i = 0; i < count; i++, dst++, src++) {
    406 			*dst ^= *src;
    407 		}
    408 	}
    409 }
    410 
    411 static void
    412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
    413 {
    414 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
    415 	uint8_t *b;
    416 	int c, j, exp;
    417 
    418 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
    419 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
    420 
    421 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    422 		src = rm->rm_col[c].rc_data;
    423 		dst = rm->rm_col[x].rc_data;
    424 
    425 		if (c == x)
    426 			ccount = 0;
    427 		else
    428 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
    429 
    430 		count = MIN(ccount, xcount);
    431 
    432 		if (c == rm->rm_firstdatacol) {
    433 			for (i = 0; i < count; i++, dst++, src++) {
    434 				*dst = *src;
    435 			}
    436 			for (; i < xcount; i++, dst++) {
    437 				*dst = 0;
    438 			}
    439 
    440 		} else {
    441 			/*
    442 			 * For an explanation of this, see the comment in
    443 			 * vdev_raidz_generate_parity_pq() above.
    444 			 */
    445 			for (i = 0; i < count; i++, dst++, src++) {
    446 				mask = *dst & 0x8080808080808080ULL;
    447 				mask = (mask << 1) - (mask >> 7);
    448 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    449 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    450 				*dst ^= *src;
    451 			}
    452 
    453 			for (; i < xcount; i++, dst++) {
    454 				mask = *dst & 0x8080808080808080ULL;
    455 				mask = (mask << 1) - (mask >> 7);
    456 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
    457 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
    458 			}
    459 		}
    460 	}
    461 
    462 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    463 	dst = rm->rm_col[x].rc_data;
    464 	exp = 255 - (rm->rm_cols - 1 - x);
    465 
    466 	for (i = 0; i < xcount; i++, dst++, src++) {
    467 		*dst ^= *src;
    468 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
    469 			*b = vdev_raidz_exp2(*b, exp);
    470 		}
    471 	}
    472 }
    473 
    474 static void
    475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
    476 {
    477 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
    478 	void *pdata, *qdata;
    479 	uint64_t xsize, ysize, i;
    480 
    481 	ASSERT(x < y);
    482 	ASSERT(x >= rm->rm_firstdatacol);
    483 	ASSERT(y < rm->rm_cols);
    484 
    485 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
    486 
    487 	/*
    488 	 * Move the parity data aside -- we're going to compute parity as
    489 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
    490 	 * reuse the parity generation mechanism without trashing the actual
    491 	 * parity so we make those columns appear to be full of zeros by
    492 	 * setting their lengths to zero.
    493 	 */
    494 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    495 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    496 	xsize = rm->rm_col[x].rc_size;
    497 	ysize = rm->rm_col[y].rc_size;
    498 
    499 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
    500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
    501 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
    502 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    503 	rm->rm_col[x].rc_size = 0;
    504 	rm->rm_col[y].rc_size = 0;
    505 
    506 	vdev_raidz_generate_parity_pq(rm);
    507 
    508 	rm->rm_col[x].rc_size = xsize;
    509 	rm->rm_col[y].rc_size = ysize;
    510 
    511 	p = pdata;
    512 	q = qdata;
    513 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
    514 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
    515 	xd = rm->rm_col[x].rc_data;
    516 	yd = rm->rm_col[y].rc_data;
    517 
    518 	/*
    519 	 * We now have:
    520 	 *	Pxy = P + D_x + D_y
    521 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
    522 	 *
    523 	 * We can then solve for D_x:
    524 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
    525 	 * where
    526 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
    527 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
    528 	 *
    529 	 * With D_x in hand, we can easily solve for D_y:
    530 	 *	D_y = P + Pxy + D_x
    531 	 */
    532 
    533 	a = vdev_raidz_pow2[255 + x - y];
    534 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
    535 	tmp = 255 - vdev_raidz_log2[a ^ 1];
    536 
    537 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
    538 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
    539 
    540 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
    541 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
    542 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
    543 
    544 		if (i < ysize)
    545 			*yd = *p ^ *pxy ^ *xd;
    546 	}
    547 
    548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
    549 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
    550 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
    551 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    552 
    553 	/*
    554 	 * Restore the saved parity data.
    555 	 */
    556 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
    557 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
    558 }
    559 
    560 
    561 static int
    562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
    563 {
    564 	vdev_t *cvd;
    565 	uint64_t nparity = vd->vdev_nparity;
    566 	int c, error;
    567 	int lasterror = 0;
    568 	int numerrors = 0;
    569 
    570 	ASSERT(nparity > 0);
    571 
    572 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
    573 	    vd->vdev_children < nparity + 1) {
    574 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
    575 		return (EINVAL);
    576 	}
    577 
    578 	for (c = 0; c < vd->vdev_children; c++) {
    579 		cvd = vd->vdev_child[c];
    580 
    581 		if ((error = vdev_open(cvd)) != 0) {
    582 			lasterror = error;
    583 			numerrors++;
    584 			continue;
    585 		}
    586 
    587 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
    588 		*ashift = MAX(*ashift, cvd->vdev_ashift);
    589 	}
    590 
    591 	*asize *= vd->vdev_children;
    592 
    593 	if (numerrors > nparity) {
    594 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
    595 		return (lasterror);
    596 	}
    597 
    598 	return (0);
    599 }
    600 
    601 static void
    602 vdev_raidz_close(vdev_t *vd)
    603 {
    604 	int c;
    605 
    606 	for (c = 0; c < vd->vdev_children; c++)
    607 		vdev_close(vd->vdev_child[c]);
    608 }
    609 
    610 static uint64_t
    611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
    612 {
    613 	uint64_t asize;
    614 	uint64_t ashift = vd->vdev_top->vdev_ashift;
    615 	uint64_t cols = vd->vdev_children;
    616 	uint64_t nparity = vd->vdev_nparity;
    617 
    618 	asize = ((psize - 1) >> ashift) + 1;
    619 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
    620 	asize = roundup(asize, nparity + 1) << ashift;
    621 
    622 	return (asize);
    623 }
    624 
    625 static void
    626 vdev_raidz_child_done(zio_t *zio)
    627 {
    628 	raidz_col_t *rc = zio->io_private;
    629 
    630 	rc->rc_error = zio->io_error;
    631 	rc->rc_tried = 1;
    632 	rc->rc_skipped = 0;
    633 }
    634 
    635 static void
    636 vdev_raidz_repair_done(zio_t *zio)
    637 {
    638 	ASSERT(zio->io_private == zio->io_parent);
    639 	vdev_raidz_map_free(zio->io_private);
    640 }
    641 
    642 static int
    643 vdev_raidz_io_start(zio_t *zio)
    644 {
    645 	vdev_t *vd = zio->io_vd;
    646 	vdev_t *tvd = vd->vdev_top;
    647 	vdev_t *cvd;
    648 	blkptr_t *bp = zio->io_bp;
    649 	raidz_map_t *rm;
    650 	raidz_col_t *rc;
    651 	int c;
    652 
    653 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
    654 	    vd->vdev_nparity);
    655 
    656 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
    657 
    658 	if (zio->io_type == ZIO_TYPE_WRITE) {
    659 		/*
    660 		 * Generate RAID parity in the first virtual columns.
    661 		 */
    662 		if (rm->rm_firstdatacol == 1)
    663 			vdev_raidz_generate_parity_p(rm);
    664 		else
    665 			vdev_raidz_generate_parity_pq(rm);
    666 
    667 		for (c = 0; c < rm->rm_cols; c++) {
    668 			rc = &rm->rm_col[c];
    669 			cvd = vd->vdev_child[rc->rc_devidx];
    670 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    671 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    672 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
    673 			    vdev_raidz_child_done, rc));
    674 		}
    675 
    676 		return (zio_wait_for_children_done(zio));
    677 	}
    678 
    679 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    680 
    681 	/*
    682 	 * Iterate over the columns in reverse order so that we hit the parity
    683 	 * last -- any errors along the way will force us to read the parity
    684 	 * data.
    685 	 */
    686 	for (c = rm->rm_cols - 1; c >= 0; c--) {
    687 		rc = &rm->rm_col[c];
    688 		cvd = vd->vdev_child[rc->rc_devidx];
    689 		if (!vdev_readable(cvd)) {
    690 			if (c >= rm->rm_firstdatacol)
    691 				rm->rm_missingdata++;
    692 			else
    693 				rm->rm_missingparity++;
    694 			rc->rc_error = ENXIO;
    695 			rc->rc_tried = 1;	/* don't even try */
    696 			rc->rc_skipped = 1;
    697 			continue;
    698 		}
    699 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
    700 			if (c >= rm->rm_firstdatacol)
    701 				rm->rm_missingdata++;
    702 			else
    703 				rm->rm_missingparity++;
    704 			rc->rc_error = ESTALE;
    705 			rc->rc_skipped = 1;
    706 			continue;
    707 		}
    708 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
    709 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
    710 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
    711 			    rc->rc_offset, rc->rc_data, rc->rc_size,
    712 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
    713 			    vdev_raidz_child_done, rc));
    714 		}
    715 	}
    716 
    717 	return (zio_wait_for_children_done(zio));
    718 }
    719 
    720 /*
    721  * Report a checksum error for a child of a RAID-Z device.
    722  */
    723 static void
    724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
    725 {
    726 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
    727 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
    728 	    vdev_description(vd));
    729 
    730 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
    731 		mutex_enter(&vd->vdev_stat_lock);
    732 		vd->vdev_stat.vs_checksum_errors++;
    733 		mutex_exit(&vd->vdev_stat_lock);
    734 	}
    735 
    736 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
    737 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
    738 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
    739 }
    740 
    741 /*
    742  * Generate the parity from the data columns. If we tried and were able to
    743  * read the parity without error, verify that the generated parity matches the
    744  * data we read. If it doesn't, we fire off a checksum error. Return the
    745  * number such failures.
    746  */
    747 static int
    748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
    749 {
    750 	void *orig[VDEV_RAIDZ_MAXPARITY];
    751 	int c, ret = 0;
    752 	raidz_col_t *rc;
    753 
    754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    755 		rc = &rm->rm_col[c];
    756 		if (!rc->rc_tried || rc->rc_error != 0)
    757 			continue;
    758 		orig[c] = zio_buf_alloc(rc->rc_size);
    759 		bcopy(rc->rc_data, orig[c], rc->rc_size);
    760 	}
    761 
    762 	if (rm->rm_firstdatacol == 1)
    763 		vdev_raidz_generate_parity_p(rm);
    764 	else
    765 		vdev_raidz_generate_parity_pq(rm);
    766 
    767 	for (c = 0; c < rm->rm_firstdatacol; c++) {
    768 		rc = &rm->rm_col[c];
    769 		if (!rc->rc_tried || rc->rc_error != 0)
    770 			continue;
    771 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
    772 			raidz_checksum_error(zio, rc);
    773 			rc->rc_error = ECKSUM;
    774 			ret++;
    775 		}
    776 		zio_buf_free(orig[c], rc->rc_size);
    777 	}
    778 
    779 	return (ret);
    780 }
    781 
/*
 * Counters of reads repaired by reconstruction. The P and Q counters are
 * bumped atomically in vdev_raidz_io_done() when a single missing data
 * column is rebuilt from P or from Q and the result passes its checksum.
 * NOTE(review): raidz_corrected_pq is presumably the two-column (P and Q)
 * counterpart -- confirm against the rest of vdev_raidz_io_done().
 */
static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
    785 
    786 static int
    787 vdev_raidz_io_done(zio_t *zio)
    788 {
    789 	vdev_t *vd = zio->io_vd;
    790 	vdev_t *cvd;
    791 	raidz_map_t *rm = zio->io_vsd;
    792 	raidz_col_t *rc, *rc1;
    793 	int unexpected_errors = 0;
    794 	int parity_errors = 0;
    795 	int parity_untried = 0;
    796 	int data_errors = 0;
    797 	int n, c, c1;
    798 
    799 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
    800 
    801 	zio->io_error = 0;
    802 	zio->io_numerrors = 0;
    803 
    804 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
    805 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
    806 
    807 	for (c = 0; c < rm->rm_cols; c++) {
    808 		rc = &rm->rm_col[c];
    809 
    810 		/*
    811 		 * We preserve any EIOs because those may be worth retrying;
    812 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
    813 		 */
    814 		if (rc->rc_error) {
    815 			if (zio->io_error != EIO)
    816 				zio->io_error = rc->rc_error;
    817 
    818 			if (c < rm->rm_firstdatacol)
    819 				parity_errors++;
    820 			else
    821 				data_errors++;
    822 
    823 			if (!rc->rc_skipped)
    824 				unexpected_errors++;
    825 
    826 			zio->io_numerrors++;
    827 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
    828 			parity_untried++;
    829 		}
    830 	}
    831 
    832 	if (zio->io_type == ZIO_TYPE_WRITE) {
    833 		/*
    834 		 * If this is not a failfast write, and we were able to
    835 		 * write enough columns to reconstruct the data, good enough.
    836 		 */
    837 		/* XXPOLICY */
    838 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
    839 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
    840 			zio->io_error = 0;
    841 
    842 		vdev_raidz_map_free(zio);
    843 
    844 		return (ZIO_PIPELINE_CONTINUE);
    845 	}
    846 
    847 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    848 	/*
    849 	 * There are three potential phases for a read:
    850 	 *	1. produce valid data from the columns read
    851 	 *	2. read all disks and try again
    852 	 *	3. perform combinatorial reconstruction
    853 	 *
    854 	 * Each phase is progressively both more expensive and less likely to
    855 	 * occur. If we encounter more errors than we can repair or all phases
    856 	 * fail, we have no choice but to return an error.
    857 	 */
    858 
    859 	/*
    860 	 * If the number of errors we saw was correctable -- less than or equal
    861 	 * to the number of parity disks read -- attempt to produce data that
    862 	 * has a valid checksum. Naturally, this case applies in the absence of
    863 	 * any errors.
    864 	 */
    865 	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
    866 		switch (data_errors) {
    867 		case 0:
    868 			if (zio_checksum_error(zio) == 0) {
    869 				zio->io_error = 0;
    870 
    871 				/*
    872 				 * If we read parity information (unnecessarily
    873 				 * as it happens since no reconstruction was
    874 				 * needed) regenerate and verify the parity.
    875 				 * We also regenerate parity when resilvering
    876 				 * so we can write it out to the failed device
    877 				 * later.
    878 				 */
    879 				if (parity_errors + parity_untried <
    880 				    rm->rm_firstdatacol ||
    881 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    882 					n = raidz_parity_verify(zio, rm);
    883 					unexpected_errors += n;
    884 					ASSERT(parity_errors + n <=
    885 					    rm->rm_firstdatacol);
    886 				}
    887 				goto done;
    888 			}
    889 			break;
    890 
    891 		case 1:
    892 			/*
    893 			 * We either attempt to read all the parity columns or
    894 			 * none of them. If we didn't try to read parity, we
    895 			 * wouldn't be here in the correctable case. There must
    896 			 * also have been fewer parity errors than parity
    897 			 * columns or, again, we wouldn't be in this code path.
    898 			 */
    899 			ASSERT(parity_untried == 0);
    900 			ASSERT(parity_errors < rm->rm_firstdatacol);
    901 
    902 			/*
    903 			 * Find the column that reported the error.
    904 			 */
    905 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    906 				rc = &rm->rm_col[c];
    907 				if (rc->rc_error != 0)
    908 					break;
    909 			}
    910 			ASSERT(c != rm->rm_cols);
    911 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    912 			    rc->rc_error == ESTALE);
    913 
    914 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
    915 				vdev_raidz_reconstruct_p(rm, c);
    916 			} else {
    917 				ASSERT(rm->rm_firstdatacol > 1);
    918 				vdev_raidz_reconstruct_q(rm, c);
    919 			}
    920 
    921 			if (zio_checksum_error(zio) == 0) {
    922 				zio->io_error = 0;
    923 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
    924 					atomic_inc_64(&raidz_corrected_p);
    925 				else
    926 					atomic_inc_64(&raidz_corrected_q);
    927 
    928 				/*
    929 				 * If there's more than one parity disk that
    930 				 * was successfully read, confirm that the
    931 				 * other parity disk produced the correct data.
    932 				 * This routine is suboptimal in that it
    933 				 * regenerates both the parity we wish to test
    934 				 * as well as the parity we just used to
    935 				 * perform the reconstruction, but this should
    936 				 * be a relatively uncommon case, and can be
    937 				 * optimized if it becomes a problem.
    938 				 * We also regenerate parity when resilvering
    939 				 * so we can write it out to the failed device
    940 				 * later.
    941 				 */
    942 				if (parity_errors < rm->rm_firstdatacol - 1 ||
    943 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
    944 					n = raidz_parity_verify(zio, rm);
    945 					unexpected_errors += n;
    946 					ASSERT(parity_errors + n <=
    947 					    rm->rm_firstdatacol);
    948 				}
    949 
    950 				goto done;
    951 			}
    952 			break;
    953 
    954 		case 2:
    955 			/*
    956 			 * Two data column errors require double parity.
    957 			 */
    958 			ASSERT(rm->rm_firstdatacol == 2);
    959 
    960 			/*
    961 			 * Find the two columns that reported errors.
    962 			 */
    963 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
    964 				rc = &rm->rm_col[c];
    965 				if (rc->rc_error != 0)
    966 					break;
    967 			}
    968 			ASSERT(c != rm->rm_cols);
    969 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    970 			    rc->rc_error == ESTALE);
    971 
    972 			for (c1 = c++; c < rm->rm_cols; c++) {
    973 				rc = &rm->rm_col[c];
    974 				if (rc->rc_error != 0)
    975 					break;
    976 			}
    977 			ASSERT(c != rm->rm_cols);
    978 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
    979 			    rc->rc_error == ESTALE);
    980 
    981 			vdev_raidz_reconstruct_pq(rm, c1, c);
    982 
    983 			if (zio_checksum_error(zio) == 0) {
    984 				zio->io_error = 0;
    985 				atomic_inc_64(&raidz_corrected_pq);
    986 
    987 				goto done;
    988 			}
    989 			break;
    990 
    991 		default:
    992 			ASSERT(rm->rm_firstdatacol <= 2);
    993 			ASSERT(0);
    994 		}
    995 	}
    996 
    997 	/*
    998 	 * This isn't a typical situation -- either we got a read error or
    999 	 * a child silently returned bad data. Read every block so we can
   1000 	 * try again with as much data and parity as we can track down. If
   1001 	 * we've already been through once before, all children will be marked
   1002 	 * as tried so we'll proceed to combinatorial reconstruction.
   1003 	 */
   1004 	unexpected_errors = 1;
   1005 	rm->rm_missingdata = 0;
   1006 	rm->rm_missingparity = 0;
   1007 
   1008 	for (c = 0; c < rm->rm_cols; c++) {
   1009 		if (rm->rm_col[c].rc_tried)
   1010 			continue;
   1011 
   1012 		zio->io_error = 0;
   1013 		zio_vdev_io_redone(zio);
   1014 		do {
   1015 			rc = &rm->rm_col[c];
   1016 			if (rc->rc_tried)
   1017 				continue;
   1018 			zio_nowait(zio_vdev_child_io(zio, NULL,
   1019 			    vd->vdev_child[rc->rc_devidx],
   1020 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1021 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
   1022 			    vdev_raidz_child_done, rc));
   1023 		} while (++c < rm->rm_cols);
   1024 		dprintf("rereading\n");
   1025 
   1026 		return (zio_wait_for_children_done(zio));
   1027 	}
   1028 
   1029 	/*
   1030 	 * At this point we've attempted to reconstruct the data given the
   1031 	 * errors we detected, and we've attempted to read all columns. There
   1032 	 * must, therefore, be one or more additional problems -- silent errors
   1033 	 * resulting in invalid data rather than explicit I/O errors resulting
   1034 	 * in absent data. Before we attempt combinatorial reconstruction make
   1035 	 * sure we have a chance of coming up with the right answer.
   1036 	 */
   1037 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
   1038 		ASSERT(zio->io_error != 0);
   1039 		goto done;
   1040 	}
   1041 
   1042 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
   1043 		/*
   1044 		 * Attempt to reconstruct the data from parity P.
   1045 		 */
   1046 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1047 			void *orig;
   1048 			rc = &rm->rm_col[c];
   1049 
   1050 			orig = zio_buf_alloc(rc->rc_size);
   1051 			bcopy(rc->rc_data, orig, rc->rc_size);
   1052 			vdev_raidz_reconstruct_p(rm, c);
   1053 
   1054 			if (zio_checksum_error(zio) == 0) {
   1055 				zio_buf_free(orig, rc->rc_size);
   1056 				zio->io_error = 0;
   1057 				atomic_inc_64(&raidz_corrected_p);
   1058 
   1059 				/*
   1060 				 * If this child didn't know that it returned
   1061 				 * bad data, inform it.
   1062 				 */
   1063 				if (rc->rc_tried && rc->rc_error == 0)
   1064 					raidz_checksum_error(zio, rc);
   1065 				rc->rc_error = ECKSUM;
   1066 				goto done;
   1067 			}
   1068 
   1069 			bcopy(orig, rc->rc_data, rc->rc_size);
   1070 			zio_buf_free(orig, rc->rc_size);
   1071 		}
   1072 	}
   1073 
   1074 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
   1075 		/*
   1076 		 * Attempt to reconstruct the data from parity Q.
   1077 		 */
   1078 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
   1079 			void *orig;
   1080 			rc = &rm->rm_col[c];
   1081 
   1082 			orig = zio_buf_alloc(rc->rc_size);
   1083 			bcopy(rc->rc_data, orig, rc->rc_size);
   1084 			vdev_raidz_reconstruct_q(rm, c);
   1085 
   1086 			if (zio_checksum_error(zio) == 0) {
   1087 				zio_buf_free(orig, rc->rc_size);
   1088 				zio->io_error = 0;
   1089 				atomic_inc_64(&raidz_corrected_q);
   1090 
   1091 				/*
   1092 				 * If this child didn't know that it returned
   1093 				 * bad data, inform it.
   1094 				 */
   1095 				if (rc->rc_tried && rc->rc_error == 0)
   1096 					raidz_checksum_error(zio, rc);
   1097 				rc->rc_error = ECKSUM;
   1098 				goto done;
   1099 			}
   1100 
   1101 			bcopy(orig, rc->rc_data, rc->rc_size);
   1102 			zio_buf_free(orig, rc->rc_size);
   1103 		}
   1104 	}
   1105 
   1106 	if (rm->rm_firstdatacol > 1 &&
   1107 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
   1108 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
   1109 		/*
   1110 		 * Attempt to reconstruct the data from both P and Q.
   1111 		 */
   1112 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
   1113 			void *orig, *orig1;
   1114 			rc = &rm->rm_col[c];
   1115 
   1116 			orig = zio_buf_alloc(rc->rc_size);
   1117 			bcopy(rc->rc_data, orig, rc->rc_size);
   1118 
   1119 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
   1120 				rc1 = &rm->rm_col[c1];
   1121 
   1122 				orig1 = zio_buf_alloc(rc1->rc_size);
   1123 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
   1124 
   1125 				vdev_raidz_reconstruct_pq(rm, c, c1);
   1126 
   1127 				if (zio_checksum_error(zio) == 0) {
   1128 					zio_buf_free(orig, rc->rc_size);
   1129 					zio_buf_free(orig1, rc1->rc_size);
   1130 					zio->io_error = 0;
   1131 					atomic_inc_64(&raidz_corrected_pq);
   1132 
   1133 					/*
   1134 					 * If these children didn't know they
   1135 					 * returned bad data, inform them.
   1136 					 */
   1137 					if (rc->rc_tried && rc->rc_error == 0)
   1138 						raidz_checksum_error(zio, rc);
   1139 					if (rc1->rc_tried && rc1->rc_error == 0)
   1140 						raidz_checksum_error(zio, rc1);
   1141 
   1142 					rc->rc_error = ECKSUM;
   1143 					rc1->rc_error = ECKSUM;
   1144 
   1145 					goto done;
   1146 				}
   1147 
   1148 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
   1149 				zio_buf_free(orig1, rc1->rc_size);
   1150 			}
   1151 
   1152 			bcopy(orig, rc->rc_data, rc->rc_size);
   1153 			zio_buf_free(orig, rc->rc_size);
   1154 		}
   1155 	}
   1156 
   1157 	/*
   1158 	 * All combinations failed to checksum. Generate checksum ereports for
   1159 	 * all children.
   1160 	 */
   1161 	zio->io_error = ECKSUM;
   1162 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   1163 		for (c = 0; c < rm->rm_cols; c++) {
   1164 			rc = &rm->rm_col[c];
   1165 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
   1166 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
   1167 			    rc->rc_offset, rc->rc_size);
   1168 		}
   1169 	}
   1170 
   1171 done:
   1172 	zio_checksum_verified(zio);
   1173 
   1174 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
   1175 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
   1176 		zio_t *rio;
   1177 
   1178 		/*
   1179 		 * Use the good data we have in hand to repair damaged children.
   1180 		 *
   1181 		 * We issue all repair I/Os as children of 'rio' to arrange
   1182 		 * that vdev_raidz_map_free(zio) will be invoked after all
   1183 		 * repairs complete, but before we advance to the next stage.
   1184 		 */
   1185 		rio = zio_null(zio, zio->io_spa,
   1186 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
   1187 
   1188 		for (c = 0; c < rm->rm_cols; c++) {
   1189 			rc = &rm->rm_col[c];
   1190 			cvd = vd->vdev_child[rc->rc_devidx];
   1191 
   1192 			if (rc->rc_error == 0)
   1193 				continue;
   1194 
   1195 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
   1196 			    vdev_description(vd),
   1197 			    vdev_description(cvd),
   1198 			    zio->io_offset, rc->rc_error);
   1199 
   1200 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
   1201 			    rc->rc_offset, rc->rc_data, rc->rc_size,
   1202 			    ZIO_TYPE_WRITE, zio->io_priority,
   1203 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
   1204 			    ZIO_FLAG_CANFAIL, NULL, NULL));
   1205 		}
   1206 
   1207 		zio_nowait(rio);
   1208 
   1209 		return (zio_wait_for_children_done(zio));
   1210 	}
   1211 
   1212 	vdev_raidz_map_free(zio);
   1213 
   1214 	return (ZIO_PIPELINE_CONTINUE);
   1215 }
   1216 
   1217 static void
   1218 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
   1219 {
   1220 	if (faulted > vd->vdev_nparity)
   1221 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
   1222 		    VDEV_AUX_NO_REPLICAS);
   1223 	else if (degraded + faulted != 0)
   1224 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
   1225 	else
   1226 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
   1227 }
   1228 
    1229 vdev_ops_t vdev_raidz_ops = {	/* RAID-Z vdev vector; positional, so entry order must match vdev_ops_t */
    1230 	vdev_raidz_open,	/* presumably the open entry point -- confirm slot in vdev_ops_t */
    1231 	vdev_raidz_close,	/* presumably the close entry point -- confirm slot in vdev_ops_t */
    1232 	NULL,	/* slot left unset for RAID-Z -- NOTE(review): confirm which op this is */
    1233 	vdev_raidz_asize,	/* presumably the allocated-size routine -- confirm */
    1234 	vdev_raidz_io_start,	/* presumably the I/O start routine -- confirm */
    1235 	vdev_raidz_io_done,	/* presumably the I/O completion routine -- confirm */
    1236 	vdev_raidz_state_change,	/* sets HEALTHY/DEGRADED/CANT_OPEN from faulted/degraded counts */
    1237 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
    1238 	B_FALSE			/* not a leaf vdev */
    1239 };
   1240