Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)vdev_file.c	1.7	07/11/27 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/spa.h>
     30 #include <sys/vdev_file.h>
     31 #include <sys/vdev_impl.h>
     32 #include <sys/zio.h>
     33 #include <sys/fs/zfs.h>
     34 
     35 /*
     36  * Virtual device vector for files.
     37  */
     38 
     39 static int
     40 vdev_file_open_common(vdev_t *vd)
     41 {
     42 	vdev_file_t *vf;
     43 	vnode_t *vp;
     44 	int error;
     45 
     46 	/*
     47 	 * We must have a pathname, and it must be absolute.
     48 	 */
     49 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
     50 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
     51 		return (EINVAL);
     52 	}
     53 
     54 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
     55 
     56 	/*
     57 	 * We always open the files from the root of the global zone, even if
     58 	 * we're in a local zone.  If the user has gotten to this point, the
     59 	 * administrator has already decided that the pool should be available
     60 	 * to local zone users, so the underlying devices should be as well.
     61 	 */
     62 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
     63 	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
     64 	    spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
     65 
     66 	if (error) {
     67 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
     68 		return (error);
     69 	}
     70 
     71 	vf->vf_vnode = vp;
     72 
     73 #ifdef _KERNEL
     74 	/*
     75 	 * Make sure it's a regular file.
     76 	 */
     77 	if (vp->v_type != VREG) {
     78 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
     79 		return (ENODEV);
     80 	}
     81 #endif
     82 
     83 	return (0);
     84 }
     85 
     86 static int
     87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
     88 {
     89 	vdev_file_t *vf;
     90 	vattr_t vattr;
     91 	int error;
     92 
     93 	if ((error = vdev_file_open_common(vd)) != 0)
     94 		return (error);
     95 
     96 	vf = vd->vdev_tsd;
     97 
     98 	/*
     99 	 * Determine the physical size of the file.
    100 	 */
    101 	vattr.va_mask = AT_SIZE;
    102 	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
    103 	if (error) {
    104 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
    105 		return (error);
    106 	}
    107 
    108 	*psize = vattr.va_size;
    109 	*ashift = SPA_MINBLOCKSHIFT;
    110 
    111 	return (0);
    112 }
    113 
    114 static void
    115 vdev_file_close(vdev_t *vd)
    116 {
    117 	vdev_file_t *vf = vd->vdev_tsd;
    118 
    119 	if (vf == NULL)
    120 		return;
    121 
    122 	if (vf->vf_vnode != NULL) {
    123 		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
    124 		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
    125 		VN_RELE(vf->vf_vnode);
    126 	}
    127 
    128 	kmem_free(vf, sizeof (vdev_file_t));
    129 	vd->vdev_tsd = NULL;
    130 }
    131 
    132 static int
    133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
    134     enum uio_rw rw)
    135 {
    136 	vdev_file_t *vf = vd->vdev_tsd;
    137 	ssize_t resid;
    138 	int error = 0;
    139 
    140 	if (vd == NULL || vf == NULL || vf->vf_vnode == NULL)
    141 		return (EINVAL);
    142 
    143 	ASSERT(rw == UIO_READ || rw ==  UIO_WRITE);
    144 
    145 	error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
    146 	    0, RLIM64_INFINITY, kcred, &resid);
    147 	if (error || resid != 0)
    148 		return (EIO);
    149 	return (0);
    150 }
    151 
    152 /*
    153  * Determine if the underlying device is accessible by reading and writing
    154  * to a known location. We must be able to do this during syncing context
    155  * and thus we cannot set the vdev state directly.
    156  */
    157 static int
    158 vdev_file_probe(vdev_t *vd)
    159 {
    160 	vdev_t *nvd;
    161 	char *vl_boot;
    162 	uint64_t offset;
    163 	int l, error = 0, retries = 0;
    164 
    165 	if (vd == NULL)
    166 		return (EINVAL);
    167 
    168 	/* Hijack the current vdev */
    169 	nvd = vd;
    170 
    171 	/*
    172 	 * Pick a random label to rewrite.
    173 	 */
    174 	l = spa_get_random(VDEV_LABELS);
    175 	ASSERT(l < VDEV_LABELS);
    176 
    177 	offset = vdev_label_offset(vd->vdev_psize, l,
    178 	    offsetof(vdev_label_t, vl_boot_header));
    179 
    180 	vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
    181 
    182 	while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
    183 	    offset, UIO_READ)) != 0 && retries == 0) {
    184 
    185 		/*
    186 		 * If we failed with the vdev that was passed in then
    187 		 * try allocating a new one and try again.
    188 		 */
    189 		nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
    190 		if (vd->vdev_path)
    191 			nvd->vdev_path = spa_strdup(vd->vdev_path);
    192 		retries++;
    193 
    194 		error = vdev_file_open_common(nvd);
    195 		if (error)
    196 			break;
    197 	}
    198 
    199 	if ((spa_mode & FWRITE) && !error) {
    200 		error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
    201 		    offset, UIO_WRITE);
    202 	}
    203 
    204 	if (retries) {
    205 		vdev_file_close(nvd);
    206 		if (nvd->vdev_path)
    207 			spa_strfree(nvd->vdev_path);
    208 		kmem_free(nvd, sizeof (vdev_t));
    209 	}
    210 	kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
    211 
    212 	if (!error)
    213 		vd->vdev_is_failing = B_FALSE;
    214 
    215 	return (error);
    216 }
    217 
    218 static int
    219 vdev_file_io_start(zio_t *zio)
    220 {
    221 	vdev_t *vd = zio->io_vd;
    222 	vdev_file_t *vf = vd->vdev_tsd;
    223 	ssize_t resid;
    224 	int error;
    225 
    226 	if (zio->io_type == ZIO_TYPE_IOCTL) {
    227 		zio_vdev_io_bypass(zio);
    228 
    229 		/* XXPOLICY */
    230 		if (!vdev_readable(vd)) {
    231 			zio->io_error = ENXIO;
    232 			return (ZIO_PIPELINE_CONTINUE);
    233 		}
    234 
    235 		switch (zio->io_cmd) {
    236 		case DKIOCFLUSHWRITECACHE:
    237 			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
    238 			    kcred, NULL);
    239 			dprintf("fsync(%s) = %d\n", vdev_description(vd),
    240 			    zio->io_error);
    241 			break;
    242 		default:
    243 			zio->io_error = ENOTSUP;
    244 		}
    245 
    246 		return (ZIO_PIPELINE_CONTINUE);
    247 	}
    248 
    249 	/*
    250 	 * In the kernel, don't bother double-caching, but in userland,
    251 	 * we want to test the vdev_cache code.
    252 	 */
    253 #ifndef _KERNEL
    254 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
    255 		return (ZIO_PIPELINE_STOP);
    256 #endif
    257 
    258 	if ((zio = vdev_queue_io(zio)) == NULL)
    259 		return (ZIO_PIPELINE_STOP);
    260 
    261 	/* XXPOLICY */
    262 	if (zio->io_type == ZIO_TYPE_WRITE)
    263 		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
    264 	else
    265 		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
    266 	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
    267 	if (error) {
    268 		zio->io_error = error;
    269 		zio_interrupt(zio);
    270 		return (ZIO_PIPELINE_STOP);
    271 	}
    272 
    273 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
    274 	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
    275 	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
    276 	    0, RLIM64_INFINITY, kcred, &resid);
    277 
    278 	if (resid != 0 && zio->io_error == 0)
    279 		zio->io_error = ENOSPC;
    280 
    281 	zio_interrupt(zio);
    282 
    283 	return (ZIO_PIPELINE_STOP);
    284 }
    285 
    286 static int
    287 vdev_file_io_done(zio_t *zio)
    288 {
    289 	vdev_t *vd = zio->io_vd;
    290 
    291 	if (zio_injection_enabled && zio->io_error == 0)
    292 		zio->io_error = zio_handle_device_injection(vd, EIO);
    293 
    294 	/*
    295 	 * If an error has been encountered then attempt to probe the device
    296 	 * to determine if it's still accessible.
    297 	 */
    298 	if (zio->io_error == EIO && vdev_probe(vd) != 0)
    299 		vd->vdev_is_failing = B_TRUE;
    300 
    301 	vdev_queue_io_done(zio);
    302 
    303 #ifndef _KERNEL
    304 	if (zio->io_type == ZIO_TYPE_WRITE)
    305 		vdev_cache_write(zio);
    306 #endif
    307 
    308 	return (ZIO_PIPELINE_CONTINUE);
    309 }
    310 
/*
 * Operations vector for file-backed vdevs.  The initializer is positional,
 * so the entries must stay in the member order of vdev_ops_t.
 */
vdev_ops_t vdev_file_ops = {
	vdev_file_open,		/* open the backing file, report size/ashift */
	vdev_file_close,	/* release the vnode and per-vdev state */
	vdev_file_probe,	/* check that the file is still accessible */
	vdev_default_asize,
	vdev_file_io_start,	/* issue an ioctl, read or write */
	vdev_file_io_done,	/* post-process a completed I/O */
	NULL,			/* NOTE(review): unused slot; confirm member */
	VDEV_TYPE_FILE,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
    322 
    323 /*
    324  * From userland we access disks just like files.
    325  */
    326 #ifndef _KERNEL
    327 
/*
 * Operations vector for disk vdevs in userland builds.  It reuses every
 * file vdev routine and differs from vdev_file_ops only in the type name.
 * The initializer is positional, so the entries must stay in the member
 * order of vdev_ops_t.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_file_open,		/* open the backing file, report size/ashift */
	vdev_file_close,	/* release the vnode and per-vdev state */
	vdev_file_probe,	/* check that the file is still accessible */
	vdev_default_asize,
	vdev_file_io_start,	/* issue an ioctl, read or write */
	vdev_file_io_done,	/* post-process a completed I/O */
	NULL,			/* NOTE(review): unused slot; confirm member */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
    339 
    340 #endif
    341