1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #pragma ident "@(#)zfs_vnops.c 1.66 08/01/04 SMI" 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/time.h> 33 #include <sys/systm.h> 34 #include <sys/sysmacros.h> 35 #include <sys/resource.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/file.h> 40 #include <sys/stat.h> 41 #include <sys/kmem.h> 42 #include <sys/taskq.h> 43 #include <sys/uio.h> 44 #include <sys/vmsystm.h> 45 #include <sys/atomic.h> 46 #include <sys/vm.h> 47 #include <vm/seg_vn.h> 48 #include <vm/pvn.h> 49 #include <vm/as.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/cmn_err.h> 53 #include <sys/errno.h> 54 #include <sys/unistd.h> 55 #include <sys/zfs_dir.h> 56 #include <sys/zfs_acl.h> 57 #include <sys/zfs_ioctl.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/dmu.h> 60 #include <sys/spa.h> 61 #include <sys/txg.h> 62 #include <sys/dbuf.h> 63 #include <sys/zap.h> 64 #include <sys/dirent.h> 65 #include <sys/policy.h> 66 #include <sys/sunddi.h> 67 #include <sys/filio.h> 68 #include "fs/fs_subr.h" 69 #include <sys/zfs_ctldir.h> 70 #include <sys/zfs_fuid.h> 71 #include <sys/dnlc.h> 72 #include <sys/zfs_rlock.h> 73 #include <sys/extdirent.h> 74 #include <sys/kidmap.h> 75 #include <sys/cred_impl.h> 76 #include <sys/attr.h> 77 78 /* 79 * Programming rules. 80 * 81 * Each vnode op performs some logical unit of work. To do this, the ZPL must 82 * properly lock its in-core state, create a DMU transaction, do the work, 83 * record this work in the intent log (ZIL), commit the DMU transaction, 84 * and wait for the intent log to commit if it is a synchronous operation. 85 * Moreover, the vnode ops must work in both normal and log replay context. 86 * The ordering of events is important to avoid deadlocks and references 87 * to freed memory. The example below illustrates the following Big Rules: 88 * 89 * (1) A check must be made in each zfs thread for a mounted file system. 90 * This is done avoiding races using ZFS_ENTER(zfsvfs). 91 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 92 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 93 * can return EIO from the calling function. 94 * 95 * (2) VN_RELE() should always be the last thing except for zil_commit() 96 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 97 * First, if it's the last reference, the vnode/znode 98 * can be freed, so the zp may point to freed memory. Second, the last 99 * reference will call zfs_zinactive(), which may induce a lot of work -- 100 * pushing cached pages (which acquires range locks) and syncing out 101 * cached atime changes. Third, zfs_zinactive() may require a new tx, 102 * which could deadlock the system if you were already holding one. 103 * 104 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 105 * as they can span dmu_tx_assign() calls. 106 * 107 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 108 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 109 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 110 * This is critical because we don't want to block while holding locks. 111 * Note, in particular, that if a lock is sometimes acquired before 112 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 113 * use a non-blocking assign can deadlock the system. The scenario: 114 * 115 * Thread A has grabbed a lock before calling dmu_tx_assign(). 116 * Thread B is in an already-assigned tx, and blocks for this lock. 117 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 118 * forever, because the previous txg can't quiesce until B's tx commits. 119 * 120 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 121 * then drop all locks, call dmu_tx_wait(), and try again. 122 * 123 * (5) If the operation succeeded, generate the intent log entry for it 124 * before dropping locks. This ensures that the ordering of events 125 * in the intent log matches the order in which they actually occurred. 126 * 127 * (6) At the end of each vnode op, the DMU tx must always commit, 128 * regardless of whether there were any errors. 129 * 130 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 131 * to ensure that synchronous semantics are provided when necessary. 132 * 133 * In general, this is how things should be ordered in each vnode op: 134 * 135 * ZFS_ENTER(zfsvfs); // exit if unmounted 136 * top: 137 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 138 * rw_enter(...); // grab any other locks you need 139 * tx = dmu_tx_create(...); // get DMU tx 140 * dmu_tx_hold_*(); // hold each object you might modify 141 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 142 * if (error) { 143 * rw_exit(...); // drop locks 144 * zfs_dirent_unlock(dl); // unlock directory entry 145 * VN_RELE(...); // release held vnodes 146 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 147 * dmu_tx_wait(tx); 148 * dmu_tx_abort(tx); 149 * goto top; 150 * } 151 * dmu_tx_abort(tx); // abort DMU tx 152 * ZFS_EXIT(zfsvfs); // finished in zfs 153 * return (error); // really out of space 154 * } 155 * error = do_real_work(); // do whatever this VOP does 156 * if (error == 0) 157 * zfs_log_*(...); // on success, make ZIL entry 158 * dmu_tx_commit(tx); // commit DMU tx -- error or not 159 * rw_exit(...); // drop locks 160 * zfs_dirent_unlock(dl); // unlock directory entry 161 * VN_RELE(...); // release held vnodes 162 * zil_commit(zilog, seq, foid); // synchronous when necessary 163 * ZFS_EXIT(zfsvfs); // finished in zfs 164 * return (error); // done, report error 165 */ 166 167 /* ARGSUSED */ 168 static int 169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 170 { 171 znode_t *zp = VTOZ(*vpp); 172 173 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 174 ((flag & FAPPEND) == 0)) { 175 return (EPERM); 176 } 177 178 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 179 ZTOV(zp)->v_type == VREG && 180 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 181 zp->z_phys->zp_size > 0) 182 if (fs_vscan(*vpp, cr, 0) != 0) 183 return (EACCES); 184 185 /* Keep a count of the synchronous opens in the znode */ 186 if (flag & (FSYNC | FDSYNC)) 187 atomic_inc_32(&zp->z_sync_cnt); 188 189 return (0); 190 } 191 192 /* ARGSUSED */ 193 static int 194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 195 caller_context_t *ct) 196 { 197 znode_t *zp = VTOZ(vp); 198 199 /* Decrement the synchronous opens in the znode */ 200 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 201 atomic_dec_32(&zp->z_sync_cnt); 202 203 /* 204 * Clean up any locks held by this process on the vp. 205 */ 206 cleanlocks(vp, ddi_get_pid(), 0); 207 cleanshares(vp, ddi_get_pid()); 208 209 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 210 ZTOV(zp)->v_type == VREG && 211 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 212 zp->z_phys->zp_size > 0) 213 VERIFY(fs_vscan(vp, cr, 1) == 0); 214 215 return (0); 216 } 217 218 /* 219 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 220 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 221 */ 222 static int 223 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 224 { 225 znode_t *zp = VTOZ(vp); 226 uint64_t noff = (uint64_t)*off; /* new offset */ 227 uint64_t file_sz; 228 int error; 229 boolean_t hole; 230 231 file_sz = zp->z_phys->zp_size; 232 if (noff >= file_sz) { 233 return (ENXIO); 234 } 235 236 if (cmd == _FIO_SEEK_HOLE) 237 hole = B_TRUE; 238 else 239 hole = B_FALSE; 240 241 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 242 243 /* end of file? */ 244 if ((error == ESRCH) || (noff > file_sz)) { 245 /* 246 * Handle the virtual hole at the end of file. 247 */ 248 if (hole) { 249 *off = file_sz; 250 return (0); 251 } 252 return (ENXIO); 253 } 254 255 if (noff < *off) 256 return (error); 257 *off = noff; 258 return (error); 259 } 260 261 /* ARGSUSED */ 262 static int 263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 264 int *rvalp, caller_context_t *ct) 265 { 266 offset_t off; 267 int error; 268 zfsvfs_t *zfsvfs; 269 znode_t *zp; 270 271 switch (com) { 272 case _FIOFFS: 273 return (zfs_sync(vp->v_vfsp, 0, cred)); 274 275 /* 276 * The following two ioctls are used by bfu. Faking out, 277 * necessary to avoid bfu errors. 278 */ 279 case _FIOGDIO: 280 case _FIOSDIO: 281 return (0); 282 283 case _FIO_SEEK_DATA: 284 case _FIO_SEEK_HOLE: 285 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 286 return (EFAULT); 287 288 zp = VTOZ(vp); 289 zfsvfs = zp->z_zfsvfs; 290 ZFS_ENTER(zfsvfs); 291 ZFS_VERIFY_ZP(zp); 292 293 /* offset parameter is in/out */ 294 error = zfs_holey(vp, com, &off); 295 ZFS_EXIT(zfsvfs); 296 if (error) 297 return (error); 298 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 299 return (EFAULT); 300 return (0); 301 } 302 return (ENOTTY); 303 } 304 305 /* 306 * When a file is memory mapped, we must keep the IO data synchronized 307 * between the DMU cache and the memory mapped pages. What this means: 308 * 309 * On Write: If we find a memory mapped page, we write to *both* 310 * the page and the dmu buffer. 311 * 312 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 313 * the file is memory mapped. 314 */ 315 static int 316 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 317 { 318 znode_t *zp = VTOZ(vp); 319 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 320 int64_t start, off; 321 int len = nbytes; 322 int error = 0; 323 324 start = uio->uio_loffset; 325 off = start & PAGEOFFSET; 326 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 327 page_t *pp; 328 uint64_t bytes = MIN(PAGESIZE - off, len); 329 uint64_t woff = uio->uio_loffset; 330 331 /* 332 * We don't want a new page to "appear" in the middle of 333 * the file update (because it may not get the write 334 * update data), so we grab a lock to block 335 * zfs_getpage(). 336 */ 337 rw_enter(&zp->z_map_lock, RW_WRITER); 338 if (pp = page_lookup(vp, start, SE_SHARED)) { 339 caddr_t va; 340 341 rw_exit(&zp->z_map_lock); 342 va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); 343 error = uiomove(va+off, bytes, UIO_WRITE, uio); 344 if (error == 0) { 345 dmu_write(zfsvfs->z_os, zp->z_id, 346 woff, bytes, va+off, tx); 347 } 348 ppmapout(va); 349 page_unlock(pp); 350 } else { 351 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 352 uio, bytes, tx); 353 rw_exit(&zp->z_map_lock); 354 } 355 len -= bytes; 356 off = 0; 357 if (error) 358 break; 359 } 360 return (error); 361 } 362 363 /* 364 * When a file is memory mapped, we must keep the IO data synchronized 365 * between the DMU cache and the memory mapped pages. What this means: 366 * 367 * On Read: We "read" preferentially from memory mapped pages, 368 * else we default from the dmu buffer. 369 * 370 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 371 * the file is memory mapped. 372 */ 373 static int 374 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 375 { 376 znode_t *zp = VTOZ(vp); 377 objset_t *os = zp->z_zfsvfs->z_os; 378 int64_t start, off; 379 int len = nbytes; 380 int error = 0; 381 382 start = uio->uio_loffset; 383 off = start & PAGEOFFSET; 384 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 385 page_t *pp; 386 uint64_t bytes = MIN(PAGESIZE - off, len); 387 388 if (pp = page_lookup(vp, start, SE_SHARED)) { 389 caddr_t va; 390 391 va = ppmapin(pp, PROT_READ, (caddr_t)-1L); 392 error = uiomove(va + off, bytes, UIO_READ, uio); 393 ppmapout(va); 394 page_unlock(pp); 395 } else { 396 error = dmu_read_uio(os, zp->z_id, uio, bytes); 397 } 398 len -= bytes; 399 off = 0; 400 if (error) 401 break; 402 } 403 return (error); 404 } 405 406 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 407 408 /* 409 * Read bytes from specified file into supplied buffer. 410 * 411 * IN: vp - vnode of file to be read from. 412 * uio - structure supplying read location, range info, 413 * and return buffer. 414 * ioflag - SYNC flags; used to provide FRSYNC semantics. 415 * cr - credentials of caller. 416 * ct - caller context 417 * 418 * OUT: uio - updated offset and range, buffer filled. 419 * 420 * RETURN: 0 if success 421 * error code if failure 422 * 423 * Side Effects: 424 * vp - atime updated if byte count > 0 425 */ 426 /* ARGSUSED */ 427 static int 428 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 429 { 430 znode_t *zp = VTOZ(vp); 431 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 432 objset_t *os; 433 ssize_t n, nbytes; 434 int error; 435 rl_t *rl; 436 437 ZFS_ENTER(zfsvfs); 438 ZFS_VERIFY_ZP(zp); 439 os = zfsvfs->z_os; 440 441 /* 442 * Validate file offset 443 */ 444 if (uio->uio_loffset < (offset_t)0) { 445 ZFS_EXIT(zfsvfs); 446 return (EINVAL); 447 } 448 449 /* 450 * Fasttrack empty reads 451 */ 452 if (uio->uio_resid == 0) { 453 ZFS_EXIT(zfsvfs); 454 return (0); 455 } 456 457 /* 458 * Check for mandatory locks 459 */ 460 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 461 if (error = chklock(vp, FREAD, 462 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 463 ZFS_EXIT(zfsvfs); 464 return (error); 465 } 466 } 467 468 /* 469 * If we're in FRSYNC mode, sync out this znode before reading it. 470 */ 471 if (ioflag & FRSYNC) 472 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 473 474 /* 475 * Lock the range against changes. 476 */ 477 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 478 479 /* 480 * If we are reading past end-of-file we can skip 481 * to the end; but we might still need to set atime. 482 */ 483 if (uio->uio_loffset >= zp->z_phys->zp_size) { 484 error = 0; 485 goto out; 486 } 487 488 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 489 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 490 491 while (n > 0) { 492 nbytes = MIN(n, zfs_read_chunk_size - 493 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 494 495 if (vn_has_cached_data(vp)) 496 error = mappedread(vp, nbytes, uio); 497 else 498 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 499 if (error) 500 break; 501 502 n -= nbytes; 503 } 504 505 out: 506 zfs_range_unlock(rl); 507 508 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 509 ZFS_EXIT(zfsvfs); 510 return (error); 511 } 512 513 /* 514 * Fault in the pages of the first n bytes specified by the uio structure. 515 * 1 byte in each page is touched and the uio struct is unmodified. 516 * Any error will exit this routine as this is only a best 517 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 518 */ 519 static void 520 zfs_prefault_write(ssize_t n, struct uio *uio) 521 { 522 struct iovec *iov; 523 ulong_t cnt, incr; 524 caddr_t p; 525 uint8_t tmp; 526 527 iov = uio->uio_iov; 528 529 while (n) { 530 cnt = MIN(iov->iov_len, n); 531 if (cnt == 0) { 532 /* empty iov entry */ 533 iov++; 534 continue; 535 } 536 n -= cnt; 537 /* 538 * touch each page in this segment. 539 */ 540 p = iov->iov_base; 541 while (cnt) { 542 switch (uio->uio_segflg) { 543 case UIO_USERSPACE: 544 case UIO_USERISPACE: 545 if (fuword8(p, &tmp)) 546 return; 547 break; 548 case UIO_SYSSPACE: 549 if (kcopy(p, &tmp, 1)) 550 return; 551 break; 552 } 553 incr = MIN(cnt, PAGESIZE); 554 p += incr; 555 cnt -= incr; 556 } 557 /* 558 * touch the last byte in case it straddles a page. 559 */ 560 p--; 561 switch (uio->uio_segflg) { 562 case UIO_USERSPACE: 563 case UIO_USERISPACE: 564 if (fuword8(p, &tmp)) 565 return; 566 break; 567 case UIO_SYSSPACE: 568 if (kcopy(p, &tmp, 1)) 569 return; 570 break; 571 } 572 iov++; 573 } 574 } 575 576 /* 577 * Write the bytes to a file. 578 * 579 * IN: vp - vnode of file to be written to. 580 * uio - structure supplying write location, range info, 581 * and data buffer. 582 * ioflag - FAPPEND flag set if in append mode. 583 * cr - credentials of caller. 584 * ct - caller context (NFS/CIFS fem monitor only) 585 * 586 * OUT: uio - updated offset and range. 587 * 588 * RETURN: 0 if success 589 * error code if failure 590 * 591 * Timestamps: 592 * vp - ctime|mtime updated if byte count > 0 593 */ 594 /* ARGSUSED */ 595 static int 596 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 597 { 598 znode_t *zp = VTOZ(vp); 599 rlim64_t limit = uio->uio_llimit; 600 ssize_t start_resid = uio->uio_resid; 601 ssize_t tx_bytes; 602 uint64_t end_size; 603 dmu_tx_t *tx; 604 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 605 zilog_t *zilog; 606 offset_t woff; 607 ssize_t n, nbytes; 608 rl_t *rl; 609 int max_blksz = zfsvfs->z_max_blksz; 610 uint64_t pflags = zp->z_phys->zp_flags; 611 int error; 612 613 /* 614 * If immutable or not appending then return EPERM 615 */ 616 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 617 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 618 (uio->uio_loffset < zp->z_phys->zp_size))) 619 return (EPERM); 620 621 /* 622 * Fasttrack empty write 623 */ 624 n = start_resid; 625 if (n == 0) 626 return (0); 627 628 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 629 limit = MAXOFFSET_T; 630 631 ZFS_ENTER(zfsvfs); 632 ZFS_VERIFY_ZP(zp); 633 zilog = zfsvfs->z_log; 634 635 /* 636 * Pre-fault the pages to ensure slow (eg NFS) pages 637 * don't hold up txg. 638 */ 639 zfs_prefault_write(n, uio); 640 641 /* 642 * If in append mode, set the io offset pointer to eof. 643 */ 644 if (ioflag & FAPPEND) { 645 /* 646 * Range lock for a file append: 647 * The value for the start of range will be determined by 648 * zfs_range_lock() (to guarantee append semantics). 649 * If this write will cause the block size to increase, 650 * zfs_range_lock() will lock the entire file, so we must 651 * later reduce the range after we grow the block size. 652 */ 653 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 654 if (rl->r_len == UINT64_MAX) { 655 /* overlocked, zp_size can't change */ 656 woff = uio->uio_loffset = zp->z_phys->zp_size; 657 } else { 658 woff = uio->uio_loffset = rl->r_off; 659 } 660 } else { 661 woff = uio->uio_loffset; 662 /* 663 * Validate file offset 664 */ 665 if (woff < 0) { 666 ZFS_EXIT(zfsvfs); 667 return (EINVAL); 668 } 669 670 /* 671 * If we need to grow the block size then zfs_range_lock() 672 * will lock a wider range than we request here. 673 * Later after growing the block size we reduce the range. 674 */ 675 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 676 } 677 678 if (woff >= limit) { 679 zfs_range_unlock(rl); 680 ZFS_EXIT(zfsvfs); 681 return (EFBIG); 682 } 683 684 if ((woff + n) > limit || woff > (limit - n)) 685 n = limit - woff; 686 687 /* 688 * Check for mandatory locks 689 */ 690 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 691 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 692 zfs_range_unlock(rl); 693 ZFS_EXIT(zfsvfs); 694 return (error); 695 } 696 end_size = MAX(zp->z_phys->zp_size, woff + n); 697 698 /* 699 * Write the file in reasonable size chunks. Each chunk is written 700 * in a separate transaction; this keeps the intent log records small 701 * and allows us to do more fine-grained space accounting. 702 */ 703 while (n > 0) { 704 /* 705 * Start a transaction. 706 */ 707 woff = uio->uio_loffset; 708 tx = dmu_tx_create(zfsvfs->z_os); 709 dmu_tx_hold_bonus(tx, zp->z_id); 710 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 711 error = dmu_tx_assign(tx, zfsvfs->z_assign); 712 if (error) { 713 if (error == ERESTART && 714 zfsvfs->z_assign == TXG_NOWAIT) { 715 dmu_tx_wait(tx); 716 dmu_tx_abort(tx); 717 continue; 718 } 719 dmu_tx_abort(tx); 720 break; 721 } 722 723 /* 724 * If zfs_range_lock() over-locked we grow the blocksize 725 * and then reduce the lock range. This will only happen 726 * on the first iteration since zfs_range_reduce() will 727 * shrink down r_len to the appropriate size. 728 */ 729 if (rl->r_len == UINT64_MAX) { 730 uint64_t new_blksz; 731 732 if (zp->z_blksz > max_blksz) { 733 ASSERT(!ISP2(zp->z_blksz)); 734 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 735 } else { 736 new_blksz = MIN(end_size, max_blksz); 737 } 738 zfs_grow_blocksize(zp, new_blksz, tx); 739 zfs_range_reduce(rl, woff, n); 740 } 741 742 /* 743 * XXX - should we really limit each write to z_max_blksz? 744 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 745 */ 746 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 747 rw_enter(&zp->z_map_lock, RW_READER); 748 749 tx_bytes = uio->uio_resid; 750 if (vn_has_cached_data(vp)) { 751 rw_exit(&zp->z_map_lock); 752 error = mappedwrite(vp, nbytes, uio, tx); 753 } else { 754 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 755 uio, nbytes, tx); 756 rw_exit(&zp->z_map_lock); 757 } 758 tx_bytes -= uio->uio_resid; 759 760 /* 761 * If we made no progress, we're done. If we made even 762 * partial progress, update the znode and ZIL accordingly. 763 */ 764 if (tx_bytes == 0) { 765 dmu_tx_commit(tx); 766 ASSERT(error != 0); 767 break; 768 } 769 770 /* 771 * Clear Set-UID/Set-GID bits on successful write if not 772 * privileged and at least one of the excute bits is set. 773 * 774 * It would be nice to to this after all writes have 775 * been done, but that would still expose the ISUID/ISGID 776 * to another app after the partial write is committed. 777 * 778 * Note: we don't call zfs_fuid_map_id() here because 779 * user 0 is not an ephemeral uid. 780 */ 781 mutex_enter(&zp->z_acl_lock); 782 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 783 (S_IXUSR >> 6))) != 0 && 784 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 785 secpolicy_vnode_setid_retain(cr, 786 (zp->z_phys->zp_mode & S_ISUID) != 0 && 787 zp->z_phys->zp_uid == 0) != 0) { 788 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 789 } 790 mutex_exit(&zp->z_acl_lock); 791 792 /* 793 * Update time stamp. NOTE: This marks the bonus buffer as 794 * dirty, so we don't have to do it again for zp_size. 795 */ 796 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 797 798 /* 799 * Update the file size (zp_size) if it has changed; 800 * account for possible concurrent updates. 801 */ 802 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 803 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 804 uio->uio_loffset); 805 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 806 dmu_tx_commit(tx); 807 808 if (error != 0) 809 break; 810 ASSERT(tx_bytes == nbytes); 811 n -= nbytes; 812 } 813 814 zfs_range_unlock(rl); 815 816 /* 817 * If we're in replay mode, or we made no progress, return error. 818 * Otherwise, it's at least a partial write, so it's successful. 819 */ 820 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 821 ZFS_EXIT(zfsvfs); 822 return (error); 823 } 824 825 if (ioflag & (FSYNC | FDSYNC)) 826 zil_commit(zilog, zp->z_last_itx, zp->z_id); 827 828 ZFS_EXIT(zfsvfs); 829 return (0); 830 } 831 832 void 833 zfs_get_done(dmu_buf_t *db, void *vzgd) 834 { 835 zgd_t *zgd = (zgd_t *)vzgd; 836 rl_t *rl = zgd->zgd_rl; 837 vnode_t *vp = ZTOV(rl->r_zp); 838 839 dmu_buf_rele(db, vzgd); 840 zfs_range_unlock(rl); 841 VN_RELE(vp); 842 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 843 kmem_free(zgd, sizeof (zgd_t)); 844 } 845 846 /* 847 * Get data to generate a TX_WRITE intent log record. 848 */ 849 int 850 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 851 { 852 zfsvfs_t *zfsvfs = arg; 853 objset_t *os = zfsvfs->z_os; 854 znode_t *zp; 855 uint64_t off = lr->lr_offset; 856 dmu_buf_t *db; 857 rl_t *rl; 858 zgd_t *zgd; 859 int dlen = lr->lr_length; /* length of user data */ 860 int error = 0; 861 862 ASSERT(zio); 863 ASSERT(dlen != 0); 864 865 /* 866 * Nothing to do if the file has been removed 867 */ 868 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 869 return (ENOENT); 870 if (zp->z_unlinked) { 871 VN_RELE(ZTOV(zp)); 872 return (ENOENT); 873 } 874 875 /* 876 * Write records come in two flavors: immediate and indirect. 877 * For small writes it's cheaper to store the data with the 878 * log record (immediate); for large writes it's cheaper to 879 * sync the data and get a pointer to it (indirect) so that 880 * we don't have to write the data twice. 881 */ 882 if (buf != NULL) { /* immediate write */ 883 rl = zfs_range_lock(zp, off, dlen, RL_READER); 884 /* test for truncation needs to be done while range locked */ 885 if (off >= zp->z_phys->zp_size) { 886 error = ENOENT; 887 goto out; 888 } 889 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 890 } else { /* indirect write */ 891 uint64_t boff; /* block starting offset */ 892 893 /* 894 * Have to lock the whole block to ensure when it's 895 * written out and it's checksum is being calculated 896 * that no one can change the data. We need to re-check 897 * blocksize after we get the lock in case it's changed! 898 */ 899 for (;;) { 900 if (ISP2(zp->z_blksz)) { 901 boff = P2ALIGN_TYPED(off, zp->z_blksz, 902 uint64_t); 903 } else { 904 boff = 0; 905 } 906 dlen = zp->z_blksz; 907 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 908 if (zp->z_blksz == dlen) 909 break; 910 zfs_range_unlock(rl); 911 } 912 /* test for truncation needs to be done while range locked */ 913 if (off >= zp->z_phys->zp_size) { 914 error = ENOENT; 915 goto out; 916 } 917 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 918 zgd->zgd_rl = rl; 919 zgd->zgd_zilog = zfsvfs->z_log; 920 zgd->zgd_bp = &lr->lr_blkptr; 921 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 922 ASSERT(boff == db->db_offset); 923 lr->lr_blkoff = off - boff; 924 error = dmu_sync(zio, db, &lr->lr_blkptr, 925 lr->lr_common.lrc_txg, zfs_get_done, zgd); 926 ASSERT((error && error != EINPROGRESS) || 927 lr->lr_length <= zp->z_blksz); 928 if (error == 0) 929 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 930 /* 931 * If we get EINPROGRESS, then we need to wait for a 932 * write IO initiated by dmu_sync() to complete before 933 * we can release this dbuf. We will finish everything 934 * up in the zfs_get_done() callback. 935 */ 936 if (error == EINPROGRESS) 937 return (0); 938 dmu_buf_rele(db, zgd); 939 kmem_free(zgd, sizeof (zgd_t)); 940 } 941 out: 942 zfs_range_unlock(rl); 943 VN_RELE(ZTOV(zp)); 944 return (error); 945 } 946 947 /*ARGSUSED*/ 948 static int 949 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 950 caller_context_t *ct) 951 { 952 znode_t *zp = VTOZ(vp); 953 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 954 int error; 955 956 ZFS_ENTER(zfsvfs); 957 ZFS_VERIFY_ZP(zp); 958 959 if (flag & V_ACE_MASK) 960 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 961 else 962 error = zfs_zaccess_rwx(zp, mode, flag, cr); 963 964 ZFS_EXIT(zfsvfs); 965 return (error); 966 } 967 968 /* 969 * Lookup an entry in a directory, or an extended attribute directory. 970 * If it exists, return a held vnode reference for it. 971 * 972 * IN: dvp - vnode of directory to search. 973 * nm - name of entry to lookup. 974 * pnp - full pathname to lookup [UNUSED]. 975 * flags - LOOKUP_XATTR set if looking for an attribute. 976 * rdir - root directory vnode [UNUSED]. 977 * cr - credentials of caller. 978 * ct - caller context 979 * direntflags - directory lookup flags 980 * realpnp - returned pathname. 981 * 982 * OUT: vpp - vnode of located entry, NULL if not found. 983 * 984 * RETURN: 0 if success 985 * error code if failure 986 * 987 * Timestamps: 988 * NA 989 */ 990 /* ARGSUSED */ 991 static int 992 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 993 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 994 int *direntflags, pathname_t *realpnp) 995 { 996 znode_t *zdp = VTOZ(dvp); 997 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 998 int error; 999 1000 ZFS_ENTER(zfsvfs); 1001 ZFS_VERIFY_ZP(zdp); 1002 1003 *vpp = NULL; 1004 1005 if (flags & LOOKUP_XATTR) { 1006 /* 1007 * If the xattr property is off, refuse the lookup request. 1008 */ 1009 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1010 ZFS_EXIT(zfsvfs); 1011 return (EINVAL); 1012 } 1013 1014 /* 1015 * We don't allow recursive attributes.. 1016 * Maybe someday we will. 1017 */ 1018 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1019 ZFS_EXIT(zfsvfs); 1020 return (EINVAL); 1021 } 1022 1023 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1024 ZFS_EXIT(zfsvfs); 1025 return (error); 1026 } 1027 1028 /* 1029 * Do we have permission to get into attribute directory? 1030 */ 1031 1032 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1033 B_FALSE, cr)) { 1034 VN_RELE(*vpp); 1035 *vpp = NULL; 1036 } 1037 1038 ZFS_EXIT(zfsvfs); 1039 return (error); 1040 } 1041 1042 if (dvp->v_type != VDIR) { 1043 ZFS_EXIT(zfsvfs); 1044 return (ENOTDIR); 1045 } 1046 1047 /* 1048 * Check accessibility of directory. 1049 */ 1050 1051 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1052 ZFS_EXIT(zfsvfs); 1053 return (error); 1054 } 1055 1056 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1057 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1058 ZFS_EXIT(zfsvfs); 1059 return (EILSEQ); 1060 } 1061 1062 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1063 if (error == 0) { 1064 /* 1065 * Convert device special files 1066 */ 1067 if (IS_DEVVP(*vpp)) { 1068 vnode_t *svp; 1069 1070 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1071 VN_RELE(*vpp); 1072 if (svp == NULL) 1073 error = ENOSYS; 1074 else 1075 *vpp = svp; 1076 } 1077 } 1078 1079 ZFS_EXIT(zfsvfs); 1080 return (error); 1081 } 1082 1083 /* 1084 * Attempt to create a new entry in a directory. If the entry 1085 * already exists, truncate the file if permissible, else return 1086 * an error. Return the vp of the created or trunc'd file. 1087 * 1088 * IN: dvp - vnode of directory to put new file entry in. 1089 * name - name of new file entry. 1090 * vap - attributes of new file. 1091 * excl - flag indicating exclusive or non-exclusive mode. 1092 * mode - mode to open file with. 1093 * cr - credentials of caller. 1094 * flag - large file flag [UNUSED]. 1095 * ct - caller context 1096 * vsecp - ACL to be set 1097 * 1098 * OUT: vpp - vnode of created or trunc'd entry. 1099 * 1100 * RETURN: 0 if success 1101 * error code if failure 1102 * 1103 * Timestamps: 1104 * dvp - ctime|mtime updated if new entry created 1105 * vp - ctime|mtime always, atime if new 1106 */ 1107 1108 /* ARGSUSED */ 1109 static int 1110 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1111 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 1112 vsecattr_t *vsecp) 1113 { 1114 znode_t *zp, *dzp = VTOZ(dvp); 1115 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1116 zilog_t *zilog; 1117 objset_t *os; 1118 zfs_dirlock_t *dl; 1119 dmu_tx_t *tx; 1120 int error; 1121 zfs_acl_t *aclp = NULL; 1122 zfs_fuid_info_t *fuidp = NULL; 1123 1124 /* 1125 * If we have an ephemeral id, ACL, or XVATTR then 1126 * make sure file system is at proper version 1127 */ 1128 1129 if (zfsvfs->z_use_fuids == B_FALSE && 1130 (vsecp || (vap->va_mask & AT_XVATTR) || 1131 IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) 1132 return (EINVAL); 1133 1134 ZFS_ENTER(z