1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/systm.h> 32 #include <sys/sysmacros.h> 33 #include <sys/resource.h> 34 #include <sys/vfs.h> 35 #include <sys/vfs_opreg.h> 36 #include <sys/vnode.h> 37 #include <sys/file.h> 38 #include <sys/stat.h> 39 #include <sys/kmem.h> 40 #include <sys/taskq.h> 41 #include <sys/uio.h> 42 #include <sys/vmsystm.h> 43 #include <sys/atomic.h> 44 #include <sys/vm.h> 45 #include <vm/seg_vn.h> 46 #include <vm/pvn.h> 47 #include <vm/as.h> 48 #include <vm/kpm.h> 49 #include <vm/seg_kpm.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/cmn_err.h> 53 #include <sys/errno.h> 54 #include <sys/unistd.h> 55 #include <sys/zfs_dir.h> 56 #include <sys/zfs_acl.h> 57 #include <sys/zfs_ioctl.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/dmu.h> 60 #include <sys/spa.h> 61 #include <sys/txg.h> 62 #include <sys/dbuf.h> 63 #include <sys/zap.h> 64 #include <sys/dirent.h> 65 #include <sys/policy.h> 66 #include <sys/sunddi.h> 67 #include <sys/filio.h> 68 #include "fs/fs_subr.h" 69 #include <sys/zfs_ctldir.h> 70 #include <sys/zfs_fuid.h> 71 #include <sys/dnlc.h> 72 #include <sys/zfs_rlock.h> 73 #include <sys/extdirent.h> 74 #include <sys/kidmap.h> 75 #include <sys/cred_impl.h> 76 #include <sys/attr.h> 77 78 /* 79 * Programming rules. 80 * 81 * Each vnode op performs some logical unit of work. To do this, the ZPL must 82 * properly lock its in-core state, create a DMU transaction, do the work, 83 * record this work in the intent log (ZIL), commit the DMU transaction, 84 * and wait for the intent log to commit if it is a synchronous operation. 85 * Moreover, the vnode ops must work in both normal and log replay context. 86 * The ordering of events is important to avoid deadlocks and references 87 * to freed memory. The example below illustrates the following Big Rules: 88 * 89 * (1) A check must be made in each zfs thread for a mounted file system. 90 * This is done avoiding races using ZFS_ENTER(zfsvfs). 91 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 92 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 93 * can return EIO from the calling function. 94 * 95 * (2) VN_RELE() should always be the last thing except for zil_commit() 96 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 97 * First, if it's the last reference, the vnode/znode 98 * can be freed, so the zp may point to freed memory. Second, the last 99 * reference will call zfs_zinactive(), which may induce a lot of work -- 100 * pushing cached pages (which acquires range locks) and syncing out 101 * cached atime changes. Third, zfs_zinactive() may require a new tx, 102 * which could deadlock the system if you were already holding one. 103 * 104 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 105 * as they can span dmu_tx_assign() calls. 106 * 107 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 108 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 109 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 110 * This is critical because we don't want to block while holding locks. 111 * Note, in particular, that if a lock is sometimes acquired before 112 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 113 * use a non-blocking assign can deadlock the system. The scenario: 114 * 115 * Thread A has grabbed a lock before calling dmu_tx_assign(). 116 * Thread B is in an already-assigned tx, and blocks for this lock. 117 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 118 * forever, because the previous txg can't quiesce until B's tx commits. 119 * 120 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 121 * then drop all locks, call dmu_tx_wait(), and try again. 122 * 123 * (5) If the operation succeeded, generate the intent log entry for it 124 * before dropping locks. This ensures that the ordering of events 125 * in the intent log matches the order in which they actually occurred. 126 * 127 * (6) At the end of each vnode op, the DMU tx must always commit, 128 * regardless of whether there were any errors. 129 * 130 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 131 * to ensure that synchronous semantics are provided when necessary. 132 * 133 * In general, this is how things should be ordered in each vnode op: 134 * 135 * ZFS_ENTER(zfsvfs); // exit if unmounted 136 * top: 137 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 138 * rw_enter(...); // grab any other locks you need 139 * tx = dmu_tx_create(...); // get DMU tx 140 * dmu_tx_hold_*(); // hold each object you might modify 141 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 142 * if (error) { 143 * rw_exit(...); // drop locks 144 * zfs_dirent_unlock(dl); // unlock directory entry 145 * VN_RELE(...); // release held vnodes 146 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 147 * dmu_tx_wait(tx); 148 * dmu_tx_abort(tx); 149 * goto top; 150 * } 151 * dmu_tx_abort(tx); // abort DMU tx 152 * ZFS_EXIT(zfsvfs); // finished in zfs 153 * return (error); // really out of space 154 * } 155 * error = do_real_work(); // do whatever this VOP does 156 * if (error == 0) 157 * zfs_log_*(...); // on success, make ZIL entry 158 * dmu_tx_commit(tx); // commit DMU tx -- error or not 159 * rw_exit(...); // drop locks 160 * zfs_dirent_unlock(dl); // unlock directory entry 161 * VN_RELE(...); // release held vnodes 162 * zil_commit(zilog, seq, foid); // synchronous when necessary 163 * ZFS_EXIT(zfsvfs); // finished in zfs 164 * return (error); // done, report error 165 */ 166 167 /* ARGSUSED */ 168 static int 169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 170 { 171 znode_t *zp = VTOZ(*vpp); 172 173 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 174 ((flag & FAPPEND) == 0)) { 175 return (EPERM); 176 } 177 178 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 179 ZTOV(zp)->v_type == VREG && 180 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 181 zp->z_phys->zp_size > 0) 182 if (fs_vscan(*vpp, cr, 0) != 0) 183 return (EACCES); 184 185 /* Keep a count of the synchronous opens in the znode */ 186 if (flag & (FSYNC | FDSYNC)) 187 atomic_inc_32(&zp->z_sync_cnt); 188 189 return (0); 190 } 191 192 /* ARGSUSED */ 193 static int 194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 195 caller_context_t *ct) 196 { 197 znode_t *zp = VTOZ(vp); 198 199 /* Decrement the synchronous opens in the znode */ 200 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 201 atomic_dec_32(&zp->z_sync_cnt); 202 203 /* 204 * Clean up any locks held by this process on the vp. 205 */ 206 cleanlocks(vp, ddi_get_pid(), 0); 207 cleanshares(vp, ddi_get_pid()); 208 209 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 210 ZTOV(zp)->v_type == VREG && 211 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 212 zp->z_phys->zp_size > 0) 213 VERIFY(fs_vscan(vp, cr, 1) == 0); 214 215 return (0); 216 } 217 218 /* 219 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 220 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 221 */ 222 static int 223 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 224 { 225 znode_t *zp = VTOZ(vp); 226 uint64_t noff = (uint64_t)*off; /* new offset */ 227 uint64_t file_sz; 228 int error; 229 boolean_t hole; 230 231 file_sz = zp->z_phys->zp_size; 232 if (noff >= file_sz) { 233 return (ENXIO); 234 } 235 236 if (cmd == _FIO_SEEK_HOLE) 237 hole = B_TRUE; 238 else 239 hole = B_FALSE; 240 241 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 242 243 /* end of file? */ 244 if ((error == ESRCH) || (noff > file_sz)) { 245 /* 246 * Handle the virtual hole at the end of file. 247 */ 248 if (hole) { 249 *off = file_sz; 250 return (0); 251 } 252 return (ENXIO); 253 } 254 255 if (noff < *off) 256 return (error); 257 *off = noff; 258 return (error); 259 } 260 261 /* ARGSUSED */ 262 static int 263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 264 int *rvalp, caller_context_t *ct) 265 { 266 offset_t off; 267 int error; 268 zfsvfs_t *zfsvfs; 269 znode_t *zp; 270 271 switch (com) { 272 case _FIOFFS: 273 return (zfs_sync(vp->v_vfsp, 0, cred)); 274 275 /* 276 * The following two ioctls are used by bfu. Faking out, 277 * necessary to avoid bfu errors. 278 */ 279 case _FIOGDIO: 280 case _FIOSDIO: 281 return (0); 282 283 case _FIO_SEEK_DATA: 284 case _FIO_SEEK_HOLE: 285 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 286 return (EFAULT); 287 288 zp = VTOZ(vp); 289 zfsvfs = zp->z_zfsvfs; 290 ZFS_ENTER(zfsvfs); 291 ZFS_VERIFY_ZP(zp); 292 293 /* offset parameter is in/out */ 294 error = zfs_holey(vp, com, &off); 295 ZFS_EXIT(zfsvfs); 296 if (error) 297 return (error); 298 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 299 return (EFAULT); 300 return (0); 301 } 302 return (ENOTTY); 303 } 304 305 /* 306 * Utility functions to map and unmap a single physical page. These 307 * are used to manage the mappable copies of ZFS file data, and therefore 308 * do not update ref/mod bits. 309 */ 310 caddr_t 311 zfs_map_page(page_t *pp, enum seg_rw rw) 312 { 313 if (kpm_enable) 314 return (hat_kpm_mapin(pp, 0)); 315 ASSERT(rw == S_READ || rw == S_WRITE); 316 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), 317 (caddr_t)-1)); 318 } 319 320 void 321 zfs_unmap_page(page_t *pp, caddr_t addr) 322 { 323 if (kpm_enable) { 324 hat_kpm_mapout(pp, 0, addr); 325 } else { 326 ppmapout(addr); 327 } 328 } 329 330 /* 331 * When a file is memory mapped, we must keep the IO data synchronized 332 * between the DMU cache and the memory mapped pages. What this means: 333 * 334 * On Write: If we find a memory mapped page, we write to *both* 335 * the page and the dmu buffer. 336 * 337 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 338 * the file is memory mapped. 339 */ 340 static int 341 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 342 { 343 znode_t *zp = VTOZ(vp); 344 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 345 int64_t start, off; 346 int len = nbytes; 347 int error = 0; 348 349 start = uio->uio_loffset; 350 off = start & PAGEOFFSET; 351 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 352 page_t *pp; 353 uint64_t bytes = MIN(PAGESIZE - off, len); 354 uint64_t woff = uio->uio_loffset; 355 356 /* 357 * We don't want a new page to "appear" in the middle of 358 * the file update (because it may not get the write 359 * update data), so we grab a lock to block 360 * zfs_getpage(). 361 */ 362 rw_enter(&zp->z_map_lock, RW_WRITER); 363 if (pp = page_lookup(vp, start, SE_SHARED)) { 364 caddr_t va; 365 366 rw_exit(&zp->z_map_lock); 367 va = zfs_map_page(pp, S_WRITE); 368 error = uiomove(va+off, bytes, UIO_WRITE, uio); 369 if (error == 0) { 370 dmu_write(zfsvfs->z_os, zp->z_id, 371 woff, bytes, va+off, tx); 372 } 373 zfs_unmap_page(pp, va); 374 page_unlock(pp); 375 } else { 376 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 377 uio, bytes, tx); 378 rw_exit(&zp->z_map_lock); 379 } 380 len -= bytes; 381 off = 0; 382 if (error) 383 break; 384 } 385 return (error); 386 } 387 388 /* 389 * When a file is memory mapped, we must keep the IO data synchronized 390 * between the DMU cache and the memory mapped pages. What this means: 391 * 392 * On Read: We "read" preferentially from memory mapped pages, 393 * else we default from the dmu buffer. 394 * 395 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 396 * the file is memory mapped. 397 */ 398 static int 399 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 400 { 401 znode_t *zp = VTOZ(vp); 402 objset_t *os = zp->z_zfsvfs->z_os; 403 int64_t start, off; 404 int len = nbytes; 405 int error = 0; 406 407 start = uio->uio_loffset; 408 off = start & PAGEOFFSET; 409 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 410 page_t *pp; 411 uint64_t bytes = MIN(PAGESIZE - off, len); 412 413 if (pp = page_lookup(vp, start, SE_SHARED)) { 414 caddr_t va; 415 416 va = zfs_map_page(pp, S_READ); 417 error = uiomove(va + off, bytes, UIO_READ, uio); 418 zfs_unmap_page(pp, va); 419 page_unlock(pp); 420 } else { 421 error = dmu_read_uio(os, zp->z_id, uio, bytes); 422 } 423 len -= bytes; 424 off = 0; 425 if (error) 426 break; 427 } 428 return (error); 429 } 430 431 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 432 433 /* 434 * Read bytes from specified file into supplied buffer. 435 * 436 * IN: vp - vnode of file to be read from. 437 * uio - structure supplying read location, range info, 438 * and return buffer. 439 * ioflag - SYNC flags; used to provide FRSYNC semantics. 440 * cr - credentials of caller. 441 * ct - caller context 442 * 443 * OUT: uio - updated offset and range, buffer filled. 444 * 445 * RETURN: 0 if success 446 * error code if failure 447 * 448 * Side Effects: 449 * vp - atime updated if byte count > 0 450 */ 451 /* ARGSUSED */ 452 static int 453 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 454 { 455 znode_t *zp = VTOZ(vp); 456 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 457 objset_t *os; 458 ssize_t n, nbytes; 459 int error; 460 rl_t *rl; 461 462 ZFS_ENTER(zfsvfs); 463 ZFS_VERIFY_ZP(zp); 464 os = zfsvfs->z_os; 465 466 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 467 ZFS_EXIT(zfsvfs); 468 return (EACCES); 469 } 470 471 /* 472 * Validate file offset 473 */ 474 if (uio->uio_loffset < (offset_t)0) { 475 ZFS_EXIT(zfsvfs); 476 return (EINVAL); 477 } 478 479 /* 480 * Fasttrack empty reads 481 */ 482 if (uio->uio_resid == 0) { 483 ZFS_EXIT(zfsvfs); 484 return (0); 485 } 486 487 /* 488 * Check for mandatory locks 489 */ 490 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 491 if (error = chklock(vp, FREAD, 492 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 493 ZFS_EXIT(zfsvfs); 494 return (error); 495 } 496 } 497 498 /* 499 * If we're in FRSYNC mode, sync out this znode before reading it. 500 */ 501 if (ioflag & FRSYNC) 502 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 503 504 /* 505 * Lock the range against changes. 506 */ 507 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 508 509 /* 510 * If we are reading past end-of-file we can skip 511 * to the end; but we might still need to set atime. 512 */ 513 if (uio->uio_loffset >= zp->z_phys->zp_size) { 514 error = 0; 515 goto out; 516 } 517 518 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 519 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 520 521 while (n > 0) { 522 nbytes = MIN(n, zfs_read_chunk_size - 523 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 524 525 if (vn_has_cached_data(vp)) 526 error = mappedread(vp, nbytes, uio); 527 else 528 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 529 if (error) { 530 /* convert checksum errors into IO errors */ 531 if (error == ECKSUM) 532 error = EIO; 533 break; 534 } 535 536 n -= nbytes; 537 } 538 539 out: 540 zfs_range_unlock(rl); 541 542 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 543 ZFS_EXIT(zfsvfs); 544 return (error); 545 } 546 547 /* 548 * Fault in the pages of the first n bytes specified by the uio structure. 549 * 1 byte in each page is touched and the uio struct is unmodified. 550 * Any error will exit this routine as this is only a best 551 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 552 */ 553 static void 554 zfs_prefault_write(ssize_t n, struct uio *uio) 555 { 556 struct iovec *iov; 557 ulong_t cnt, incr; 558 caddr_t p; 559 uint8_t tmp; 560 561 iov = uio->uio_iov; 562 563 while (n) { 564 cnt = MIN(iov->iov_len, n); 565 if (cnt == 0) { 566 /* empty iov entry */ 567 iov++; 568 continue; 569 } 570 n -= cnt; 571 /* 572 * touch each page in this segment. 573 */ 574 p = iov->iov_base; 575 while (cnt) { 576 switch (uio->uio_segflg) { 577 case UIO_USERSPACE: 578 case UIO_USERISPACE: 579 if (fuword8(p, &tmp)) 580 return; 581 break; 582 case UIO_SYSSPACE: 583 if (kcopy(p, &tmp, 1)) 584 return; 585 break; 586 } 587 incr = MIN(cnt, PAGESIZE); 588 p += incr; 589 cnt -= incr; 590 } 591 /* 592 * touch the last byte in case it straddles a page. 593 */ 594 p--; 595 switch (uio->uio_segflg) { 596 case UIO_USERSPACE: 597 case UIO_USERISPACE: 598 if (fuword8(p, &tmp)) 599 return; 600 break; 601 case UIO_SYSSPACE: 602 if (kcopy(p, &tmp, 1)) 603 return; 604 break; 605 } 606 iov++; 607 } 608 } 609 610 /* 611 * Write the bytes to a file. 612 * 613 * IN: vp - vnode of file to be written to. 614 * uio - structure supplying write location, range info, 615 * and data buffer. 616 * ioflag - FAPPEND flag set if in append mode. 617 * cr - credentials of caller. 618 * ct - caller context (NFS/CIFS fem monitor only) 619 * 620 * OUT: uio - updated offset and range. 621 * 622 * RETURN: 0 if success 623 * error code if failure 624 * 625 * Timestamps: 626 * vp - ctime|mtime updated if byte count > 0 627 */ 628 /* ARGSUSED */ 629 static int 630 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 631 { 632 znode_t *zp = VTOZ(vp); 633 rlim64_t limit = uio->uio_llimit; 634 ssize_t start_resid = uio->uio_resid; 635 ssize_t tx_bytes; 636 uint64_t end_size; 637 dmu_tx_t *tx; 638 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 639 zilog_t *zilog; 640 offset_t woff; 641 ssize_t n, nbytes; 642 rl_t *rl; 643 int max_blksz = zfsvfs->z_max_blksz; 644 uint64_t pflags; 645 int error; 646 647 /* 648 * Fasttrack empty write 649 */ 650 n = start_resid; 651 if (n == 0) 652 return (0); 653 654 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 655 limit = MAXOFFSET_T; 656 657 ZFS_ENTER(zfsvfs); 658 ZFS_VERIFY_ZP(zp); 659 660 /* 661 * If immutable or not appending then return EPERM 662 */ 663 pflags = zp->z_phys->zp_flags; 664 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 665 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 666 (uio->uio_loffset < zp->z_phys->zp_size))) { 667 ZFS_EXIT(zfsvfs); 668 return (EPERM); 669 } 670 671 zilog = zfsvfs->z_log; 672 673 /* 674 * Pre-fault the pages to ensure slow (eg NFS) pages 675 * don't hold up txg. 676 */ 677 zfs_prefault_write(n, uio); 678 679 /* 680 * If in append mode, set the io offset pointer to eof. 681 */ 682 if (ioflag & FAPPEND) { 683 /* 684 * Range lock for a file append: 685 * The value for the start of range will be determined by 686 * zfs_range_lock() (to guarantee append semantics). 687 * If this write will cause the block size to increase, 688 * zfs_range_lock() will lock the entire file, so we must 689 * later reduce the range after we grow the block size. 690 */ 691 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 692 if (rl->r_len == UINT64_MAX) { 693 /* overlocked, zp_size can't change */ 694 woff = uio->uio_loffset = zp->z_phys->zp_size; 695 } else { 696 woff = uio->uio_loffset = rl->r_off; 697 } 698 } else { 699 woff = uio->uio_loffset; 700 /* 701 * Validate file offset 702 */ 703 if (woff < 0) { 704 ZFS_EXIT(zfsvfs); 705 return (EINVAL); 706 } 707 708 /* 709 * If we need to grow the block size then zfs_range_lock() 710 * will lock a wider range than we request here. 711 * Later after growing the block size we reduce the range. 712 */ 713 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 714 } 715 716 if (woff >= limit) { 717 zfs_range_unlock(rl); 718 ZFS_EXIT(zfsvfs); 719 return (EFBIG); 720 } 721 722 if ((woff + n) > limit || woff > (limit - n)) 723 n = limit - woff; 724 725 /* 726 * Check for mandatory locks 727 */ 728 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 729 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 730 zfs_range_unlock(rl); 731 ZFS_EXIT(zfsvfs); 732 return (error); 733 } 734 end_size = MAX(zp->z_phys->zp_size, woff + n); 735 736 /* 737 * Write the file in reasonable size chunks. Each chunk is written 738 * in a separate transaction; this keeps the intent log records small 739 * and allows us to do more fine-grained space accounting. 740 */ 741 while (n > 0) { 742 /* 743 * Start a transaction. 744 */ 745 woff = uio->uio_loffset; 746 tx = dmu_tx_create(zfsvfs->z_os); 747 dmu_tx_hold_bonus(tx, zp->z_id); 748 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 749 error = dmu_tx_assign(tx, zfsvfs->z_assign); 750 if (error) { 751 if (error == ERESTART && 752 zfsvfs->z_assign == TXG_NOWAIT) { 753 dmu_tx_wait(tx); 754 dmu_tx_abort(tx); 755 continue; 756 } 757 dmu_tx_abort(tx); 758 break; 759 } 760 761 /* 762 * If zfs_range_lock() over-locked we grow the blocksize 763 * and then reduce the lock range. This will only happen 764 * on the first iteration since zfs_range_reduce() will 765 * shrink down r_len to the appropriate size. 766 */ 767 if (rl->r_len == UINT64_MAX) { 768 uint64_t new_blksz; 769 770 if (zp->z_blksz > max_blksz) { 771 ASSERT(!ISP2(zp->z_blksz)); 772 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 773 } else { 774 new_blksz = MIN(end_size, max_blksz); 775 } 776 zfs_grow_blocksize(zp, new_blksz, tx); 777 zfs_range_reduce(rl, woff, n); 778 } 779 780 /* 781 * XXX - should we really limit each write to z_max_blksz? 782 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 783 */ 784 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 785 rw_enter(&zp->z_map_lock, RW_READER); 786 787 tx_bytes = uio->uio_resid; 788 if (vn_has_cached_data(vp)) { 789 rw_exit(&zp->z_map_lock); 790 error = mappedwrite(vp, nbytes, uio, tx); 791 } else { 792 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 793 uio, nbytes, tx); 794 rw_exit(&zp->z_map_lock); 795 } 796 tx_bytes -= uio->uio_resid; 797 798 /* 799 * If we made no progress, we're done. If we made even 800 * partial progress, update the znode and ZIL accordingly. 801 */ 802 if (tx_bytes == 0) { 803 dmu_tx_commit(tx); 804 ASSERT(error != 0); 805 break; 806 } 807 808 /* 809 * Clear Set-UID/Set-GID bits on successful write if not 810 * privileged and at least one of the excute bits is set. 811 * 812 * It would be nice to to this after all writes have 813 * been done, but that would still expose the ISUID/ISGID 814 * to another app after the partial write is committed. 815 * 816 * Note: we don't call zfs_fuid_map_id() here because 817 * user 0 is not an ephemeral uid. 818 */ 819 mutex_enter(&zp->z_acl_lock); 820 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 821 (S_IXUSR >> 6))) != 0 && 822 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 823 secpolicy_vnode_setid_retain(cr, 824 (zp->z_phys->zp_mode & S_ISUID) != 0 && 825 zp->z_phys->zp_uid == 0) != 0) { 826 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 827 } 828 mutex_exit(&zp->z_acl_lock); 829 830 /* 831 * Update time stamp. NOTE: This marks the bonus buffer as 832 * dirty, so we don't have to do it again for zp_size. 833 */ 834 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 835 836 /* 837 * Update the file size (zp_size) if it has changed; 838 * account for possible concurrent updates. 839 */ 840 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 841 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 842 uio->uio_loffset); 843 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 844 dmu_tx_commit(tx); 845 846 if (error != 0) 847 break; 848 ASSERT(tx_bytes == nbytes); 849 n -= nbytes; 850 } 851 852 zfs_range_unlock(rl); 853 854 /* 855 * If we're in replay mode, or we made no progress, return error. 856 * Otherwise, it's at least a partial write, so it's successful. 857 */ 858 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 859 ZFS_EXIT(zfsvfs); 860 return (error); 861 } 862 863 if (ioflag & (FSYNC | FDSYNC)) 864 zil_commit(zilog, zp->z_last_itx, zp->z_id); 865 866 ZFS_EXIT(zfsvfs); 867 return (0); 868 } 869 870 void 871 zfs_get_done(dmu_buf_t *db, void *vzgd) 872 { 873 zgd_t *zgd = (zgd_t *)vzgd; 874 rl_t *rl = zgd->zgd_rl; 875 vnode_t *vp = ZTOV(rl->r_zp); 876 877 dmu_buf_rele(db, vzgd); 878 zfs_range_unlock(rl); 879 VN_RELE(vp); 880 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 881 kmem_free(zgd, sizeof (zgd_t)); 882 } 883 884 /* 885 * Get data to generate a TX_WRITE intent log record. 886 */ 887 int 888 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 889 { 890 zfsvfs_t *zfsvfs = arg; 891 objset_t *os = zfsvfs->z_os; 892 znode_t *zp; 893 uint64_t off = lr->lr_offset; 894 dmu_buf_t *db; 895 rl_t *rl; 896 zgd_t *zgd; 897 int dlen = lr->lr_length; /* length of user data */ 898 int error = 0; 899 900 ASSERT(zio); 901 ASSERT(dlen != 0); 902 903 /* 904 * Nothing to do if the file has been removed 905 */ 906 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 907 return (ENOENT); 908 if (zp->z_unlinked) { 909 VN_RELE(ZTOV(zp)); 910 return (ENOENT); 911 } 912 913 /* 914 * Write records come in two flavors: immediate and indirect. 915 * For small writes it's cheaper to store the data with the 916 * log record (immediate); for large writes it's cheaper to 917 * sync the data and get a pointer to it (indirect) so that 918 * we don't have to write the data twice. 919 */ 920 if (buf != NULL) { /* immediate write */ 921 rl = zfs_range_lock(zp, off, dlen, RL_READER); 922 /* test for truncation needs to be done while range locked */ 923 if (off >= zp->z_phys->zp_size) { 924 error = ENOENT; 925 goto out; 926 } 927 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 928 } else { /* indirect write */ 929 uint64_t boff; /* block starting offset */ 930 931 /* 932 * Have to lock the whole block to ensure when it's 933 * written out and it's checksum is being calculated 934 * that no one can change the data. We need to re-check 935 * blocksize after we get the lock in case it's changed! 936 */ 937 for (;;) { 938 if (ISP2(zp->z_blksz)) { 939 boff = P2ALIGN_TYPED(off, zp->z_blksz, 940 uint64_t); 941 } else { 942 boff = 0; 943 } 944 dlen = zp->z_blksz; 945 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 946 if (zp->z_blksz == dlen) 947 break; 948 zfs_range_unlock(rl); 949 } 950 /* test for truncation needs to be done while range locked */ 951 if (off >= zp->z_phys->zp_size) { 952 error = ENOENT; 953 goto out; 954 } 955 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 956 zgd->zgd_rl = rl; 957 zgd->zgd_zilog = zfsvfs->z_log; 958 zgd->zgd_bp = &lr->lr_blkptr; 959 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 960 ASSERT(boff == db->db_offset); 961 lr->lr_blkoff = off - boff; 962 error = dmu_sync(zio, db, &lr->lr_blkptr, 963 lr->lr_common.lrc_txg, zfs_get_done, zgd); 964 ASSERT((error && error != EINPROGRESS) || 965 lr->lr_length <= zp->z_blksz); 966 if (error == 0) 967 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 968 /* 969 * If we get EINPROGRESS, then we need to wait for a 970 * write IO initiated by dmu_sync() to complete before 971 * we can release this dbuf. We will finish everything 972 * up in the zfs_get_done() callback. 973 */ 974 if (error == EINPROGRESS) 975 return (0); 976 dmu_buf_rele(db, zgd); 977 kmem_free(zgd, sizeof (zgd_t)); 978 } 979 out: 980 zfs_range_unlock(rl); 981 VN_RELE(ZTOV(zp)); 982 return (error); 983 } 984 985 /*ARGSUSED*/ 986 static int 987 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 988 caller_context_t *ct) 989 { 990 znode_t *zp = VTOZ(vp); 991 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 992 int error; 993 994 ZFS_ENTER(zfsvfs); 995 ZFS_VERIFY_ZP(zp); 996 997 if (flag & V_ACE_MASK) 998 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 999 else 1000 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1001 1002 ZFS_EXIT(zfsvfs); 1003 return (error); 1004 } 1005 1006 /* 1007 * Lookup an entry in a directory, or an extended attribute directory. 1008 * If it exists, return a held vnode reference for it. 1009 * 1010 * IN: dvp - vnode of directory to search. 1011 * nm - name of entry to lookup. 1012 * pnp - full pathname to lookup [UNUSED]. 1013 * flags - LOOKUP_XATTR set if looking for an attribute. 1014 * rdir - root directory vnode [UNUSED]. 1015 * cr - credentials of caller. 1016 * ct - caller context 1017 * direntflags - directory lookup flags 1018 * realpnp - returned pathname. 1019 * 1020 * OUT: vpp - vnode of located entry, NULL if not found. 1021 * 1022 * RETURN: 0 if success 1023 * error code if failure 1024 * 1025 * Timestamps: 1026 * NA 1027 */ 1028 /* ARGSUSED */ 1029 static int 1030 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1031 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1032 int *direntflags, pathname_t *realpnp) 1033 { 1034 znode_t *zdp = VTOZ(dvp); 1035 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1036 int error; 1037 1038 ZFS_ENTER(zfsvfs); 1039 ZFS_VERIFY_ZP(zdp); 1040 1041 *vpp = NULL; 1042 1043 if (flags & LOOKUP_XATTR) { 1044 /* 1045 * If the xattr property is off, refuse the lookup request. 1046 */ 1047 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1048 ZFS_EXIT(zfsvfs); 1049 return (EINVAL); 1050 } 1051 1052 /* 1053 * We don't allow recursive attributes.. 1054 * Maybe someday we will. 1055 */ 1056 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1057 ZFS_EXIT(zfsvfs); 1058 return (EINVAL); 1059 } 1060 1061 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1062 ZFS_EXIT(zfsvfs); 1063 return (error); 1064 } 1065 1066 /* 1067 * Do we have permission to get into attribute directory? 1068 */ 1069 1070 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1071 B_FALSE, cr)) { 1072 VN_RELE(*vpp); 1073 *vpp = NULL; 1074 } 1075 1076 ZFS_EXIT(zfsvfs); 1077 return (error); 1078 } 1079 1080 if (dvp->v_type != VDIR) { 1081 ZFS_EXIT(zfsvfs); 1082 return (ENOTDIR); 1083 } 1084 1085 /* 1086 * Check accessibility of directory. 1087 */ 1088 1089 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1090 ZFS_EXIT(zfsvfs); 1091 return (error); 1092 } 1093 1094 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1095 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1096 ZFS_EXIT(zfsvfs); 1097 return (EILSEQ); 1098 } 1099 1100 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1101 if (error == 0) { 1102 /* 1103 * Convert device special files 1104 */ 1105 if (IS_DEVVP(*vpp)) { 1106 vnode_t *svp; 1107 1108 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1109 VN_RELE(*vpp); 1110 if (svp == NULL) 1111 error = ENOSYS; 1112 else 1113 *vpp = svp; 1114 } 1115 } 1116 1117 ZFS_EXIT(zfsvfs); 1118 return (error); 1119 } 1120 1121 /* 1122 * Attempt to create a new entry in a directory. If the entry 1123 * already exists, truncate the file if permissible, else return 1124 * an error. Return the vp of the created or trunc'd file. 1125 * 1126 * IN: dvp - vnode of directory to put new file entry in. 1127 * name - name of new file entry. 1128 * vap - attributes of new file. 1129 * excl - flag indicating exclusive or non-exclusive mode. 1130 * mode - mode to open file with. 1131 * cr - credentials of caller. 1132 * flag - large file flag [UNUSED]. 1133 * ct - caller context 1134 * vsecp - ACL to be set 1135 * 1136 * OUT: vpp - vnode of created or trunc'd entry. 1137 * 1138 * RETURN: 0 if success 1139 * error code if failure 1140 * 1141 * Timestamps: 1142 * dvp - ctime|mtime updated if new entry created 1143 * vp - ctime|mtime always, atime if new 1144 */ 1145