1 /*-
2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
3 *
4 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
5 * All rights reserved.
6 *
7 * This software was developed for the FreeBSD Project by Marshall
8 * Kirk McKusick and Network Associates Laboratories, the Security
9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11 * research program
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Copyright (c) 1982, 1986, 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
62 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
63 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
64 */
65
66 #include <sys/cdefs.h>
67 #include "opt_directio.h"
68 #include "opt_ffs.h"
69 #include "opt_ufs.h"
70
71 #include <sys/param.h>
72 #include <sys/bio.h>
73 #include <sys/systm.h>
74 #include <sys/buf.h>
75 #include <sys/conf.h>
76 #include <sys/extattr.h>
77 #include <sys/kernel.h>
78 #include <sys/limits.h>
79 #include <sys/malloc.h>
80 #include <sys/mount.h>
81 #include <sys/priv.h>
82 #include <sys/rwlock.h>
83 #include <sys/stat.h>
84 #include <sys/sysctl.h>
85 #include <sys/vmmeter.h>
86 #include <sys/vnode.h>
87
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_object.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_pager.h>
94 #include <vm/vnode_pager.h>
95
96 #include <ufs/ufs/extattr.h>
97 #include <ufs/ufs/quota.h>
98 #include <ufs/ufs/inode.h>
99 #include <ufs/ufs/ufs_extern.h>
100 #include <ufs/ufs/ufsmount.h>
101 #include <ufs/ufs/dir.h>
102 #ifdef UFS_DIRHASH
103 #include <ufs/ufs/dirhash.h>
104 #endif
105
106 #include <ufs/ffs/fs.h>
107 #include <ufs/ffs/ffs_extern.h>
108
/*
 * Evaluates to non-zero when the address in ptr is a whole multiple of the
 * alignment the compiler requires for type s.  C alignments are powers of
 * two, so the remainder test is equivalent to masking the low bits.
 */
#define	ALIGNED_TO(ptr, s)						\
	((uintptr_t)(ptr) % _Alignof(s) == 0)
111
112 #ifdef DIRECTIO
113 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
114 #endif
115 static vop_fdatasync_t ffs_fdatasync;
116 static vop_fsync_t ffs_fsync;
117 static vop_getpages_t ffs_getpages;
118 static vop_getpages_async_t ffs_getpages_async;
119 static vop_lock1_t ffs_lock;
120 #ifdef INVARIANTS
121 static vop_unlock_t ffs_unlock_debug;
122 #endif
123 static vop_read_t ffs_read;
124 static vop_write_t ffs_write;
125 static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
126 static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
127 struct ucred *cred);
128 static vop_strategy_t ffsext_strategy;
129 static vop_closeextattr_t ffs_closeextattr;
130 static vop_deleteextattr_t ffs_deleteextattr;
131 static vop_getextattr_t ffs_getextattr;
132 static vop_listextattr_t ffs_listextattr;
133 static vop_openextattr_t ffs_openextattr;
134 static vop_setextattr_t ffs_setextattr;
135 static vop_vptofh_t ffs_vptofh;
136 static vop_vput_pair_t ffs_vput_pair;
137
138 vop_fplookup_vexec_t ufs_fplookup_vexec;
139
140 /* Global vfs data structures for ufs. */
141 struct vop_vector ffs_vnodeops1 = {
142 .vop_default = &ufs_vnodeops,
143 .vop_fsync = ffs_fsync,
144 .vop_fdatasync = ffs_fdatasync,
145 .vop_getpages = ffs_getpages,
146 .vop_getpages_async = ffs_getpages_async,
147 .vop_lock1 = ffs_lock,
148 #ifdef INVARIANTS
149 .vop_unlock = ffs_unlock_debug,
150 #endif
151 .vop_read = ffs_read,
152 .vop_reallocblks = ffs_reallocblks,
153 .vop_write = ffs_write,
154 .vop_vptofh = ffs_vptofh,
155 .vop_vput_pair = ffs_vput_pair,
156 .vop_fplookup_vexec = ufs_fplookup_vexec,
157 .vop_fplookup_symlink = VOP_EAGAIN,
158 };
159 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
160
161 struct vop_vector ffs_fifoops1 = {
162 .vop_default = &ufs_fifoops,
163 .vop_fsync = ffs_fsync,
164 .vop_fdatasync = ffs_fdatasync,
165 .vop_lock1 = ffs_lock,
166 #ifdef INVARIANTS
167 .vop_unlock = ffs_unlock_debug,
168 #endif
169 .vop_vptofh = ffs_vptofh,
170 .vop_fplookup_vexec = VOP_EAGAIN,
171 .vop_fplookup_symlink = VOP_EAGAIN,
172 };
173 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
174
175 /* Global vfs data structures for ufs. */
176 struct vop_vector ffs_vnodeops2 = {
177 .vop_default = &ufs_vnodeops,
178 .vop_fsync = ffs_fsync,
179 .vop_fdatasync = ffs_fdatasync,
180 .vop_getpages = ffs_getpages,
181 .vop_getpages_async = ffs_getpages_async,
182 .vop_lock1 = ffs_lock,
183 #ifdef INVARIANTS
184 .vop_unlock = ffs_unlock_debug,
185 #endif
186 .vop_read = ffs_read,
187 .vop_reallocblks = ffs_reallocblks,
188 .vop_write = ffs_write,
189 .vop_closeextattr = ffs_closeextattr,
190 .vop_deleteextattr = ffs_deleteextattr,
191 .vop_getextattr = ffs_getextattr,
192 .vop_listextattr = ffs_listextattr,
193 .vop_openextattr = ffs_openextattr,
194 .vop_setextattr = ffs_setextattr,
195 .vop_vptofh = ffs_vptofh,
196 .vop_vput_pair = ffs_vput_pair,
197 .vop_fplookup_vexec = ufs_fplookup_vexec,
198 .vop_fplookup_symlink = VOP_EAGAIN,
199 };
200 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
201
202 struct vop_vector ffs_fifoops2 = {
203 .vop_default = &ufs_fifoops,
204 .vop_fsync = ffs_fsync,
205 .vop_fdatasync = ffs_fdatasync,
206 .vop_lock1 = ffs_lock,
207 #ifdef INVARIANTS
208 .vop_unlock = ffs_unlock_debug,
209 #endif
210 .vop_reallocblks = ffs_reallocblks,
211 .vop_strategy = ffsext_strategy,
212 .vop_closeextattr = ffs_closeextattr,
213 .vop_deleteextattr = ffs_deleteextattr,
214 .vop_getextattr = ffs_getextattr,
215 .vop_listextattr = ffs_listextattr,
216 .vop_openextattr = ffs_openextattr,
217 .vop_setextattr = ffs_setextattr,
218 .vop_vptofh = ffs_vptofh,
219 .vop_fplookup_vexec = VOP_EAGAIN,
220 .vop_fplookup_symlink = VOP_EAGAIN,
221 };
222 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
223
224 /*
225 * Synch an open file.
226 */
227 /* ARGSUSED */
228 static int
ffs_fsync(struct vop_fsync_args * ap)229 ffs_fsync(struct vop_fsync_args *ap)
230 {
231 struct vnode *vp;
232 struct bufobj *bo;
233 int error;
234
235 vp = ap->a_vp;
236 bo = &vp->v_bufobj;
237 retry:
238 error = ffs_syncvnode(vp, ap->a_waitfor, 0);
239 if (error)
240 return (error);
241 if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
242 error = softdep_fsync(vp);
243 if (error)
244 return (error);
245
246 /*
247 * The softdep_fsync() function may drop vp lock,
248 * allowing for dirty buffers to reappear on the
249 * bo_dirty list. Recheck and resync as needed.
250 */
251 BO_LOCK(bo);
252 if ((vp->v_type == VREG || vp->v_type == VDIR) &&
253 (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
254 BO_UNLOCK(bo);
255 goto retry;
256 }
257 BO_UNLOCK(bo);
258 }
259 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
260 return (ENXIO);
261 return (0);
262 }
263
264 int
ffs_syncvnode(struct vnode * vp,int waitfor,int flags)265 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
266 {
267 struct inode *ip;
268 struct bufobj *bo;
269 struct ufsmount *ump;
270 struct buf *bp, *nbp;
271 ufs_lbn_t lbn;
272 int error, passes, wflag;
273 bool still_dirty, unlocked, wait;
274
275 ip = VTOI(vp);
276 bo = &vp->v_bufobj;
277 ump = VFSTOUFS(vp->v_mount);
278 #ifdef WITNESS
279 wflag = IS_SNAPSHOT(ip) ? LK_NOWITNESS : 0;
280 #else
281 wflag = 0;
282 #endif
283
284 /*
285 * When doing MNT_WAIT we must first flush all dependencies
286 * on the inode.
287 */
288 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
289 (error = softdep_sync_metadata(vp)) != 0) {
290 if (ffs_fsfail_cleanup(ump, error))
291 error = 0;
292 return (error);
293 }
294
295 /*
296 * Flush all dirty buffers associated with a vnode.
297 */
298 error = 0;
299 passes = 0;
300 wait = false; /* Always do an async pass first. */
301 unlocked = false;
302 lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
303 BO_LOCK(bo);
304 loop:
305 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
306 bp->b_vflags &= ~BV_SCANNED;
307 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
308 /*
309 * Reasons to skip this buffer: it has already been considered
310 * on this pass, the buffer has dependencies that will cause
311 * it to be redirtied and it has not already been deferred,
312 * or it is already being written.
313 */
314 if ((bp->b_vflags & BV_SCANNED) != 0)
315 continue;
316 bp->b_vflags |= BV_SCANNED;
317 /*
318 * Flush indirects in order, if requested.
319 *
320 * Note that if only datasync is requested, we can
321 * skip indirect blocks when softupdates are not
322 * active. Otherwise we must flush them with data,
323 * since dependencies prevent data block writes.
324 */
325 if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
326 (lbn_level(bp->b_lblkno) >= passes ||
327 ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
328 continue;
329 if (bp->b_lblkno > lbn)
330 panic("ffs_syncvnode: syncing truncated data.");
331 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
332 BO_UNLOCK(bo);
333 } else if (wait) {
334 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
335 LK_INTERLOCK | wflag, BO_LOCKPTR(bo)) != 0) {
336 BO_LOCK(bo);
337 bp->b_vflags &= ~BV_SCANNED;
338 goto next_locked;
339 }
340 } else
341 continue;
342 if ((bp->b_flags & B_DELWRI) == 0)
343 panic("ffs_fsync: not dirty");
344 /*
345 * Check for dependencies and potentially complete them.
346 */
347 if (!LIST_EMPTY(&bp->b_dep) &&
348 (error = softdep_sync_buf(vp, bp,
349 wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
350 /*
351 * Lock order conflict, buffer was already unlocked,
352 * and vnode possibly unlocked.
353 */
354 if (error == ERELOOKUP) {
355 if (vp->v_data == NULL)
356 return (EBADF);
357 unlocked = true;
358 if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
359 (error = softdep_sync_metadata(vp)) != 0) {
360 if (ffs_fsfail_cleanup(ump, error))
361 error = 0;
362 return (unlocked && error == 0 ?
363 ERELOOKUP : error);
364 }
365 /* Re-evaluate inode size */
366 lbn = lblkno(ITOFS(ip), (ip->i_size +
367 ITOFS(ip)->fs_bsize - 1));
368 goto next;
369 }
370 /* I/O error. */
371 if (error != EBUSY) {
372 BUF_UNLOCK(bp);
373 return (error);
374 }
375 /* If we deferred once, don't defer again. */
376 if ((bp->b_flags & B_DEFERRED) == 0) {
377 bp->b_flags |= B_DEFERRED;
378 BUF_UNLOCK(bp);
379 goto next;
380 }
381 }
382 if (wait) {
383 bremfree(bp);
384 error = bwrite(bp);
385 if (ffs_fsfail_cleanup(ump, error))
386 error = 0;
387 if (error != 0)
388 return (error);
389 } else if ((bp->b_flags & B_CLUSTEROK)) {
390 (void) vfs_bio_awrite(bp);
391 } else {
392 bremfree(bp);
393 (void) bawrite(bp);
394 }
395 next:
396 /*
397 * Since we may have slept during the I/O, we need
398 * to start from a known point.
399 */
400 BO_LOCK(bo);
401 next_locked:
402 nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
403 }
404 if (waitfor != MNT_WAIT) {
405 BO_UNLOCK(bo);
406 if ((flags & NO_INO_UPDT) != 0)
407 return (unlocked ? ERELOOKUP : 0);
408 error = ffs_update(vp, 0);
409 if (error == 0 && unlocked)
410 error = ERELOOKUP;
411 return (error);
412 }
413 /* Drain IO to see if we're done. */
414 bufobj_wwait(bo, 0, 0);
415 /*
416 * Block devices associated with filesystems may have new I/O
417 * requests posted for them even if the vnode is locked, so no
418 * amount of trying will get them clean. We make several passes
419 * as a best effort.
420 *
421 * Regular files may need multiple passes to flush all dependency
422 * work as it is possible that we must write once per indirect
423 * level, once for the leaf, and once for the inode and each of
424 * these will be done with one sync and one async pass.
425 */
426 if (bo->bo_dirty.bv_cnt > 0) {
427 if ((flags & DATA_ONLY) == 0) {
428 still_dirty = true;
429 } else {
430 /*
431 * For data-only sync, dirty indirect buffers
432 * are ignored.
433 */
434 still_dirty = false;
435 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
436 if (bp->b_lblkno > -UFS_NDADDR) {
437 still_dirty = true;
438 break;
439 }
440 }
441 }
442
443 if (still_dirty) {
444 /* Write the inode after sync passes to flush deps. */
445 if (wait && DOINGSOFTDEP(vp) &&
446 (flags & NO_INO_UPDT) == 0) {
447 BO_UNLOCK(bo);
448 ffs_update(vp, 1);
449 BO_LOCK(bo);
450 }
451 /* switch between sync/async. */
452 wait = !wait;
453 if (wait || ++passes < UFS_NIADDR + 2)
454 goto loop;
455 }
456 }
457 BO_UNLOCK(bo);
458 error = 0;
459 if ((flags & DATA_ONLY) == 0) {
460 if ((flags & NO_INO_UPDT) == 0)
461 error = ffs_update(vp, 1);
462 if (DOINGSUJ(vp))
463 softdep_journal_fsync(VTOI(vp));
464 } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
465 error = ffs_update(vp, 1);
466 }
467 if (error == 0 && unlocked)
468 error = ERELOOKUP;
469 if (error == 0)
470 ip->i_flag &= ~IN_NEEDSYNC;
471 return (error);
472 }
473
474 static int
ffs_fdatasync(struct vop_fdatasync_args * ap)475 ffs_fdatasync(struct vop_fdatasync_args *ap)
476 {
477
478 return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
479 }
480
/*
 * VOP_LOCK1 handler.
 *
 * When LK_NODDLKTREAT is set in a_flags, LK_ADAPTIVE is added before
 * anything else happens.
 *
 * With snapshot support compiled in, shared, upgrade and exclusive requests
 * are issued directly against vp->v_vnlock with lockmgr_lock_flags().  If
 * v_vnlock points at a different lock once the request succeeds, the lock
 * just obtained is dropped and the request is retried against the new one.
 * All other request types, and every request when NO_FFS_SNAPSHOT is
 * defined, are passed to VOP_LOCK1_APV(&ufs_vnodeops, ap).
 *
 * Under DIAGNOSTIC the inode's i_lock_gen is bumped after a successful,
 * non-recursive upgrade or exclusive acquisition, and ufs_unlock_tracker()
 * is called before a downgrade.
 *
 * Returns the result of the underlying lock operation, or EBUSY when the
 * lock pointer changed and the caller passed both LK_INTERLOCK and
 * LK_NOWAIT.
 */
static int
ffs_lock(
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		char *file;
		int line;
	} */ *ap)
{
#if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC)
	struct vnode *vp = ap->a_vp;
#endif	/* !NO_FFS_SNAPSHOT || DIAGNOSTIC */
#ifdef DIAGNOSTIC
	struct inode *ip;
#endif	/* DIAGNOSTIC */
	int result;
#ifndef NO_FFS_SNAPSHOT
	int flags;
	struct lock *lkp;

	/*
	 * Adaptive spinning mixed with SU leads to trouble. use a giant hammer
	 * and only use it when LK_NODDLKTREAT is set. Currently this means it
	 * is only used during path lookup.
	 */
	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
		ap->a_flags |= LK_ADAPTIVE;
	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		/* Work on a copy: the retry path below rewrites the flags. */
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			VNPASS(vp->v_holdcnt != 0, vp);
#endif	/* DEBUG_VFS_LOCKS */
			/* Remember which lock we asked for ... */
			lkp = vp->v_vnlock;
			result = lockmgr_lock_flags(lkp, flags,
			    &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
			/* ... and stop if it is still the vnode's lock or on error. */
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			lockmgr_unlock(lkp);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			/* The old lock is gone, so an upgrade becomes a plain exclusive request. */
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			/* The retry must not pass LK_INTERLOCK again. */
			flags &= ~LK_INTERLOCK;
		}
#ifdef DIAGNOSTIC
		/*
		 * NOTE(review): the same i_lock_gen bump is repeated by the
		 * DIAGNOSTIC switch at the end of the function, so on this
		 * path the counter advances twice -- confirm this is intended.
		 */
		switch (ap->a_flags & LK_TYPE_MASK) {
		case LK_UPGRADE:
		case LK_EXCLUSIVE:
			if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
				ip = VTOI(vp);
				if (ip != NULL)
					ip->i_lock_gen++;
			}
		}
#endif	/* DIAGNOSTIC */
		break;
	default:
#ifdef DIAGNOSTIC
		if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
			ip = VTOI(vp);
			if (ip != NULL)
				ufs_unlock_tracker(ip);
		}
#endif	/* DIAGNOSTIC */
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
		break;
	}
#else /* NO_FFS_SNAPSHOT */
	/*
	 * See above for an explanation.
	 */
	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
		ap->a_flags |= LK_ADAPTIVE;
#ifdef DIAGNOSTIC
	if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
		ip = VTOI(vp);
		if (ip != NULL)
			ufs_unlock_tracker(ip);
	}
#endif	/* DIAGNOSTIC */
	result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
#endif /* NO_FFS_SNAPSHOT */
#ifdef DIAGNOSTIC
	/* Count a successful, non-recursive upgrade or exclusive acquisition. */
	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
			ip = VTOI(vp);
			if (ip != NULL)
				ip->i_lock_gen++;
		}
	}
#endif	/* DIAGNOSTIC */
	return (result);
}
589
#ifdef INVARIANTS
/*
 * VOP_UNLOCK handler installed only in INVARIANTS kernels.  It runs a few
 * consistency checks on the inode and then hands the unlock to
 * VOP_UNLOCK_APV(&ufs_vnodeops, ap), whose result it returns.
 *
 * Checks performed:
 *  - an inode carrying any of the UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE bits
 *    must have VMP_LAZYLIST set on its vnode;
 *  - a directory must not be unlocked (non-recursively) with IN_ENDOFF set;
 *  - under DIAGNOSTIC, ufs_unlock_tracker() is called when a non-recursive
 *    exclusive lock is about to be released.
 */
static int
ffs_unlock_debug(struct vop_unlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);

	/*
	 * The unlocked test is only a hint; the assertion itself is
	 * evaluated with the vnode interlock held.
	 */
	if ((ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) != 0 &&
	    (vp->v_mflag & VMP_LAZYLIST) == 0) {
		VI_LOCK(vp);
		VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
		    ("%s: modified vnode (%x) not on lazy list",
		    __func__, ip->i_flag));
		VI_UNLOCK(vp);
	}

	KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 ||
	    (ip->i_flag & IN_ENDOFF) == 0,
	    ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag));

#ifdef DIAGNOSTIC
	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
	    vp->v_vnlock->lk_recurse == 0)
		ufs_unlock_tracker(ip);
#endif

	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
}
#endif
619
620 static int
ffs_read_hole(struct uio * uio,long xfersize,long * size)621 ffs_read_hole(struct uio *uio, long xfersize, long *size)
622 {
623 ssize_t saved_resid, tlen;
624 int error;
625
626 while (xfersize > 0) {
627 tlen = min(xfersize, ZERO_REGION_SIZE);
628 saved_resid = uio->uio_resid;
629 error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
630 tlen, uio);
631 if (error != 0)
632 return (error);
633 tlen = saved_resid - uio->uio_resid;
634 xfersize -= tlen;
635 *size -= tlen;
636 }
637 return (0);
638 }
639
/*
 * Vnode op for reading.
 *
 * Copies data from the file behind a_vp into a_uio, one filesystem block
 * (or the remaining part of one) per loop iteration, until the request is
 * satisfied, the end of the file is reached, or an error occurs.
 *
 * a_ioflag: IO_EXT is not supported here (panics unless built with
 * "notyet"); with DIRECTIO kernels IO_DIRECT first tries ffs_rawread();
 * the bits above IO_SEQSHIFT carry the sequential-access hint passed on
 * to the clustering code.
 *
 * Returns 0 on success (including a zero-length request or a read at or
 * past end of file), EOVERFLOW when the starting offset is inside the file
 * but at or beyond fs_maxfilesize, or the error from the buffer read or
 * the copy to the caller.  The inode is flagged IN_ACCESS when the read
 * succeeded or moved any data, unless the mount is MNT_NOATIME or
 * MNT_RDONLY.
 */
static int
ffs_read(
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap)
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int bflag, error, ioflag, seqcount;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		/*
		 * Finished if the raw path failed or reports that it did
		 * the work; otherwise fall through to the buffered path.
		 */
		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	/*
	 * Ask for unmapped buffers.  Unless the caller wants no copy at all
	 * (UIO_NOCOPY), also pass GB_NOSPARSE; a read that then comes back
	 * with EJUSTRETURN is served from the zero region by ffs_read_hole().
	 */
	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
#ifdef WITNESS
	bflag |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
#endif
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		/* Stop at end of file. */
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, bflag, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		}
		/*
		 * EJUSTRETURN: no buffer was produced for this block, so
		 * hand the caller zeroes instead and move to the next block.
		 */
		if (error == EJUSTRETURN) {
			error = ffs_read_hole(uio, xfersize, &size);
			if (error == 0)
				continue;
		}
		if (error != 0) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		/* Copy from the mapping if there is one, else page by page. */
		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages,
			    blkoffset + (bp->b_offset & PAGE_MASK),
			    (int)xfersize, uio);
		}
		if (error)
			break;

		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);

	/* Record the access if anything was read or the read succeeded. */
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
	return (error);
}
830
831 /*
832 * Vnode op for writing.
833 */
834 static int
ffs_write(struct vop_write_args * ap)835 ffs_write(
836 struct vop_write_args /* {
837 struct vnode *a_vp;
838 struct uio *a_uio;
839 int a_ioflag;
840 struct ucred *a_cred;
841 } */ *ap)
842 {
843 struct vnode *vp;
844 struct uio *uio;
845 struct inode *ip;
846 struct fs *fs;
847 struct buf *bp;
848 ufs_lbn_t lbn;
849 off_t osize;
850 ssize_t resid, r;
851 int seqcount;
852 int blkoffset, error, flags, ioflag, size, xfersize;
853
854 vp = ap->a_vp;
855 if (DOINGSUJ(vp))
856 softdep_prealloc(vp, MNT_WAIT);
857 if (vp->v_data == NULL)
858 return (EBADF);
859
860 uio = ap->a_uio;
861 ioflag = ap->a_ioflag;
862 if (ap->a_ioflag & IO_EXT)
863 #ifdef notyet
864 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
865 #else
866 panic("ffs_write+IO_EXT");
867 #endif
868
869 seqcount = ap->a_ioflag >> IO_SEQSHIFT;
870 ip = VTOI(vp);
871
872 #ifdef INVARIANTS
873 if (uio->uio_rw != UIO_WRITE)
874 panic("ffs_write: mode");
875 #endif
876
877 switch (vp->v_type) {
878 case VREG:
879 if (ioflag & IO_APPEND)
880 uio->uio_offset = ip->i_size;
881 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
882 return (EPERM);
883 /* FALLTHROUGH */
884 case VLNK:
885 break;
886 case VDIR:
887 panic("ffs_write: dir write");
888 break;
889 default:
890 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
891 (int)uio->uio_offset,
892 (int)uio->uio_resid
893 );
894 }
895
896 KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
897 KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
898 fs = ITOFS(ip);
899
900 /*
901 * Maybe this should be above the vnode op call, but so long as
902 * file servers have no limits, I don't think it matters.
903 */
904 error = vn_rlimit_fsizex(vp, uio, fs->fs_maxfilesize, &r,
905 uio->uio_td);
906 if (error != 0) {
907 vn_rlimit_fsizex_res(uio, r);
908 return (error);
909 }
910
911 resid = uio->uio_resid;
912 osize = ip->i_size;
913 if (seqcount > BA_SEQMAX)
914 flags = BA_SEQMAX << BA_SEQSHIFT;
915 else
916 flags = seqcount << BA_SEQSHIFT;
917 if (ioflag & IO_SYNC)
918 flags |= IO_SYNC;
919 flags |= BA_UNMAPPED;
920
921 for (error = 0; uio->uio_resid > 0;) {
922 lbn = lblkno(fs, uio->uio_offset);
923 blkoffset = blkoff(fs, uio->uio_offset);
924 xfersize = fs->fs_bsize - blkoffset;
925 if (uio->uio_resid < xfersize)
926 xfersize = uio->uio_resid;
927 if (uio->uio_offset + xfersize > ip->i_size)
928 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
929
930 /*
931 * We must perform a read-before-write if the transfer size
932 * does not cover the entire buffer.
933 */
934 if (fs->fs_bsize > xfersize)
935 flags |= BA_CLRBUF;
936 else
937 flags &= ~BA_CLRBUF;
938 /* XXX is uio->uio_offset the right thing here? */
939 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
940 ap->a_cred, flags, &bp);
941 if (error != 0) {
942 vnode_pager_setsize(vp, ip->i_size);
943 break;
944 }
945 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
946 bp->b_flags |= B_NOCACHE;
947
948 if (uio->uio_offset + xfersize > ip->i_size) {
949 ip->i_size = uio->uio_offset + xfersize;
950 DIP_SET(ip, i_size, ip->i_size);
951 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
952 }
953
954 size = blksize(fs, ip, lbn) - bp->b_resid;
955 if (size < xfersize)
956 xfersize = size;
957
958 if (buf_mapped(bp)) {
959 error = vn_io_fault_uiomove((char *)bp->b_data +
960 blkoffset, (int)xfersize, uio);
961 } else {
962 error = vn_io_fault_pgmove(bp->b_pages,
963 blkoffset + (bp->b_offset & PAGE_MASK),
964 (int)xfersize, uio);
965 }
966 /*
967 * If the buffer is not already filled and we encounter an
968 * error while trying to fill it, we have to clear out any
969 * garbage data from the pages instantiated for the buffer.
970 * If we do not, a failed uiomove() during a write can leave
971 * the prior contents of the pages exposed to a userland mmap.
972 *
973 * Note that we need only clear buffers with a transfer size
974 * equal to the block size because buffers with a shorter
975 * transfer size were cleared above by the call to UFS_BALLOC()
976 * with the BA_CLRBUF flag set.
977 *
978 * If the source region for uiomove identically mmaps the
979 * buffer, uiomove() performed the NOP copy, and the buffer
980 * content remains valid because the page fault handler
981 * validated the pages.
982 */
983 if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
984 fs->fs_bsize == xfersize) {
985 if (error == EFAULT && LIST_EMPTY(&bp->b_dep)) {
986 bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
987 brelse(bp);
988 break;
989 } else {
990 vfs_bio_clrbuf(bp);
991 }
992 }
993
994 vfs_bio_set_flags(bp, ioflag);
995
996 /*
997 * If IO_SYNC each buffer is written synchronously. Otherwise
998 * if we have a severe page deficiency write the buffer
999 * asynchronously. Otherwise try to cluster, and if that
1000 * doesn't do it then either do an async write (if O_DIRECT),
1001 * or a delayed write (if not).
1002 */
1003 if (ioflag & IO_SYNC) {
1004 (void)bwrite(bp);
1005 } else if (vm_page_count_severe() ||
1006 buf_dirty_count_severe() ||
1007 (ioflag & IO_ASYNC)) {
1008 bp->b_flags |= B_CLUSTEROK;
1009 bawrite(bp);
1010 } else if (xfersize + blkoffset == fs->fs_bsize) {
1011 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
1012 bp->b_flags |= B_CLUSTEROK;
1013 cluster_write(vp, &ip->i_clusterw, bp,
1014 ip->i_size, seqcount, GB_UNMAPPED);
1015 } else {
1016 bawrite(bp);
1017 }
1018 } else if (ioflag & IO_DIRECT) {
1019 bp->b_flags |= B_CLUSTEROK;
1020 bawrite(bp);
1021 } else {
1022 bp->b_flags |= B_CLUSTEROK;
1023 bdwrite(bp);
1024 }
1025 if (error || xfersize == 0)
1026 break;
1027 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1028 }
1029 /*
1030 * If we successfully wrote any data, and we are not the superuser
1031 * we clear the setuid and setgid bits as a precaution against
1032 * tampering.
1033 */
1034 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
1035 ap->a_cred) {
1036 if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
1037 vn_seqc_write_begin(vp);
1038 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1039 DIP_SET(ip, i_mode, ip->i_mode);
1040 vn_seqc_write_end(vp);
1041 }
1042 }
1043 if (error) {
1044 if (ioflag & IO_UNIT) {
1045 (void)ffs_truncate(vp, osize,
1046 IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
1047 uio->uio_offset -= resid - uio->uio_resid;
1048 uio->uio_resid = resid;
1049 }
1050 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
1051 if (!(ioflag & IO_DATASYNC) ||
1052 (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)))
1053 error = ffs_update(vp, 1);
1054 if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
1055 error = ENXIO;
1056 }
1057 vn_rlimit_fsizex_res(uio, r);
1058 return (error);
1059 }
1060
1061 /*
1062 * Extended attribute area reading.
1063 */
1064 static int
ffs_extread(struct vnode * vp,struct uio * uio,int ioflag)1065 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1066 {
1067 struct inode *ip;
1068 struct ufs2_dinode *dp;
1069 struct fs *fs;
1070 struct buf *bp;
1071 ufs_lbn_t lbn, nextlbn;
1072 off_t bytesinfile;
1073 long size, xfersize, blkoffset;
1074 ssize_t orig_resid;
1075 int error;
1076
1077 ip = VTOI(vp);
1078 fs = ITOFS(ip);
1079 dp = ip->i_din2;
1080
1081 #ifdef INVARIANTS
1082 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1083 panic("ffs_extread: mode");
1084
1085 #endif
1086 orig_resid = uio->uio_resid;
1087 KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
1088 if (orig_resid == 0)
1089 return (0);
1090 KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
1091
1092 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1093 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1094 break;
1095 lbn = lblkno(fs, uio->uio_offset);
1096 nextlbn = lbn + 1;
1097
1098 /*
1099 * size of buffer. The buffer representing the
1100 * end of the file is rounded up to the size of
1101 * the block type ( fragment or full block,
1102 * depending ).
1103 */
1104 size = sblksize(fs, dp->di_extsize, lbn);
1105 blkoffset = blkoff(fs, uio->uio_offset);
1106
1107 /*
1108 * The amount we want to transfer in this iteration is
1109 * one FS block less the amount of the data before
1110 * our startpoint (duh!)
1111 */
1112 xfersize = fs->fs_bsize - blkoffset;
1113
1114 /*
1115 * But if we actually want less than the block,
1116 * or the file doesn't have a whole block more of data,
1117 * then use the lesser number.
1118 */
1119 if (uio->uio_resid < xfersize)
1120 xfersize = uio->uio_resid;
1121 if (bytesinfile < xfersize)
1122 xfersize = bytesinfile;
1123
1124 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1125 /*
1126 * Don't do readahead if this is the end of the info.
1127 */
1128 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1129 } else {
1130 /*
1131 * If we have a second block, then
1132 * fire off a request for a readahead
1133 * as well as a read. Note that the 4th and 5th
1134 * arguments point to arrays of the size specified in
1135 * the 6th argument.
1136 */
1137 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1138 nextlbn = -1 - nextlbn;
1139 error = breadn(vp, -1 - lbn,
1140 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1141 }
1142 if (error) {
1143 brelse(bp);
1144 bp = NULL;
1145 break;
1146 }
1147
1148 /*
1149 * We should only get non-zero b_resid when an I/O error
1150 * has occurred, which should cause us to break above.
1151 * However, if the short read did not cause an error,
1152 * then we want to ensure that we do not uiomove bad
1153 * or uninitialized data.
1154 */
1155 size -= bp->b_resid;
1156 if (size < xfersize) {
1157 if (size == 0)
1158 break;
1159 xfersize = size;
1160 }
1161
1162 error = uiomove((char *)bp->b_data + blkoffset,
1163 (int)xfersize, uio);
1164 if (error)
1165 break;
1166 vfs_bio_brelse(bp, ioflag);
1167 }
1168
1169 /*
1170 * This can only happen in the case of an error
1171 * because the loop above resets bp to NULL on each iteration
1172 * and on normal completion has not set a new value into it.
1173 * so it must have come from a 'break' statement
1174 */
1175 if (bp != NULL)
1176 vfs_bio_brelse(bp, ioflag);
1177 return (error);
1178 }
1179
1180 /*
1181 * Extended attribute area writing.
1182 */
1183 static int
ffs_extwrite(struct vnode * vp,struct uio * uio,int ioflag,struct ucred * ucred)1184 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1185 {
1186 struct inode *ip;
1187 struct ufs2_dinode *dp;
1188 struct fs *fs;
1189 struct buf *bp;
1190 ufs_lbn_t lbn;
1191 off_t osize;
1192 ssize_t resid;
1193 int blkoffset, error, flags, size, xfersize;
1194
1195 ip = VTOI(vp);
1196 fs = ITOFS(ip);
1197 dp = ip->i_din2;
1198
1199 #ifdef INVARIANTS
1200 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1201 panic("ffs_extwrite: mode");
1202 #endif
1203
1204 if (ioflag & IO_APPEND)
1205 uio->uio_offset = dp->di_extsize;
1206 KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1207 KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1208 if ((uoff_t)uio->uio_offset + uio->uio_resid >
1209 UFS_NXADDR * fs->fs_bsize)
1210 return (EFBIG);
1211
1212 resid = uio->uio_resid;
1213 osize = dp->di_extsize;
1214 flags = IO_EXT;
1215 if (ioflag & IO_SYNC)
1216 flags |= IO_SYNC;
1217
1218 for (error = 0; uio->uio_resid > 0;) {
1219 lbn = lblkno(fs, uio->uio_offset);
1220 blkoffset = blkoff(fs, uio->uio_offset);
1221 xfersize = fs->fs_bsize - blkoffset;
1222 if (uio->uio_resid < xfersize)
1223 xfersize = uio->uio_resid;
1224
1225 /*
1226 * We must perform a read-before-write if the transfer size
1227 * does not cover the entire buffer.
1228 */
1229 if (fs->fs_bsize > xfersize)
1230 flags |= BA_CLRBUF;
1231 else
1232 flags &= ~BA_CLRBUF;
1233 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1234 ucred, flags, &bp);
1235 if (error != 0)
1236 break;
1237 /*
1238 * If the buffer is not valid we have to clear out any
1239 * garbage data from the pages instantiated for the buffer.
1240 * If we do not, a failed uiomove() during a write can leave
1241 * the prior contents of the pages exposed to a userland
1242 * mmap(). XXX deal with uiomove() errors a better way.
1243 */
1244 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1245 vfs_bio_clrbuf(bp);
1246
1247 if (uio->uio_offset + xfersize > dp->di_extsize) {
1248 dp->di_extsize = uio->uio_offset + xfersize;
1249 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1250 }
1251
1252 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1253 if (size < xfersize)
1254 xfersize = size;
1255
1256 error =
1257 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1258
1259 vfs_bio_set_flags(bp, ioflag);
1260
1261 /*
1262 * If IO_SYNC each buffer is written synchronously. Otherwise
1263 * if we have a severe page deficiency write the buffer
1264 * asynchronously. Otherwise try to cluster, and if that
1265 * doesn't do it then either do an async write (if O_DIRECT),
1266 * or a delayed write (if not).
1267 */
1268 if (ioflag & IO_SYNC) {
1269 (void)bwrite(bp);
1270 } else if (vm_page_count_severe() ||
1271 buf_dirty_count_severe() ||
1272 xfersize + blkoffset == fs->fs_bsize ||
1273 (ioflag & (IO_ASYNC | IO_DIRECT)))
1274 bawrite(bp);
1275 else
1276 bdwrite(bp);
1277 if (error || xfersize == 0)
1278 break;
1279 UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1280 }
1281 /*
1282 * If we successfully wrote any data, and we are not the superuser
1283 * we clear the setuid and setgid bits as a precaution against
1284 * tampering.
1285 */
1286 if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1287 if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1288 vn_seqc_write_begin(vp);
1289 UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1290 dp->di_mode = ip->i_mode;
1291 vn_seqc_write_end(vp);
1292 }
1293 }
1294 if (error) {
1295 if (ioflag & IO_UNIT) {
1296 (void)ffs_truncate(vp, osize,
1297 IO_EXT | (ioflag&IO_SYNC), ucred);
1298 uio->uio_offset -= resid - uio->uio_resid;
1299 uio->uio_resid = resid;
1300 }
1301 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1302 error = ffs_update(vp, 1);
1303 return (error);
1304 }
1305
1306 /*
1307 * Vnode operating to retrieve a named extended attribute.
1308 *
1309 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1310 * the length of the EA, and possibly the pointer to the entry and to the data.
1311 */
1312 static int
ffs_findextattr(uint8_t * ptr,uint64_t length,int nspace,const char * name,struct extattr ** eapp,uint8_t ** eac)1313 ffs_findextattr(uint8_t *ptr, uint64_t length, int nspace, const char *name,
1314 struct extattr **eapp, uint8_t **eac)
1315 {
1316 struct extattr *eap, *eaend;
1317 size_t nlen;
1318
1319 nlen = strlen(name);
1320 KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1321 eap = (struct extattr *)ptr;
1322 eaend = (struct extattr *)(ptr + length);
1323 for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1324 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1325 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1326 if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1327 || memcmp(eap->ea_name, name, nlen) != 0)
1328 continue;
1329 if (eapp != NULL)
1330 *eapp = eap;
1331 if (eac != NULL)
1332 *eac = EXTATTR_CONTENT(eap);
1333 return (EXTATTR_CONTENT_SIZE(eap));
1334 }
1335 return (-1);
1336 }
1337
1338 static int
ffs_rdextattr(uint8_t ** p,struct vnode * vp,struct thread * td)1339 ffs_rdextattr(uint8_t **p, struct vnode *vp, struct thread *td)
1340 {
1341 const struct extattr *eap, *eaend, *eapnext;
1342 struct inode *ip;
1343 struct ufs2_dinode *dp;
1344 struct fs *fs;
1345 struct uio luio;
1346 struct iovec liovec;
1347 uint64_t easize;
1348 int error;
1349 uint8_t *eae;
1350
1351 ip = VTOI(vp);
1352 fs = ITOFS(ip);
1353 dp = ip->i_din2;
1354 easize = dp->di_extsize;
1355 if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1356 return (EFBIG);
1357
1358 eae = malloc(easize, M_TEMP, M_WAITOK);
1359
1360 liovec.iov_base = eae;
1361 liovec.iov_len = easize;
1362 luio.uio_iov = &liovec;
1363 luio.uio_iovcnt = 1;
1364 luio.uio_offset = 0;
1365 luio.uio_resid = easize;
1366 luio.uio_segflg = UIO_SYSSPACE;
1367 luio.uio_rw = UIO_READ;
1368 luio.uio_td = td;
1369
1370 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1371 if (error) {
1372 free(eae, M_TEMP);
1373 return (error);
1374 }
1375 /* Validate disk xattrfile contents. */
1376 for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1377 eap = eapnext) {
1378 /* Detect zeroed out tail */
1379 if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) {
1380 easize = (const uint8_t *)eap - eae;
1381 break;
1382 }
1383
1384 eapnext = EXTATTR_NEXT(eap);
1385 /* Bogusly long entry. */
1386 if (eapnext > eaend) {
1387 free(eae, M_TEMP);
1388 return (EINTEGRITY);
1389 }
1390 }
1391 ip->i_ea_len = easize;
1392 *p = eae;
1393 return (0);
1394 }
1395
1396 static void
ffs_lock_ea(struct vnode * vp)1397 ffs_lock_ea(struct vnode *vp)
1398 {
1399 struct inode *ip;
1400
1401 ip = VTOI(vp);
1402 VI_LOCK(vp);
1403 while (ip->i_flag & IN_EA_LOCKED) {
1404 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1405 msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1406 0);
1407 }
1408 UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1409 VI_UNLOCK(vp);
1410 }
1411
1412 static void
ffs_unlock_ea(struct vnode * vp)1413 ffs_unlock_ea(struct vnode *vp)
1414 {
1415 struct inode *ip;
1416
1417 ip = VTOI(vp);
1418 VI_LOCK(vp);
1419 if (ip->i_flag & IN_EA_LOCKWAIT)
1420 wakeup(&ip->i_ea_refs);
1421 ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1422 VI_UNLOCK(vp);
1423 }
1424
1425 static int
ffs_open_ea(struct vnode * vp,struct ucred * cred,struct thread * td)1426 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1427 {
1428 struct inode *ip;
1429 int error;
1430
1431 ip = VTOI(vp);
1432
1433 ffs_lock_ea(vp);
1434 if (ip->i_ea_area != NULL) {
1435 ip->i_ea_refs++;
1436 ffs_unlock_ea(vp);
1437 return (0);
1438 }
1439 error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1440 if (error) {
1441 ffs_unlock_ea(vp);
1442 return (error);
1443 }
1444 ip->i_ea_error = 0;
1445 ip->i_ea_refs++;
1446 ffs_unlock_ea(vp);
1447 return (0);
1448 }
1449
1450 /*
1451 * Vnode extattr transaction commit/abort
1452 */
1453 static int
ffs_close_ea(struct vnode * vp,int commit,struct ucred * cred,struct thread * td)1454 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1455 {
1456 struct inode *ip;
1457 struct uio luio;
1458 struct iovec *liovec;
1459 struct ufs2_dinode *dp;
1460 size_t ea_len, tlen;
1461 int error, i, lcnt;
1462 bool truncate;
1463
1464 ip = VTOI(vp);
1465
1466 ffs_lock_ea(vp);
1467 if (ip->i_ea_area == NULL) {
1468 ffs_unlock_ea(vp);
1469 return (EINVAL);
1470 }
1471 dp = ip->i_din2;
1472 error = ip->i_ea_error;
1473 truncate = false;
1474 if (commit && error == 0) {
1475 ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1476 if (cred == NOCRED)
1477 cred = vp->v_mount->mnt_cred;
1478
1479 ea_len = MAX(ip->i_ea_len, dp->di_extsize);
1480 for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) {
1481 tlen -= MIN(ZERO_REGION_SIZE, tlen);
1482 lcnt++;
1483 }
1484
1485 liovec = __builtin_alloca(lcnt * sizeof(struct iovec));
1486 luio.uio_iovcnt = lcnt;
1487
1488 liovec[0].iov_base = ip->i_ea_area;
1489 liovec[0].iov_len = ip->i_ea_len;
1490 for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) {
1491 liovec[i].iov_base = __DECONST(void *, zero_region);
1492 liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen);
1493 tlen -= liovec[i].iov_len;
1494 }
1495 MPASS(tlen == 0);
1496
1497 luio.uio_iov = liovec;
1498 luio.uio_offset = 0;
1499 luio.uio_resid = ea_len;
1500 luio.uio_segflg = UIO_SYSSPACE;
1501 luio.uio_rw = UIO_WRITE;
1502 luio.uio_td = td;
1503 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1504 if (error == 0 && ip->i_ea_len == 0)
1505 truncate = true;
1506 }
1507 if (--ip->i_ea_refs == 0) {
1508 free(ip->i_ea_area, M_TEMP);
1509 ip->i_ea_area = NULL;
1510 ip->i_ea_len = 0;
1511 ip->i_ea_error = 0;
1512 }
1513 ffs_unlock_ea(vp);
1514
1515 if (truncate)
1516 ffs_truncate(vp, 0, IO_EXT, cred);
1517 return (error);
1518 }
1519
1520 /*
1521 * Vnode extattr strategy routine for fifos.
1522 *
1523 * We need to check for a read or write of the external attributes.
1524 * Otherwise we just fall through and do the usual thing.
1525 */
1526 static int
ffsext_strategy(struct vop_strategy_args * ap)1527 ffsext_strategy(
1528 struct vop_strategy_args /* {
1529 struct vnodeop_desc *a_desc;
1530 struct vnode *a_vp;
1531 struct buf *a_bp;
1532 } */ *ap)
1533 {
1534 struct vnode *vp;
1535 daddr_t lbn;
1536
1537 vp = ap->a_vp;
1538 lbn = ap->a_bp->b_lblkno;
1539 if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1540 return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1541 if (vp->v_type == VFIFO)
1542 return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1543 panic("spec nodes went here");
1544 }
1545
1546 /*
1547 * Vnode extattr transaction commit/abort
1548 */
1549 static int
ffs_openextattr(struct vop_openextattr_args * ap)1550 ffs_openextattr(
1551 struct vop_openextattr_args /* {
1552 struct vnodeop_desc *a_desc;
1553 struct vnode *a_vp;
1554 IN struct ucred *a_cred;
1555 IN struct thread *a_td;
1556 } */ *ap)
1557 {
1558
1559 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1560 return (EOPNOTSUPP);
1561
1562 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1563 }
1564
1565 /*
1566 * Vnode extattr transaction commit/abort
1567 */
1568 static int
ffs_closeextattr(struct vop_closeextattr_args * ap)1569 ffs_closeextattr(
1570 struct vop_closeextattr_args /* {
1571 struct vnodeop_desc *a_desc;
1572 struct vnode *a_vp;
1573 int a_commit;
1574 IN struct ucred *a_cred;
1575 IN struct thread *a_td;
1576 } */ *ap)
1577 {
1578 struct vnode *vp;
1579
1580 vp = ap->a_vp;
1581 if (vp->v_type == VCHR || vp->v_type == VBLK)
1582 return (EOPNOTSUPP);
1583 if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
1584 return (EROFS);
1585
1586 if (ap->a_commit && DOINGSUJ(vp)) {
1587 ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit");
1588 softdep_prealloc(vp, MNT_WAIT);
1589 if (vp->v_data == NULL)
1590 return (EBADF);
1591 }
1592 return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td));
1593 }
1594
1595 /*
1596 * Vnode operation to remove a named attribute.
1597 */
1598 static int
ffs_deleteextattr(struct vop_deleteextattr_args * ap)1599 ffs_deleteextattr(
1600 struct vop_deleteextattr_args /* {
1601 IN struct vnode *a_vp;
1602 IN int a_attrnamespace;
1603 IN const char *a_name;
1604 IN struct ucred *a_cred;
1605 IN struct thread *a_td;
1606 } */ *ap)
1607 {
1608 struct vnode *vp;
1609 struct inode *ip;
1610 struct extattr *eap;
1611 uint32_t ul;
1612 int olen, error, i, easize;
1613 uint8_t *eae;
1614 void *tmp;
1615
1616 vp = ap->a_vp;
1617 ip = VTOI(vp);
1618
1619 if (vp->v_type == VCHR || vp->v_type == VBLK)
1620 return (EOPNOTSUPP);
1621 if (strlen(ap->a_name) == 0)
1622 return (EINVAL);
1623 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1624 return (EROFS);
1625
1626 error = extattr_check_cred(vp, ap->a_attrnamespace,
1627 ap->a_cred, ap->a_td, VWRITE);
1628 if (error) {
1629 /*
1630 * ffs_lock_ea is not needed there, because the vnode
1631 * must be exclusively locked.
1632 */
1633 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1634 ip->i_ea_error = error;
1635 return (error);
1636 }
1637
1638 if (DOINGSUJ(vp)) {
1639 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1640 softdep_prealloc(vp, MNT_WAIT);
1641 if (vp->v_data == NULL)
1642 return (EBADF);
1643 }
1644
1645 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1646 if (error)
1647 return (error);
1648
1649 /* CEM: delete could be done in-place instead */
1650 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1651 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1652 easize = ip->i_ea_len;
1653
1654 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1655 &eap, NULL);
1656 if (olen == -1) {
1657 /* delete but nonexistent */
1658 free(eae, M_TEMP);
1659 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1660 return (ENOATTR);
1661 }
1662 ul = eap->ea_length;
1663 i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1664 bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1665 easize -= ul;
1666
1667 tmp = ip->i_ea_area;
1668 ip->i_ea_area = eae;
1669 ip->i_ea_len = easize;
1670 free(tmp, M_TEMP);
1671 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1672 return (error);
1673 }
1674
1675 /*
1676 * Vnode operation to retrieve a named extended attribute.
1677 */
1678 static int
ffs_getextattr(struct vop_getextattr_args * ap)1679 ffs_getextattr(
1680 struct vop_getextattr_args /* {
1681 IN struct vnode *a_vp;
1682 IN int a_attrnamespace;
1683 IN const char *a_name;
1684 INOUT struct uio *a_uio;
1685 OUT size_t *a_size;
1686 IN struct ucred *a_cred;
1687 IN struct thread *a_td;
1688 } */ *ap)
1689 {
1690 struct inode *ip;
1691 uint8_t *eae, *p;
1692 unsigned easize;
1693 int error, ealen;
1694
1695 ip = VTOI(ap->a_vp);
1696
1697 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1698 return (EOPNOTSUPP);
1699
1700 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1701 ap->a_cred, ap->a_td, VREAD);
1702 if (error)
1703 return (error);
1704
1705 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1706 if (error)
1707 return (error);
1708
1709 eae = ip->i_ea_area;
1710 easize = ip->i_ea_len;
1711
1712 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1713 NULL, &p);
1714 if (ealen >= 0) {
1715 error = 0;
1716 if (ap->a_size != NULL)
1717 *ap->a_size = ealen;
1718 else if (ap->a_uio != NULL)
1719 error = uiomove(p, ealen, ap->a_uio);
1720 } else
1721 error = ENOATTR;
1722
1723 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1724 return (error);
1725 }
1726
1727 /*
1728 * Vnode operation to retrieve extended attributes on a vnode.
1729 */
1730 static int
ffs_listextattr(struct vop_listextattr_args * ap)1731 ffs_listextattr(
1732 struct vop_listextattr_args /* {
1733 IN struct vnode *a_vp;
1734 IN int a_attrnamespace;
1735 INOUT struct uio *a_uio;
1736 OUT size_t *a_size;
1737 IN struct ucred *a_cred;
1738 IN struct thread *a_td;
1739 } */ *ap)
1740 {
1741 struct inode *ip;
1742 struct extattr *eap, *eaend;
1743 int error, ealen;
1744
1745 ip = VTOI(ap->a_vp);
1746
1747 if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1748 return (EOPNOTSUPP);
1749
1750 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1751 ap->a_cred, ap->a_td, VREAD);
1752 if (error)
1753 return (error);
1754
1755 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1756 if (error)
1757 return (error);
1758
1759 error = 0;
1760 if (ap->a_size != NULL)
1761 *ap->a_size = 0;
1762
1763 KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1764 eap = (struct extattr *)ip->i_ea_area;
1765 eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1766 for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1767 KASSERT(EXTATTR_NEXT(eap) <= eaend,
1768 ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1769 if (eap->ea_namespace != ap->a_attrnamespace)
1770 continue;
1771
1772 ealen = eap->ea_namelength;
1773 if (ap->a_size != NULL)
1774 *ap->a_size += ealen + 1;
1775 else if (ap->a_uio != NULL)
1776 error = uiomove(&eap->ea_namelength, ealen + 1,
1777 ap->a_uio);
1778 }
1779
1780 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1781 return (error);
1782 }
1783
1784 /*
1785 * Vnode operation to set a named attribute.
1786 */
1787 static int
ffs_setextattr(struct vop_setextattr_args * ap)1788 ffs_setextattr(
1789 struct vop_setextattr_args /* {
1790 IN struct vnode *a_vp;
1791 IN int a_attrnamespace;
1792 IN const char *a_name;
1793 INOUT struct uio *a_uio;
1794 IN struct ucred *a_cred;
1795 IN struct thread *a_td;
1796 } */ *ap)
1797 {
1798 struct vnode *vp;
1799 struct inode *ip;
1800 struct fs *fs;
1801 struct extattr *eap;
1802 uint32_t ealength, ul;
1803 ssize_t ealen;
1804 int olen, eapad1, eapad2, error, i, easize;
1805 uint8_t *eae;
1806 void *tmp;
1807
1808 vp = ap->a_vp;
1809 ip = VTOI(vp);
1810 fs = ITOFS(ip);
1811
1812 if (vp->v_type == VCHR || vp->v_type == VBLK)
1813 return (EOPNOTSUPP);
1814 if (strlen(ap->a_name) == 0)
1815 return (EINVAL);
1816
1817 /* XXX Now unsupported API to delete EAs using NULL uio. */
1818 if (ap->a_uio == NULL)
1819 return (EOPNOTSUPP);
1820
1821 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1822 return (EROFS);
1823
1824 ealen = ap->a_uio->uio_resid;
1825 if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1826 return (EINVAL);
1827
1828 error = extattr_check_cred(vp, ap->a_attrnamespace,
1829 ap->a_cred, ap->a_td, VWRITE);
1830 if (error) {
1831 /*
1832 * ffs_lock_ea is not needed there, because the vnode
1833 * must be exclusively locked.
1834 */
1835 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1836 ip->i_ea_error = error;
1837 return (error);
1838 }
1839
1840 if (DOINGSUJ(vp)) {
1841 ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1842 softdep_prealloc(vp, MNT_WAIT);
1843 if (vp->v_data == NULL)
1844 return (EBADF);
1845 }
1846
1847 error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1848 if (error)
1849 return (error);
1850
1851 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1852 eapad1 = roundup2(ealength, 8) - ealength;
1853 eapad2 = roundup2(ealen, 8) - ealen;
1854 ealength += eapad1 + ealen + eapad2;
1855
1856 /*
1857 * CEM: rewrites of the same size or smaller could be done in-place
1858 * instead. (We don't acquire any fine-grained locks in here either,
1859 * so we could also do bigger writes in-place.)
1860 */
1861 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1862 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1863 easize = ip->i_ea_len;
1864
1865 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1866 &eap, NULL);
1867 if (olen == -1) {
1868 /* new, append at end */
1869 KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1870 ("unaligned"));
1871 eap = (struct extattr *)(eae + easize);
1872 easize += ealength;
1873 } else {
1874 ul = eap->ea_length;
1875 i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1876 if (ul != ealength) {
1877 bcopy(EXTATTR_NEXT(eap), (uint8_t *)eap + ealength,
1878 easize - i);
1879 easize += (ealength - ul);
1880 }
1881 }
1882 if (easize > lblktosize(fs, UFS_NXADDR)) {
1883 free(eae, M_TEMP);
1884 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1885 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1886 ip->i_ea_error = ENOSPC;
1887 return (ENOSPC);
1888 }
1889 eap->ea_length = ealength;
1890 eap->ea_namespace = ap->a_attrnamespace;
1891 eap->ea_contentpadlen = eapad2;
1892 eap->ea_namelength = strlen(ap->a_name);
1893 memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1894 bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1895 error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1896 if (error) {
1897 free(eae, M_TEMP);
1898 ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1899 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1900 ip->i_ea_error = error;
1901 return (error);
1902 }
1903 bzero((uint8_t *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1904
1905 tmp = ip->i_ea_area;
1906 ip->i_ea_area = eae;
1907 ip->i_ea_len = easize;
1908 free(tmp, M_TEMP);
1909 error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1910 return (error);
1911 }
1912
1913 /*
1914 * Vnode pointer to File handle
1915 */
1916 static int
ffs_vptofh(struct vop_vptofh_args * ap)1917 ffs_vptofh(
1918 struct vop_vptofh_args /* {
1919 IN struct vnode *a_vp;
1920 IN struct fid *a_fhp;
1921 } */ *ap)
1922 {
1923 struct inode *ip;
1924 struct ufid *ufhp;
1925
1926 ip = VTOI(ap->a_vp);
1927 ufhp = (struct ufid *)ap->a_fhp;
1928 ufhp->ufid_len = sizeof(struct ufid);
1929 ufhp->ufid_ino = ip->i_number;
1930 ufhp->ufid_gen = ip->i_gen;
1931 return (0);
1932 }
1933
1934 SYSCTL_DECL(_vfs_ffs);
1935 static int use_buf_pager = 1;
1936 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1937 "Always use buffer pager instead of bmap");
1938
1939 static daddr_t
ffs_gbp_getblkno(struct vnode * vp,vm_ooffset_t off)1940 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1941 {
1942
1943 return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1944 }
1945
1946 static int
ffs_gbp_getblksz(struct vnode * vp,daddr_t lbn,long * sz)1947 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
1948 {
1949
1950 *sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
1951 return (0);
1952 }
1953
1954 static int
ffs_getpages(struct vop_getpages_args * ap)1955 ffs_getpages(struct vop_getpages_args *ap)
1956 {
1957 struct vnode *vp;
1958 struct ufsmount *um;
1959
1960 vp = ap->a_vp;
1961 um = VFSTOUFS(vp->v_mount);
1962
1963 if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1964 return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1965 ap->a_rbehind, ap->a_rahead, NULL, NULL));
1966 return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1967 ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1968 }
1969
1970 static int
ffs_getpages_async(struct vop_getpages_async_args * ap)1971 ffs_getpages_async(struct vop_getpages_async_args *ap)
1972 {
1973 struct vnode *vp;
1974 struct ufsmount *um;
1975 bool do_iodone;
1976 int error;
1977
1978 vp = ap->a_vp;
1979 um = VFSTOUFS(vp->v_mount);
1980 do_iodone = true;
1981
1982 if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1983 error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1984 ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1985 if (error == 0)
1986 do_iodone = false;
1987 } else {
1988 error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1989 ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1990 ffs_gbp_getblksz);
1991 }
1992 if (do_iodone && ap->a_iodone != NULL)
1993 ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1994
1995 return (error);
1996 }
1997
1998 static int
ffs_vput_pair(struct vop_vput_pair_args * ap)1999 ffs_vput_pair(struct vop_vput_pair_args *ap)
2000 {
2001 struct mount *mp;
2002 struct vnode *dvp, *vp, *vp1, **vpp;
2003 struct inode *dp, *ip;
2004 ino_t ip_ino;
2005 uint64_t ip_gen;
2006 int error, vp_locked;
2007
2008 dvp = ap->a_dvp;
2009 dp = VTOI(dvp);
2010 vpp = ap->a_vpp;
2011 vp = vpp != NULL ? *vpp : NULL;
2012
2013 if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) {
2014 vput(dvp);
2015 if (vp != NULL && ap->a_unlock_vp)
2016 vput(vp);
2017 return (0);
2018 }
2019
2020 mp = dvp->v_mount;
2021 if (vp != NULL) {
2022 if (ap->a_unlock_vp) {
2023 vput(vp);
2024 } else {
2025 MPASS(vp->v_type != VNON);
2026 vp_locked = VOP_ISLOCKED(vp);
2027 ip = VTOI(vp);
2028 ip_ino = ip->i_number;
2029 ip_gen = ip->i_gen;
2030 VOP_UNLOCK(vp);
2031 }
2032 }
2033
2034 /*
2035 * If compaction or fsync was requested do it in ffs_vput_pair()
2036 * now that other locks are no longer held.
2037 */
2038 if ((dp->i_flag & IN_ENDOFF) != 0) {
2039 VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp,
2040 ("IN_ENDOFF set but I_ENDOFF() is not"));
2041 dp->i_flag &= ~IN_ENDOFF;
2042 error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL |
2043 (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred);
2044 if (error != 0 && error != ERELOOKUP) {
2045 if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) {
2046 vn_printf(dvp,
2047 "IN_ENDOFF: failed to truncate, "
2048 "error %d\n", error);
2049 }
2050 #ifdef UFS_DIRHASH
2051 ufsdirhash_free(dp);
2052 #endif
2053 }
2054 SET_I_ENDOFF(dp, 0);
2055 }
2056 if ((dp->i_flag & IN_NEEDSYNC) != 0) {
2057 do {
2058 error = ffs_syncvnode(dvp, MNT_WAIT, 0);
2059 } while (error == ERELOOKUP);
2060 }
2061
2062 vput(dvp);
2063
2064 if (vp == NULL || ap->a_unlock_vp)
2065 return (0);
2066 MPASS(mp != NULL);
2067
2068 /*
2069 * It is possible that vp is reclaimed at this point. Only
2070 * routines that call us with a_unlock_vp == false can find
2071 * that their vp has been reclaimed. There are three areas
2072 * that are affected:
2073 * 1) vn_open_cred() - later VOPs could fail, but
2074 * dead_open() returns 0 to simulate successful open.
2075 * 2) ffs_snapshot() - creation of snapshot fails with EBADF.
2076 * 3) NFS server (several places) - code is prepared to detect
2077 * and respond to dead vnodes by returning ESTALE.
2078 */
2079 VOP_LOCK(vp, vp_locked | LK_RETRY);
2080 if (IS_UFS(vp))
2081 return (0);
2082
2083 /*
2084 * Try harder to recover from reclaimed vp if reclaim was not
2085 * because underlying inode was cleared. We saved inode
2086 * number and inode generation, so we can try to reinstantiate
2087 * exactly same version of inode. If this fails, return
2088 * original doomed vnode and let caller to handle
2089 * consequences.
2090 *
2091 * Note that callers must keep write started around
2092 * VOP_VPUT_PAIR() calls, so it is safe to use mp without
2093 * busying it.
2094 */
2095 VOP_UNLOCK(vp);
2096 error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1,
2097 FFSV_REPLACE_DOOMED);
2098 if (error != 0) {
2099 VOP_LOCK(vp, vp_locked | LK_RETRY);
2100 } else {
2101 vrele(vp);
2102 *vpp = vp1;
2103 }
2104 return (error);
2105 }
2106