1/*
2 * Copyright (c) 2019-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62#include <sys/cdefs.h>
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/kernel.h>
66#include <sys/lock.h>
67#include <sys/mount.h>
68#include <sys/proc.h>
69#include <sys/signalvar.h>
70#include <sys/unistd.h>
71#include <sys/user.h>
72#include <sys/vnode.h>
73#include <sys/vnode_internal.h>
74#include <sys/vnode_if.h>
75#include <sys/malloc.h>
76#include <sys/fcntl.h>
77#include <sys/lockf.h>
78#include <sys/sdt.h>
79#include <kern/policy_internal.h>
80
81#include <sys/file_internal.h>
82
83#if (DEVELOPMENT || DEBUG)
84#define LOCKF_DEBUGGING 1
85#endif
86
87#ifdef LOCKF_DEBUGGING
88#include <sys/sysctl.h>
89void lf_print(const char *tag, struct lockf *lock);
90void lf_printlist(const char *tag, struct lockf *lock);
91
92#define LF_DBG_LOCKOP (1 << 0) /* setlk, getlk, clearlk */
93#define LF_DBG_LIST (1 << 1) /* split, coalesce */
94#define LF_DBG_IMPINH (1 << 2) /* importance inheritance */
95#define LF_DBG_TRACE (1 << 3) /* errors, exit */
96#define LF_DBG_DEADLOCK (1 << 4) /* deadlock detection */
97
98static int lockf_debug = 0; /* was 2, could be 3 ;-) */
99SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
100
101/*
102 * If the selector is set, then output the debugging diagnostic.
103 */
104#define LOCKF_DEBUG(mask, ...) \
105 do { \
106 if ((mask) & lockf_debug) { \
107 printf("%s>", __FUNCTION__); \
108 printf(__VA_ARGS__); \
109 } \
110 } while(0)
111
112#define LOCKF_DEBUGP(mask) \
113 ({ \
114 ((mask) & lockf_debug); \
115 })
116#else /* !LOCKF_DEBUGGING */
117#define LOCKF_DEBUG(mask, ...) /* mask */
118#endif /* !LOCKF_DEBUGGING */
119
120KALLOC_TYPE_DEFINE(KT_LOCKF, struct lockf, KT_PRIV_ACCT);
121
122#define NOLOCKF (struct lockf *)0
123#define SELF 0x1
124#define OTHERS 0x2
125#define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
126
127/* return the effective end of a 'struct lockf': lf_end == -1 is OFF_MAX */
128#define LF_END(l) ((l)->lf_end == -1 ? OFF_MAX : (l)->lf_end)
129
130/*
131 * Overlapping lock states
132 *
 * For lf_findoverlap(..., SELF, ...), the possible sequences are a single:
134 * - OVERLAP_NONE,
135 * - OVERLAP_EQUALS_LOCK, or
136 * - OVERLAP_CONTAINS_LOCK
137 *
138 * or the following sequence:
139 * - optional OVERLAP_STARTS_BEFORE_LOCK
140 * - zero or more OVERLAP_CONTAINED_BY_LOCK
141 * - optional OVERLAP_ENDS_AFTER_LOCK
142 * - OVERLAP_NONE
143 *
144 * In the annotations:
145 * - the search lock is [SS, SE] and
146 * - the returned overlap lock is [OS,OE].
147 */
148typedef enum {
149 OVERLAP_NONE = 0,
150 OVERLAP_EQUALS_LOCK, /* OS == SS && OE == SE */
151 OVERLAP_CONTAINS_LOCK, /* OS <= SS && OE >= SE */
152 OVERLAP_CONTAINED_BY_LOCK, /* OS >= SS && OE <= SE */
153 OVERLAP_STARTS_BEFORE_LOCK, /* OS < SS && OE >= SS */
154 OVERLAP_ENDS_AFTER_LOCK /* OS > SS && OE > SE */
155} overlap_t;
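
/*
 * Worked example (illustrative values): searching with SELF for the lock
 * [100, 199] against an owner's existing, ordered locks [50, 120],
 * [130, 150] and [180, 250] classifies them in turn as
 * OVERLAP_STARTS_BEFORE_LOCK, OVERLAP_CONTAINED_BY_LOCK and
 * OVERLAP_ENDS_AFTER_LOCK, after which the scan reports OVERLAP_NONE.
 */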
156
157static int lf_clearlock(struct lockf *);
158static int lf_transferlock(struct lockf *);
159static overlap_t lf_findoverlap(struct lockf *,
160 struct lockf *, int, struct lockf ***, struct lockf **);
161static struct lockf *lf_getblock(struct lockf *, pid_t);
162static int lf_getlock(struct lockf *, struct flock *, pid_t);
163static int lf_setlock(struct lockf *, struct timespec *);
164static int lf_split(struct lockf *, struct lockf *);
165static void lf_wakelock(struct lockf *, boolean_t);
166#if IMPORTANCE_INHERITANCE
167static void lf_hold_assertion(task_t, struct lockf *);
168static void lf_jump_to_queue_head(struct lockf *, struct lockf *);
169static void lf_drop_assertion(struct lockf *);
170static void lf_boost_blocking_proc(struct lockf *, struct lockf *);
171static void lf_adjust_assertion(struct lockf *block);
172#endif /* IMPORTANCE_INHERITANCE */
173
174static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock");
175static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp);
176
177/*
178 * lf_advlock
179 *
180 * Description: Advisory record locking support
181 *
182 * Parameters: ap Argument pointer to a vnop_advlock_args
183 * argument descriptor structure for the
184 * lock operation to be attempted.
185 *
186 * Returns: 0 Success
187 * EOVERFLOW
188 * EINVAL
189 * ENOLCK Number of locked regions exceeds limit
190 * lf_setlock:EAGAIN
191 * lf_setlock:EDEADLK
192 * lf_setlock:EINTR
193 * lf_setlock:ENOLCK
194 * lf_setlock:ETIMEDOUT
195 * lf_clearlock:ENOLCK
196 * vnode_size:???
197 *
198 * Notes: We return ENOLCK when we run out of memory to support locks; as
199 * such, there is no specific expectation limit other than the
200 * amount of available resources.
201 */
202int
203lf_advlock(struct vnop_advlock_args *ap)
204{
205 struct vnode *vp = ap->a_vp;
206 struct flock *fl = ap->a_fl;
207 vfs_context_t context = ap->a_context;
208 struct lockf *lock;
209 off_t start, end, oadd;
210 u_quad_t size;
211 int error;
212 struct lockf **head = &vp->v_lockf;
213
214 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
215
216 /*
217 * Avoid the common case of unlocking when inode has no locks.
218 */
219 if (*head == (struct lockf *)0) {
220 if (ap->a_op != F_SETLK) {
221 fl->l_type = F_UNLCK;
222 LOCKF_DEBUG(LF_DBG_TRACE,
223 "lf_advlock: '%s' unlock without lock\n",
224 vfs_context_proc(context)->p_comm);
225 return 0;
226 }
227 }
228
229 /*
230 * Convert the flock structure into a start and end.
231 */
232 switch (fl->l_whence) {
233 case SEEK_SET:
234 case SEEK_CUR:
235 /*
236 * Caller is responsible for adding any necessary offset
237 * when SEEK_CUR is used.
238 */
239 start = fl->l_start;
240 break;
241
242 case SEEK_END:
243
244 /*
		 * It's OK to cast the u_quad_t to an off_t here, since they
246 * are the same storage size, and the value of the returned
247 * contents will never overflow into the sign bit. We need to
248 * do this because we will use size to force range checks.
249 */
250 if ((error = vnode_size(vp, (off_t *)&size, context))) {
251 LOCKF_DEBUG(LF_DBG_TRACE,
252 "lf_advlock: vnode_getattr failed: %d\n", error);
253 return error;
254 }
255
256 if (size > OFF_MAX ||
257 (fl->l_start > 0 &&
258 size > (u_quad_t)(OFF_MAX - fl->l_start))) {
259 return EOVERFLOW;
260 }
261 start = size + fl->l_start;
262 break;
263
264 default:
265 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
266 fl->l_whence);
267 return EINVAL;
268 }
269 if (start < 0) {
270 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
271 start);
272 return EINVAL;
273 }
274 if (fl->l_len < 0) {
275 if (start == 0) {
276 LOCKF_DEBUG(LF_DBG_TRACE,
277 "lf_advlock: len < 0 & start == 0\n");
278 return EINVAL;
279 }
280 end = start - 1;
281 start += fl->l_len;
282 if (start < 0) {
283 LOCKF_DEBUG(LF_DBG_TRACE,
284 "lf_advlock: start < 0 (%qd)\n", start);
285 return EINVAL;
286 }
287 } else if (fl->l_len == 0) {
288 end = -1;
289 } else {
290 oadd = fl->l_len - 1;
291 if (oadd > (off_t)(OFF_MAX - start)) {
292 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
293 return EOVERFLOW;
294 }
295 end = start + oadd;
296 }
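	/*
	 * For example (illustrative values, SEEK_SET): l_start = 100 with
	 * l_len = 50 yields the range [100, 149]; l_len = 0 yields
	 * [100, -1], i.e. through end-of-file (treated as OFF_MAX by
	 * LF_END()); and l_len = -50 locks the region preceding l_start,
	 * yielding [50, 99].
	 */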
297 /*
298 * Create the lockf structure
299 */
300 lock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
301 lock->lf_start = start;
302 lock->lf_end = end;
303 lock->lf_id = ap->a_id;
304 lock->lf_vnode = vp;
305 lock->lf_type = fl->l_type;
306 lock->lf_head = head;
307 lock->lf_next = (struct lockf *)0;
308 TAILQ_INIT(&lock->lf_blkhd);
309 lock->lf_flags = (short)ap->a_flags;
310#if IMPORTANCE_INHERITANCE
311 lock->lf_boosted = LF_NOT_BOOSTED;
312#endif
313 if (ap->a_flags & F_POSIX) {
314 lock->lf_owner = (struct proc *)lock->lf_id;
315 } else {
316 lock->lf_owner = NULL;
317 }
318
319 if (ap->a_flags & F_FLOCK) {
320 lock->lf_flags |= F_WAKE1_SAFE;
321 }
322
	lck_mtx_lock(&vp->v_lock);	/* protect the lockf list */
324 /*
325 * Do the requested operation.
326 */
327 switch (ap->a_op) {
328 case F_SETLK:
329 /*
330 * For OFD locks, lf_id is derived from the fileglob.
331 * Record an "lf_owner" iff this is a confined fd
332 * i.e. it cannot escape this process and will be
333 * F_UNLCKed before the owner exits. (This is
334 * the implicit guarantee needed to ensure lf_owner
335 * remains a valid reference.)
336 */
337 if ((ap->a_flags & F_OFD_LOCK) && (ap->a_flags & F_CONFINED)) {
338 lock->lf_owner = current_proc();
339 }
340 error = lf_setlock(lock, ap->a_timeout);
341 break;
342
343 case F_UNLCK:
344 error = lf_clearlock(lock);
345 zfree(KT_LOCKF, lock);
346 break;
347
348 case F_TRANSFER:
349 /*
350 * The new owner is passed in the context, set the new owner
351 * in the lf_owner field.
352 */
		lock->lf_owner = vfs_context_proc(context);
354 assert(lock->lf_owner != current_proc());
355 error = lf_transferlock(lock);
356 zfree(KT_LOCKF, lock);
357 break;
358
359 case F_GETLK:
360 error = lf_getlock(lock, fl, -1);
361 zfree(KT_LOCKF, lock);
362 break;
363
364 case F_GETLKPID:
365 error = lf_getlock(lock, fl, fl->l_pid);
366 zfree(KT_LOCKF, lock);
367 break;
368
369 default:
370 zfree(KT_LOCKF, lock);
371 error = EINVAL;
372 break;
373 }
	lck_mtx_unlock(&vp->v_lock);	/* done manipulating the list */
375
376 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
377 return error;
378}
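
/*
 * Illustrative userland usage (not part of this file): an advisory
 * byte-range request such as the one below is handed to the filesystem
 * through VNOP_ADVLOCK() and, for filesystems that use the common
 * per-vnode lock list, ends up in lf_advlock() above.
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 0,
 *	};
 *	(void) fcntl(fd, F_SETLKW, &fl);
 *
 * l_len == 0 locks from l_start through end-of-file, and F_SETLKW blocks
 * (F_WAIT) rather than failing with EAGAIN.
 */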
379
380/*
381 * Empty the queue of msleeping requests for a lock on the given vnode.
382 * Called with the vnode already locked. Used for forced unmount, where
383 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
384 * that prevents the vnode from ever being drained. Force unmounting wins.
385 */
386void
387lf_abort_advlocks(vnode_t vp)
388{
389 struct lockf *lock;
390
391 if ((lock = vp->v_lockf) == NULL) {
392 return;
393 }
394
	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
396
397 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
398 struct lockf *tlock;
399
400 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
401 /*
402 * Setting this flag should cause all
			 * currently blocked F_SETLK requests to
404 * return to userland with an errno.
405 */
406 tlock->lf_flags |= F_ABORT;
407 }
408 lf_wakelock(lock, TRUE);
409 }
410}
411
412/*
413 * Take any lock attempts which are currently blocked by a given lock ("from")
414 * and mark them as blocked by a different lock ("to"). Used in the case
415 * where a byte range currently occupied by "from" is to be occupied by "to."
416 */
417static void
418lf_move_blocked(struct lockf *to, struct lockf *from)
419{
420 struct lockf *tlock;
421
422 TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
423 tlock->lf_next = to;
424 }
425
426 TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
427}
428
429/*
430 * lf_coalesce_adjacent
431 *
432 * Description: Helper function: when setting a lock, coalesce adjacent
433 * locks. Needed because adjacent locks are not overlapping,
434 * but POSIX requires that they be coalesced.
435 *
436 * Parameters: lock The new lock which may be adjacent
437 * to already locked regions, and which
438 * should therefore be coalesced with them
439 *
440 * Returns: <void>
441 */
442static void
443lf_coalesce_adjacent(struct lockf *lock)
444{
445 struct lockf **lf = lock->lf_head;
446
447 while (*lf != NOLOCKF) {
448 /* reject locks that obviously could not be coalesced */
449 if ((*lf == lock) ||
450 ((*lf)->lf_id != lock->lf_id) ||
451 ((*lf)->lf_type != lock->lf_type)) {
452 lf = &(*lf)->lf_next;
453 continue;
454 }
455
456 /*
457 * NOTE: Assumes that if two locks are adjacent on the number line
458 * and belong to the same owner, then they are adjacent on the list.
459 */
460 if (LF_END(*lf) < OFF_MAX &&
461 (LF_END(*lf) + 1) == lock->lf_start) {
462 struct lockf *adjacent = *lf;
463
464 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
465 lock->lf_start = (*lf)->lf_start;
466 *lf = lock;
467 lf = &(*lf)->lf_next;
468
			lf_move_blocked(lock, adjacent);
470
471 zfree(KT_LOCKF, adjacent);
472 continue;
473 }
474 /* If the lock starts adjacent to us, we can coalesce it */
475 if (LF_END(lock) < OFF_MAX &&
476 (LF_END(lock) + 1) == (*lf)->lf_start) {
477 struct lockf *adjacent = *lf;
478
479 LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
480 lock->lf_end = (*lf)->lf_end;
481 lock->lf_next = (*lf)->lf_next;
482 lf = &lock->lf_next;
483
			lf_move_blocked(lock, adjacent);
485
486 zfree(KT_LOCKF, adjacent);
487 continue;
488 }
489
490 /* no matching conditions; go on to next lock */
491 lf = &(*lf)->lf_next;
492 }
493}
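
/*
 * For example (illustrative values): if an owner already holds a write
 * lock over [0, 99] and successfully sets another write lock over
 * [100, 199], the two entries are merged into a single [0, 199] record;
 * any requests that were blocked on the absorbed entry are re-pointed at
 * the surviving lock via lf_move_blocked().
 */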
494
495/*
496 * lf_setlock
497 *
498 * Description: Set a byte-range lock.
499 *
500 * Parameters: lock The lock structure describing the lock
501 * to be set; allocated by the caller, it
502 * will be linked into the lock list if
503 * the set is successful, and freed if the
504 * set is unsuccessful.
505 *
506 * timeout Timeout specified in the case of
507 * SETLKWTIMEOUT.
508 *
509 * Returns: 0 Success
510 * EAGAIN
511 * EDEADLK
512 * lf_split:ENOLCK
513 * lf_clearlock:ENOLCK
514 * msleep:EINTR
515 * msleep:ETIMEDOUT
516 *
517 * Notes: We add the lock to the provisional lock list. We do not
518 * coalesce at this time; this has implications for other lock
519 * requestors in the blocker search mechanism.
520 */
521static int
522lf_setlock(struct lockf *lock, struct timespec *timeout)
523{
524 struct lockf *block;
525 struct lockf **head = lock->lf_head;
526 struct lockf **prev, *overlap;
527 static const char lockstr[] = "lockf";
528 int priority, needtolink, error;
529 struct vnode *vp = lock->lf_vnode;
530 overlap_t ovcase;
531
532#ifdef LOCKF_DEBUGGING
533 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
534 lf_print("lf_setlock", lock);
535 lf_printlist("lf_setlock(in)", lock);
536 }
537#endif /* LOCKF_DEBUGGING */
538 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);
539
540 /*
541 * Set the priority
542 */
543 priority = PLOCK;
544 if (lock->lf_type == F_WRLCK) {
545 priority += 4;
546 }
547 priority |= PCATCH;
548scan:
549 /*
550 * Scan lock list for this file looking for locks that would block us.
551 */
552 while ((block = lf_getblock(lock, -1))) {
553 /*
554 * Free the structure and return if nonblocking.
555 */
556 if ((lock->lf_flags & F_WAIT) == 0) {
557 DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
558 zfree(KT_LOCKF, lock);
559 return EAGAIN;
560 }
561
562 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);
563
564 /*
565 * We are blocked. Since flock style locks cover
566 * the whole file, there is no chance for deadlock.
567 *
568 * OFD byte-range locks currently do NOT support
569 * deadlock detection.
570 *
571 * For POSIX byte-range locks we must check for deadlock.
572 *
573 * Deadlock detection is done by looking through the
574 * wait channels to see if there are any cycles that
575 * involve us.
576 */
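		/*
		 * Illustrative cycle (hypothetical values): P1 holds
		 * [0, 49] and now requests [50, 99], while P2 holds
		 * [50, 99] and is already asleep waiting on [0, 49].
		 * The scan below walks P2's threads, finds one sleeping
		 * on a lockf whose own blocker is owned by P1 (that is,
		 * by us), and returns EDEADLK instead of sleeping.
		 */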
577 if ((lock->lf_flags & F_POSIX) &&
578 (block->lf_flags & F_POSIX)) {
			lck_mtx_lock(&lf_dead_lock);
580
581 /* The blocked process is waiting on something */
582 struct proc *wproc = block->lf_owner;
583 proc_lock(wproc);
584
585 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));
586
587 struct uthread *ut;
588 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
589 /*
590 * If the thread is (a) asleep (uu_wchan != 0)
591 * and (b) in this code (uu_wmesg == lockstr)
592 * then check to see if the lock is blocked behind
593 * someone blocked behind us.
594 *
595 * Note: (i) vp->v_lock is held, preventing other
596 * threads from mutating the blocking list for our vnode.
597 * and (ii) the proc_lock is held i.e the thread list
598 * is stable.
599 *
600 * HOWEVER some thread in wproc might be sleeping on a lockf
601 * structure for a different vnode, and be woken at any
602 * time. Thus the waitblock list could mutate while
603 * it's being inspected by this thread, and what
604 * ut->uu_wchan was just pointing at could even be freed.
605 *
606 * Nevertheless this is safe here because of lf_dead_lock; if
607 * any thread blocked with uu_wmesg == lockstr wakes (see below)
608 * it will try to acquire lf_dead_lock which is already held
609 * here. Holding that lock prevents the lockf structure being
610 * pointed at by ut->uu_wchan from going away. Thus the vnode
611 * involved can be found and locked, and the corresponding
612 * blocking chain can then be examined safely.
613 */
614 const struct lockf *waitblock = (const void *)ut->uu_wchan;
615 if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) {
616 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);
617
618 vnode_t othervp = NULL;
619 if (waitblock->lf_vnode != vp) {
620 /*
621 * This thread in wproc is waiting for a lock
622 * on a different vnode; grab the lock on it
623 * that protects lf_next while we examine it.
624 */
625 othervp = waitblock->lf_vnode;
						if (!lck_mtx_try_lock(&othervp->v_lock)) {
627 /*
628 * avoid kernel deadlock: drop all
629 * locks, pause for a bit to let the
630 * other thread do what it needs to do,
631 * then (because we drop and retake
632 * v_lock) retry the scan.
633 */
634 proc_unlock(wproc);
							lck_mtx_unlock(&lf_dead_lock);
636 static struct timespec ts = {
637 .tv_sec = 0,
638 .tv_nsec = 2 * NSEC_PER_MSEC,
639 };
640 static const char pausestr[] = "lockf:pause";
							(void) msleep(lock, &vp->v_lock, priority, pausestr, &ts);
642 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
643 goto scan;
644 }
645 }
646
647 /*
648 * Get the lock blocking the lock
649 * which would block us, and make
650 * certain it hasn't become unblocked
651 * (been granted, e.g. between the time
652 * we called lf_getblock, and the time
653 * we successfully acquired the
654 * proc_lock).
655 */
656 const struct lockf *nextblock = waitblock->lf_next;
657 if (nextblock == NULL) {
658 if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
660 }
661 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n", lock, waitblock, othervp);
662 continue;
663 }
664 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, nextblock, nextblock->lf_vnode);
665
666 /*
667 * Make sure it's an advisory range
668 * lock and not any other kind of lock;
669 * if we mix lock types, it's our own
670 * fault.
671 */
672 if ((nextblock->lf_flags & F_POSIX) == 0) {
673 if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
675 }
676 continue;
677 }
678
679 /*
680 * If the owner of the lock that's
681 * blocking a lock that's blocking us
682 * getting the requested lock, then we
683 * would deadlock, so error out.
684 */
685 struct proc *bproc = nextblock->lf_owner;
686 const boolean_t deadlocked = bproc == lock->lf_owner;
687
688 if (othervp) {
						lck_mtx_unlock(&othervp->v_lock);
690 }
691 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
692 if (deadlocked) {
693 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
694 proc_unlock(wproc);
						lck_mtx_unlock(&lf_dead_lock);
696 zfree(KT_LOCKF, lock);
697 return EDEADLK;
698 }
699 }
700 LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
701 }
702 proc_unlock(wproc);
			lck_mtx_unlock(&lf_dead_lock);
704 }
705
706 /*
707 * For flock type locks, we must first remove
708 * any shared locks that we hold before we sleep
709 * waiting for an exclusive lock.
710 */
711 if ((lock->lf_flags & F_FLOCK) &&
712 lock->lf_type == F_WRLCK) {
713 lock->lf_type = F_UNLCK;
714 if ((error = lf_clearlock(lock)) != 0) {
715 zfree(KT_LOCKF, lock);
716 return error;
717 }
718 lock->lf_type = F_WRLCK;
719 }
720 /*
721 * Add our lock to the blocked list and sleep until we're free.
722 * Remember who blocked us (for deadlock detection).
723 */
724 lock->lf_next = block;
725 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
726
727 if (!(lock->lf_flags & F_FLOCK)) {
728 block->lf_flags &= ~F_WAKE1_SAFE;
729 }
730
731#if IMPORTANCE_INHERITANCE
732 /*
733 * Importance donation is done only for cases where the
734 * owning task can be unambiguously determined.
735 *
736 * POSIX type locks are not inherited by child processes;
737 * we maintain a 1:1 mapping between a lock and its owning
738 * process.
739 *
740 * Flock type locks are inherited across fork() and there is
741 * no 1:1 mapping in the general case. However, the fileglobs
742 * used by OFD locks *may* be confined to the process that
743 * created them, and thus have an "owner", in which case
744 * we also attempt importance donation.
745 */
746 if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
747 lf_boost_blocking_proc(lock, block);
748 } else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
749 lock->lf_owner != block->lf_owner &&
750 NULL != lock->lf_owner && NULL != block->lf_owner) {
751 lf_boost_blocking_proc(lock, block);
752 }
753#endif /* IMPORTANCE_INHERITANCE */
754
755#ifdef LOCKF_DEBUGGING
756 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
757 lf_print("lf_setlock: blocking on", block);
758 lf_printlist("lf_setlock(block)", block);
759 }
760#endif /* LOCKF_DEBUGGING */
761 DTRACE_FSINFO(advlock__wait, vnode_t, vp);
762
763 if (lock->lf_flags & F_POSIX) {
			error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
765 /*
766 * Ensure that 'lock' doesn't get mutated or freed if a
767 * wakeup occurs while hunting for deadlocks (and holding
768 * lf_dead_lock - see above)
769 */
			lck_mtx_lock(&lf_dead_lock);
			lck_mtx_unlock(&lf_dead_lock);
772 } else {
773 static const char lockstr_np[] = "lockf:np";
			error = msleep(lock, &vp->v_lock, priority, lockstr_np, timeout);
775 }
776
777 if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
778 error = EBADF;
779 }
780
781 if (lock->lf_next) {
782 /*
783 * lf_wakelock() always sets wakelock->lf_next to
784 * NULL before a wakeup; so we've been woken early
785 * - perhaps by a debugger, signal or other event.
786 *
787 * Remove 'lock' from the block list (avoids double-add
788 * in the spurious case, which would create a cycle)
789 */
790 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
791#if IMPORTANCE_INHERITANCE
792 /*
793 * Adjust the boost on lf_next.
794 */
			lf_adjust_assertion(lock->lf_next);
796#endif /* IMPORTANCE_INHERITANCE */
797 lock->lf_next = NULL;
798
799 if (error == 0) {
800 /*
801 * If this was a spurious wakeup, retry
802 */
803 printf("%s: spurious wakeup, retrying lock\n",
804 __func__);
805 continue;
806 }
807 }
808
809 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
810 if ((block = lf_getblock(lock, -1)) != NULL) {
				lf_move_blocked(block, lock);
812 }
813 }
814
815 if (error) {
816 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
817 lf_wakelock(lock, TRUE);
818 }
819 zfree(KT_LOCKF, lock);
			/* Return ETIMEDOUT if the timeout occurred. */
821 if (error == EWOULDBLOCK) {
822 error = ETIMEDOUT;
823 }
824 return error;
825 }
826 }
827
828 /*
829 * No blocks!! Add the lock. Note that we will
830 * downgrade or upgrade any overlapping locks this
831 * process already owns.
832 *
833 * Skip over locks owned by other processes.
834 * Handle any locks that overlap and are owned by ourselves.
835 */
836 prev = head;
837 block = *head;
838 needtolink = 1;
839 for (;;) {
840 const off_t lkend = LF_END(lock);
841 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
842 if (ovcase) {
843 block = overlap->lf_next;
844 }
845 /*
846 * Six cases:
847 * 0) no overlap
848 * 1) overlap == lock
849 * 2) overlap contains lock
850 * 3) lock contains overlap
851 * 4) overlap starts before lock
852 * 5) overlap ends after lock
853 */
854 switch (ovcase) {
855 case OVERLAP_NONE:
856 if (needtolink) {
857 *prev = lock;
858 lock->lf_next = overlap;
859 }
860 break;
861
862 case OVERLAP_EQUALS_LOCK:
863 /*
864 * If downgrading lock, others may be
865 * able to acquire it.
866 */
867 if (lock->lf_type == F_RDLCK &&
868 overlap->lf_type == F_WRLCK) {
869 lf_wakelock(overlap, TRUE);
870 }
871 overlap->lf_type = lock->lf_type;
			lf_move_blocked(overlap, lock);
873 zfree(KT_LOCKF, lock);
874 lock = overlap; /* for lf_coalesce_adjacent() */
875 break;
876
877 case OVERLAP_CONTAINS_LOCK:
878 /*
879 * Check for common starting point and different types.
880 */
881 if (overlap->lf_type == lock->lf_type) {
				lf_move_blocked(overlap, lock);
883 zfree(KT_LOCKF, lock);
884 lock = overlap; /* for lf_coalesce_adjacent() */
885 break;
886 }
887 if (overlap->lf_start == lock->lf_start) {
888 *prev = lock;
889 lock->lf_next = overlap;
890 assert(lkend < OFF_MAX);
891 overlap->lf_start = lkend + 1;
892 } else {
893 /*
894 * If we can't split the lock, we can't
895 * grant it. Claim a system limit for the
896 * resource shortage.
897 */
898 if (lf_split(overlap, lock)) {
899 zfree(KT_LOCKF, lock);
900 return ENOLCK;
901 }
902 }
903 lf_wakelock(overlap, TRUE);
904 break;
905
906 case OVERLAP_CONTAINED_BY_LOCK:
907 /*
908 * If downgrading lock, others may be able to
909 * acquire it, otherwise take the list.
910 */
911 if (lock->lf_type == F_RDLCK &&
912 overlap->lf_type == F_WRLCK) {
913 lf_wakelock(overlap, TRUE);
914 } else {
				lf_move_blocked(lock, overlap);
916 }
917 /*
918 * Add the new lock if necessary and delete the overlap.
919 */
920 if (needtolink) {
921 *prev = lock;
922 lock->lf_next = overlap->lf_next;
923 prev = &lock->lf_next;
924 needtolink = 0;
925 } else {
926 *prev = overlap->lf_next;
927 }
928 zfree(KT_LOCKF, overlap);
929 continue;
930
931 case OVERLAP_STARTS_BEFORE_LOCK:
932 /*
933 * Add lock after overlap on the list.
934 */
935 lock->lf_next = overlap->lf_next;
936 overlap->lf_next = lock;
937 assert(lock->lf_start > 0);
938 overlap->lf_end = lock->lf_start - 1;
939 prev = &lock->lf_next;
940 lf_wakelock(overlap, TRUE);
941 needtolink = 0;
942 continue;
943
944 case OVERLAP_ENDS_AFTER_LOCK:
945 /*
946 * Add the new lock before overlap.
947 */
948 if (needtolink) {
949 *prev = lock;
950 lock->lf_next = overlap;
951 }
952 assert(lkend < OFF_MAX);
953 overlap->lf_start = lkend + 1;
954 lf_wakelock(overlap, TRUE);
955 break;
956 }
957 break;
958 }
959 /* Coalesce adjacent locks with identical attributes */
960 lf_coalesce_adjacent(lock);
961#ifdef LOCKF_DEBUGGING
962 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
963 lf_print("lf_setlock: got the lock", lock);
964 lf_printlist("lf_setlock(out)", lock);
965 }
966#endif /* LOCKF_DEBUGGING */
967 return 0;
968}
969
970
971/*
972 * lf_clearlock
973 *
 * Description: Remove a byte-range lock on a vnode.  Generally, find the
975 * lock (or an overlap to that lock) and remove it (or shrink
976 * it), then wakeup anyone we can.
977 *
978 * Parameters: unlock The lock to clear
979 *
980 * Returns: 0 Success
981 * lf_split:ENOLCK
982 *
983 * Notes: A caller may unlock all the locks owned by the caller by
984 * specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
986 */
987static int
988lf_clearlock(struct lockf *unlock)
989{
990 struct lockf **head = unlock->lf_head;
991 struct lockf *lf = *head;
992 struct lockf *overlap, **prev;
993 overlap_t ovcase;
994
995 if (lf == NOLOCKF) {
996 return 0;
997 }
998#ifdef LOCKF_DEBUGGING
999 if (unlock->lf_type != F_UNLCK) {
1000 panic("lf_clearlock: bad type");
1001 }
1002 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1003 lf_print("lf_clearlock", unlock);
1004 }
1005#endif /* LOCKF_DEBUGGING */
1006 prev = head;
1007 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
1008 const off_t unlkend = LF_END(unlock);
1009 /*
1010 * Wakeup the list of locks to be retried.
1011 */
1012 lf_wakelock(overlap, FALSE);
1013#if IMPORTANCE_INHERITANCE
1014 if (overlap->lf_boosted == LF_BOOSTED) {
1015 lf_drop_assertion(overlap);
1016 }
1017#endif /* IMPORTANCE_INHERITANCE */
1018
1019 switch (ovcase) {
1020 case OVERLAP_NONE: /* satisfy compiler enum/switch */
1021 break;
1022
1023 case OVERLAP_EQUALS_LOCK:
1024 *prev = overlap->lf_next;
1025 zfree(KT_LOCKF, overlap);
1026 break;
1027
1028 case OVERLAP_CONTAINS_LOCK: /* split it */
1029 if (overlap->lf_start == unlock->lf_start) {
1030 assert(unlkend < OFF_MAX);
1031 overlap->lf_start = unlkend + 1;
1032 break;
1033 }
1034 /*
1035 * If we can't split the lock, we can't grant it.
1036 * Claim a system limit for the resource shortage.
1037 */
1038 if (lf_split(overlap, unlock)) {
1039 return ENOLCK;
1040 }
1041 overlap->lf_next = unlock->lf_next;
1042 break;
1043
1044 case OVERLAP_CONTAINED_BY_LOCK:
1045 *prev = overlap->lf_next;
1046 lf = overlap->lf_next;
1047 zfree(KT_LOCKF, overlap);
1048 continue;
1049
1050 case OVERLAP_STARTS_BEFORE_LOCK:
1051 assert(unlock->lf_start > 0);
1052 overlap->lf_end = unlock->lf_start - 1;
1053 prev = &overlap->lf_next;
1054 lf = overlap->lf_next;
1055 continue;
1056
1057 case OVERLAP_ENDS_AFTER_LOCK:
1058 assert(unlkend < OFF_MAX);
1059 overlap->lf_start = unlkend + 1;
1060 break;
1061 }
1062 break;
1063 }
1064#ifdef LOCKF_DEBUGGING
1065 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1066 lf_printlist("lf_clearlock", unlock);
1067 }
1068#endif /* LOCKF_DEBUGGING */
1069 return 0;
1070}
1071
1072
1073/*
1074 * lf_transferlock
1075 *
 * Description: Transfer a given lock from the old proc to the new proc during exec.
1077 *
 * Parameters:	transfer		The lock to transfer
1079 *
1080 * Returns: 0 Success
1081 *
1082 * Notes: A caller may transfer all the locks owned by the caller by
1083 * specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
1085 */
1086static int
1087lf_transferlock(struct lockf *transfer)
1088{
1089 struct lockf **head = transfer->lf_head;
1090 struct lockf *lf = *head;
1091 struct lockf *overlap, **prev;
1092 overlap_t ovcase;
1093
1094 if (lf == NOLOCKF) {
1095 return 0;
1096 }
1097#ifdef LOCKF_DEBUGGING
1098 if (transfer->lf_type != F_TRANSFER) {
1099 panic("lf_transferlock: bad type");
1100 }
1101 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1102 lf_print("lf_transferlock", transfer);
1103 }
1104#endif /* LOCKF_DEBUGGING */
1105 prev = head;
1106 while ((ovcase = lf_findoverlap(lf, transfer, SELF, &prev, &overlap)) != OVERLAP_NONE) {
1107 /* For POSIX Locks, change lf_id and lf_owner */
1108 if (overlap->lf_flags & F_POSIX) {
1109 overlap->lf_id = (caddr_t)transfer->lf_owner;
1110 overlap->lf_owner = transfer->lf_owner;
1111 } else if (overlap->lf_flags & F_OFD_LOCK) {
1112 /* Change the owner of the ofd style lock, if there is an owner */
1113 if (overlap->lf_owner) {
1114 overlap->lf_owner = transfer->lf_owner;
1115 }
1116 }
1117 /* Find the next lock */
1118 lf = overlap->lf_next;
1119 }
1120#ifdef LOCKF_DEBUGGING
1121 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1122 lf_printlist("lf_transferlock", transfer);
1123 }
1124#endif /* LOCKF_DEBUGGING */
1125 return 0;
1126}
1127
1128
1129/*
1130 * lf_getlock
1131 *
1132 * Description: Check whether there is a blocking lock, and if so return
1133 * its process identifier into the lock being requested.
1134 *
1135 * Parameters: lock Pointer to lock to test for blocks
1136 * fl Pointer to flock structure to receive
1137 * the blocking lock information, if a
1138 * blocking lock is found.
1139 * matchpid -1, or pid value to match in lookup.
1140 *
1141 * Returns: 0 Success
1142 *
1143 * Implicit Returns:
1144 * *fl Contents modified to reflect the
1145 * blocking lock, if one is found; not
1146 * modified otherwise
1147 *
1148 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
1149 * the blocking process ID for advisory record locks.
1150 */
1151static int
1152lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
1153{
1154 struct lockf *block;
1155
1156#ifdef LOCKF_DEBUGGING
1157 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1158 lf_print("lf_getlock", lock);
1159 }
1160#endif /* LOCKF_DEBUGGING */
1161
1162 if ((block = lf_getblock(lock, matchpid))) {
1163 fl->l_type = block->lf_type;
1164 fl->l_whence = SEEK_SET;
1165 fl->l_start = block->lf_start;
1166 if (block->lf_end == -1 ||
1167 (block->lf_start == 0 && LF_END(block) == OFF_MAX)) {
1168 fl->l_len = 0;
1169 } else {
1170 fl->l_len = LF_END(block) - block->lf_start + 1;
1171 }
1172 if (NULL != block->lf_owner) {
1173 /*
1174 * lf_owner is only non-NULL when the lock
1175 * "owner" can be unambiguously determined
1176 */
1177 fl->l_pid = proc_pid(block->lf_owner);
1178 } else {
1179 fl->l_pid = -1;
1180 }
1181 } else {
1182 fl->l_type = F_UNLCK;
1183 }
1184 return 0;
1185}
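
/*
 * For example (illustrative values): if the probe is blocked by another
 * owner's write lock over [100, 149], the caller's flock is rewritten to
 * l_type = F_WRLCK, l_whence = SEEK_SET, l_start = 100 and l_len = 50;
 * a blocking lock that runs to end-of-file (or covers the whole file) is
 * reported with l_len = 0.
 */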
1186
1187/*
1188 * lf_getblock
1189 *
1190 * Description: Walk the list of locks for an inode and return the first
1191 * blocking lock. A lock is considered blocking if we are not
1192 * the lock owner; otherwise, we are permitted to upgrade or
1193 * downgrade it, and it's not considered blocking.
1194 *
1195 * Parameters: lock The lock for which we are interested
1196 * in obtaining the blocking lock, if any
1197 * matchpid -1, or pid value to match in lookup.
1198 *
1199 * Returns: NOLOCKF No blocking lock exists
1200 * !NOLOCKF The address of the blocking lock's
1201 * struct lockf.
1202 */
1203static struct lockf *
1204lf_getblock(struct lockf *lock, pid_t matchpid)
1205{
1206 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1207
1208 for (prev = lock->lf_head;
1209 lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1210 lf = overlap->lf_next) {
1211 /*
1212 * Found an overlap.
1213 *
1214 * If we're matching pids, and it's a record lock,
1215 * or it's an OFD lock on a process-confined fd,
1216 * but the pid doesn't match, then keep on looking ..
1217 */
1218 if (matchpid != -1 &&
1219 (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
1220 proc_pid(overlap->lf_owner) != matchpid) {
1221 continue;
1222 }
1223
1224 /*
1225 * does it block us?
1226 */
1227 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
1228 return overlap;
1229 }
1230 }
1231 return NOLOCKF;
1232}
1233
1234
1235/*
1236 * lf_findoverlap
1237 *
1238 * Description: Walk the list of locks to find an overlapping lock (if any).
1239 *
1240 * Parameters: lf First lock on lock list
1241 * lock The lock we are checking for an overlap
1242 * check Check type
 *		prev			pointer to a pointer in which to return
 *					the address of the next pointer in the
 *					lock preceding the overlapping lock, if
 *					an overlap is found
 *		overlap			pointer to a pointer in which to return
 *					the address of the overlapping lock
1248 *
1249 * Returns: OVERLAP_NONE
1250 * OVERLAP_EQUALS_LOCK
1251 * OVERLAP_CONTAINS_LOCK
1252 * OVERLAP_CONTAINED_BY_LOCK
1253 * OVERLAP_STARTS_BEFORE_LOCK
1254 * OVERLAP_ENDS_AFTER_LOCK
1255 *
1256 * Implicit Returns:
1257 * *prev The address of the next pointer in the
1258 * lock previous to the overlapping lock;
1259 * this is generally used to relink the
1260 * lock list, avoiding a second iteration.
1261 * *overlap The pointer to the overlapping lock
1262 * itself; this is used to return data in
1263 * the check == OTHERS case, and for the
1264 * caller to modify the overlapping lock,
1265 * in the check == SELF case
1266 *
1267 * Note: This returns only the FIRST overlapping lock. There may be
1268 * more than one. lf_getlock will return the first blocking lock,
 *		while lf_setlock will iterate over all overlapping locks to
 *		apply the requested change to each of them.
 *
1271 * The check parameter can be SELF, meaning we are looking for
1272 * overlapping locks owned by us, or it can be OTHERS, meaning
1273 * we are looking for overlapping locks owned by someone else so
1274 * we can report a blocking lock on an F_GETLK request.
1275 *
1276 * The value of *overlap and *prev are modified, even if there is
1277 * no overlapping lock found; always check the return code.
1278 */
1279static overlap_t
1280lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1281 struct lockf ***prev, struct lockf **overlap)
1282{
1283 int found_self = 0;
1284
1285 *overlap = lf;
1286 if (lf == NOLOCKF) {
1287 return 0;
1288 }
1289#ifdef LOCKF_DEBUGGING
1290 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1291 lf_print("lf_findoverlap: looking for overlap in", lock);
1292 }
1293#endif /* LOCKF_DEBUGGING */
1294 const off_t start = lock->lf_start;
1295 const off_t end = LF_END(lock);
1296 while (lf != NOLOCKF) {
1297 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1298 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1299 /*
1300 * Locks belonging to one process are adjacent on the
1301 * list, so if we've found any locks belonging to us,
1302 * and we're now seeing something else, then we've
1303 * examined all "self" locks. Note that bailing out
1304 * here is quite important; for coalescing, we assume
1305 * numerically adjacent locks from the same owner to
1306 * be adjacent on the list.
1307 */
1308 if ((type & SELF) && found_self) {
1309 return OVERLAP_NONE;
1310 }
1311
1312 *prev = &lf->lf_next;
1313 *overlap = lf = lf->lf_next;
1314 continue;
1315 }
1316
1317 if ((type & SELF)) {
1318 found_self = 1;
1319 }
1320
1321#ifdef LOCKF_DEBUGGING
1322 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1323 lf_print("\tchecking", lf);
1324 }
1325#endif /* LOCKF_DEBUGGING */
1326 /*
1327 * OK, check for overlap
1328 */
1329 const off_t lfstart = lf->lf_start;
1330 const off_t lfend = LF_END(lf);
1331
1332 if ((start > lfend) || (lfstart > end)) {
1333 /* Case 0 */
1334 LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1335
1336 /*
1337 * NOTE: assumes that locks for the same process are
1338 * nonintersecting and ordered.
1339 */
1340 if ((type & SELF) && lfstart > end) {
1341 return OVERLAP_NONE;
1342 }
1343 *prev = &lf->lf_next;
1344 *overlap = lf = lf->lf_next;
1345 continue;
1346 }
1347 if ((lfstart == start) && (lfend == end)) {
1348 LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1349 return OVERLAP_EQUALS_LOCK;
1350 }
1351 if ((lfstart <= start) && (lfend >= end)) {
1352 LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1353 return OVERLAP_CONTAINS_LOCK;
1354 }
1355 if ((start <= lfstart) && (end >= lfend)) {
1356 LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1357 return OVERLAP_CONTAINED_BY_LOCK;
1358 }
1359 if ((lfstart < start) && (lfend >= start)) {
1360 LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1361 return OVERLAP_STARTS_BEFORE_LOCK;
1362 }
1363 if ((lfstart > start) && (lfend > end)) {
1364 LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1365 return OVERLAP_ENDS_AFTER_LOCK;
1366 }
1367 panic("lf_findoverlap: default");
1368 }
1369 return OVERLAP_NONE;
1370}
1371
1372
1373/*
1374 * lf_split
1375 *
1376 * Description: Split a lock and a contained region into two or three locks
1377 * as necessary.
1378 *
1379 * Parameters: lock1 Lock to split
1380 * lock2 Overlapping lock region requiring the
1381 * split (upgrade/downgrade/unlock)
1382 *
1383 * Returns: 0 Success
1384 * ENOLCK No memory for new lock
1385 *
1386 * Implicit Returns:
1387 * *lock1 Modified original lock
1388 * *lock2 Overlapping lock (inserted into list)
1389 * (new lock) Potential new lock inserted into list
1390 * if split results in 3 locks
1391 *
1392 * Notes: This operation can only fail if the split would result in three
1393 * locks, and there is insufficient memory to allocate the third
1394 * lock; in that case, neither of the locks will be modified.
1395 */
1396static int
1397lf_split(struct lockf *lock1, struct lockf *lock2)
1398{
1399 struct lockf *splitlock;
1400
1401#ifdef LOCKF_DEBUGGING
1402 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1403 lf_print("lf_split", lock1);
1404 lf_print("splitting from", lock2);
1405 }
1406#endif /* LOCKF_DEBUGGING */
1407 /*
1408 * Check to see if splitting into only two pieces.
1409 */
1410 if (lock1->lf_start == lock2->lf_start) {
1411 assert(LF_END(lock2) < OFF_MAX);
1412 lock1->lf_start = LF_END(lock2) + 1;
1413 lock2->lf_next = lock1;
1414 return 0;
1415 }
1416 if (LF_END(lock1) == LF_END(lock2)) {
1417 assert(lock2->lf_start > 0);
1418 lock1->lf_end = lock2->lf_start - 1;
1419 lock2->lf_next = lock1->lf_next;
1420 lock1->lf_next = lock2;
1421 return 0;
1422 }
1423 /*
1424 * Make a new lock consisting of the last part of
1425 * the encompassing lock
1426 */
1427 splitlock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
	bcopy(lock1, splitlock, sizeof *splitlock);
1429 assert(LF_END(lock2) < OFF_MAX);
1430 splitlock->lf_start = LF_END(lock2) + 1;
1431 TAILQ_INIT(&splitlock->lf_blkhd);
1432 assert(lock2->lf_start > 0);
1433 lock1->lf_end = lock2->lf_start - 1;
1434 /*
1435 * OK, now link it in
1436 */
1437 splitlock->lf_next = lock1->lf_next;
1438 lock2->lf_next = splitlock;
1439 lock1->lf_next = lock2;
1440
1441 return 0;
1442}
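
/*
 * For example (illustrative values): splitting an existing lock over
 * [0, 199] around a contained region [50, 99] trims the original to
 * [0, 49], allocates a new lock covering [100, 199], and links the
 * region lock between the two, giving the list order
 * [0, 49] -> [50, 99] -> [100, 199].
 */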
1443
1444
1445/*
1446 * lf_wakelock
1447 *
1448 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1449 * waiting on the lock may now be able to acquire it.
1450 *
1451 * Parameters: listhead Lock list head on which waiters may
1452 * have pending locks
1453 *
1454 * Returns: <void>
1455 *
1456 * Notes: This function iterates a list of locks and wakes all waiters,
1457 * rather than only waiters for the contended regions. Because
1458 * of this, for heavily contended files, this can result in a
1459 * "thundering herd" situation. Refactoring the code could make
1460 * this operation more efficient, if heavy contention ever results
1461 * in a real-world performance problem.
1462 */
1463static void
1464lf_wakelock(struct lockf *listhead, boolean_t force_all)
1465{
1466 struct lockf *wakelock;
1467 boolean_t wake_all = TRUE;
1468
1469 if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
1470 wake_all = FALSE;
1471 }
1472
1473 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1474 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1475 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1476
1477 wakelock->lf_next = NOLOCKF;
1478#ifdef LOCKF_DEBUGGING
1479 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1480 lf_print("lf_wakelock: awakening", wakelock);
1481 }
1482#endif /* LOCKF_DEBUGGING */
1483 if (wake_all == FALSE) {
1484 /*
1485 * If there are items on the list head block list,
1486 * move them to the wakelock list instead, and then
1487 * correct their lf_next pointers.
1488 */
1489 if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1490 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1491
1492 struct lockf *tlock;
1493
1494 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1495 if (TAILQ_NEXT(tlock, lf_block) == tlock) {
1496 /* See rdar://10887303 */
1497 panic("cycle in wakelock list");
1498 }
1499 tlock->lf_next = wakelock;
1500 }
1501 }
1502 }
		wakeup(wakelock);
1504
1505 if (wake_all == FALSE) {
1506 break;
1507 }
1508 }
1509}
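
/*
 * For example: when a lock marked F_WAKE1_SAFE (flock(2)-style, whole
 * file) is woken with force_all == FALSE, only the first waiter is woken;
 * the remaining waiters are handed off to that waiter's own block list so
 * they are woken one at a time, avoiding a thundering herd.
 */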
1510
1511
1512#ifdef LOCKF_DEBUGGING
1513#define GET_LF_OWNER_PID(lf) (proc_pid((lf)->lf_owner))
1514
1515/*
1516 * lf_print DEBUG
1517 *
1518 * Print out a lock; lock information is prefixed by the string in 'tag'
1519 *
1520 * Parameters: tag A string tag for debugging
1521 * lock The lock whose information should be
1522 * displayed
1523 *
1524 * Returns: <void>
1525 */
1526void
1527lf_print(const char *tag, struct lockf *lock)
1528{
1529 printf("%s: lock %p for ", tag, (void *)lock);
1530 if (lock->lf_flags & F_POSIX) {
1531 printf("proc %p (owner %d)",
1532 lock->lf_id, GET_LF_OWNER_PID(lock));
1533 } else if (lock->lf_flags & F_OFD_LOCK) {
1534 printf("fg %p (owner %d)",
1535 lock->lf_id, GET_LF_OWNER_PID(lock));
1536 } else {
1537 printf("id %p", (void *)lock->lf_id);
1538 }
1539 if (lock->lf_vnode != 0) {
1540 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1541 lock->lf_vnode,
1542 lock->lf_type == F_RDLCK ? "shared" :
1543 lock->lf_type == F_WRLCK ? "exclusive" :
1544 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1545 (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
1546 } else {
1547 printf(" %s, start 0x%016llx, end 0x%016llx",
1548 lock->lf_type == F_RDLCK ? "shared" :
1549 lock->lf_type == F_WRLCK ? "exclusive" :
1550 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1551 (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
1552 }
1553 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
1554 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1555 } else {
1556 printf("\n");
1557 }
1558}
1559
1560
1561/*
1562 * lf_printlist DEBUG
1563 *
1564 * Print out a lock list for the vnode associated with 'lock'; lock information
1565 * is prefixed by the string in 'tag'
1566 *
1567 * Parameters: tag A string tag for debugging
1568 * lock The lock whose vnode's lock list should
1569 * be displayed
1570 *
1571 * Returns: <void>
1572 */
1573void
1574lf_printlist(const char *tag, struct lockf *lock)
1575{
1576 struct lockf *lf, *blk;
1577
1578 if (lock->lf_vnode == 0) {
1579 return;
1580 }
1581
1582 printf("%s: Lock list for vno %p:\n",
1583 tag, lock->lf_vnode);
1584 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1585 printf("\tlock %p for ", (void *)lf);
1586 if (lf->lf_flags & F_POSIX) {
1587 printf("proc %p (owner %d)",
1588 lf->lf_id, GET_LF_OWNER_PID(lf));
1589 } else if (lf->lf_flags & F_OFD_LOCK) {
1590 printf("fg %p (owner %d)",
1591 lf->lf_id, GET_LF_OWNER_PID(lf));
1592 } else {
1593 printf("id %p", (void *)lf->lf_id);
1594 }
1595 printf(", %s, start 0x%016llx, end 0x%016llx",
1596 lf->lf_type == F_RDLCK ? "shared" :
1597 lf->lf_type == F_WRLCK ? "exclusive" :
1598 lf->lf_type == F_UNLCK ? "unlock" :
1599 "unknown", (uint64_t)lf->lf_start, (uint64_t)lf->lf_end);
1600 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1601 printf("\n\t\tlock request %p for ", (void *)blk);
1602 if (blk->lf_flags & F_POSIX) {
1603 printf("proc %p (owner %d)",
1604 blk->lf_id, GET_LF_OWNER_PID(blk));
1605 } else if (blk->lf_flags & F_OFD_LOCK) {
1606 printf("fg %p (owner %d)",
1607 blk->lf_id, GET_LF_OWNER_PID(blk));
1608 } else {
1609 printf("id %p", (void *)blk->lf_id);
1610 }
1611 printf(", %s, start 0x%016llx, end 0x%016llx",
1612 blk->lf_type == F_RDLCK ? "shared" :
1613 blk->lf_type == F_WRLCK ? "exclusive" :
1614 blk->lf_type == F_UNLCK ? "unlock" :
1615 "unknown", (uint64_t)blk->lf_start,
1616 (uint64_t)blk->lf_end);
1617 if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
1618 panic("lf_printlist: bad list");
1619 }
1620 }
1621 printf("\n");
1622 }
1623}
1624#endif /* LOCKF_DEBUGGING */
1625
1626#if IMPORTANCE_INHERITANCE
1627
1628/*
1629 * lf_hold_assertion
1630 *
1631 * Call task importance hold assertion on the owner of the lock.
1632 *
1633 * Parameters: block_task Owner of the lock blocking
1634 * current thread.
1635 *
 *		block			The lock on which the current
 *					thread is blocked.
1638 *
1639 * Returns: <void>
1640 *
 * Notes:	There is no need to hold a task reference on block_task: the
 *		current thread holds the vnode lock and block_task holds a
 *		file lock, so removing that file lock at exit requires
 *		block_task to grab the vnode lock first.
1645 */
1646static void
1647lf_hold_assertion(task_t block_task, struct lockf *block)
1648{
	if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1650 block->lf_boosted = LF_BOOSTED;
1651 LOCKF_DEBUG(LF_DBG_IMPINH,
1652 "lf: importance hold file lock assert on pid %d lock %p\n",
1653 proc_pid(block->lf_owner), block);
1654 }
1655}
1656
1657
1658/*
1659 * lf_jump_to_queue_head
1660 *
1661 * Jump the lock from the tail of the block queue to the head of
1662 * the queue.
1663 *
1664 * Parameters: block lockf struct containing the
1665 * block queue.
1666 * lock lockf struct to be jumped to the
1667 * front.
1668 *
1669 * Returns: <void>
1670 */
1671static void
1672lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
1673{
1674 /* Move the lock to the head of the block queue. */
1675 TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
1676 TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
1677}
1678
1679
1680/*
1681 * lf_drop_assertion
1682 *
1683 * Drops the task hold assertion.
1684 *
1685 * Parameters: block lockf struct holding the assertion.
1686 *
1687 * Returns: <void>
1688 */
1689static void
1690lf_drop_assertion(struct lockf *block)
1691{
1692 LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1693 proc_pid(block->lf_owner), block);
1694
1695 task_t current_task = proc_task(block->lf_owner);
	task_importance_drop_file_lock_assertion(current_task, 1);
1697 block->lf_boosted = LF_NOT_BOOSTED;
1698}
1699
1700/*
1701 * lf_adjust_assertion
1702 *
 * Adjusts the importance assertion of a file lock: walks all of the
 * blocking locks and checks whether the file lock still needs to be
 * boosted.
1706 *
1707 * Parameters: block lockf structure which needs to be adjusted.
1708 *
1709 * Returns: <void>
1710 */
1711static void
1712lf_adjust_assertion(struct lockf *block)
1713{
1714 boolean_t drop_boost = TRUE;
1715 struct lockf *next;
1716
1717 /* Return if the lock is not boosted */
1718 if (block->lf_boosted == LF_NOT_BOOSTED) {
1719 return;
1720 }
1721
1722 TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
1723 /* Check if block and next are same type of locks */
1724 if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
1725 ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
1726 (block->lf_owner != next->lf_owner) &&
1727 (NULL != block->lf_owner && NULL != next->lf_owner))) {
1728 /* Check if next would be boosting block */
			if (task_is_importance_donor(proc_task(next->lf_owner)) &&
			    task_is_importance_receiver_type(proc_task(block->lf_owner))) {
1731 /* Found a lock boosting block */
1732 drop_boost = FALSE;
1733 break;
1734 }
1735 }
1736 }
1737
1738 if (drop_boost) {
1739 lf_drop_assertion(block);
1740 }
1741}
1742
1743static void
1744lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1745{
1746 task_t ltask = proc_task(lock->lf_owner);
1747 task_t btask = proc_task(block->lf_owner);
1748
1749 /*
1750 * Check if ltask can donate importance. The
1751 * check of imp_donor bit is done without holding
1752 * any lock. The value may change after you read it,
1753 * but it is ok to boost a task while someone else is
1754 * unboosting you.
1755 *
1756 * TODO: Support live inheritance on file locks.
1757 */
	if (task_is_importance_donor(ltask)) {
1759 LOCKF_DEBUG(LF_DBG_IMPINH,
1760 "lf: %d: attempt to boost pid %d that holds lock %p\n",
1761 proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1762
1763 if (block->lf_boosted != LF_BOOSTED &&
		    task_is_importance_receiver_type(btask)) {
			lf_hold_assertion(btask, block);
1766 }
1767 lf_jump_to_queue_head(block, lock);
1768 }
1769}
1770#endif /* IMPORTANCE_INHERITANCE */
1771