| 1 | /* |
| 2 | * Copyright (c) 2019-2022 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | /* |
| 29 | * Copyright (c) 1982, 1986, 1989, 1993 |
| 30 | * The Regents of the University of California. All rights reserved. |
| 31 | * |
| 32 | * This code is derived from software contributed to Berkeley by |
| 33 | * Scooter Morris at Genentech Inc. |
| 34 | * |
| 35 | * Redistribution and use in source and binary forms, with or without |
| 36 | * modification, are permitted provided that the following conditions |
| 37 | * are met: |
| 38 | * 1. Redistributions of source code must retain the above copyright |
| 39 | * notice, this list of conditions and the following disclaimer. |
| 40 | * 2. Redistributions in binary form must reproduce the above copyright |
| 41 | * notice, this list of conditions and the following disclaimer in the |
| 42 | * documentation and/or other materials provided with the distribution. |
| 43 | * 4. Neither the name of the University nor the names of its contributors |
| 44 | * may be used to endorse or promote products derived from this software |
| 45 | * without specific prior written permission. |
| 46 | * |
| 47 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 48 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 49 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 50 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 51 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 52 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 53 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 54 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 55 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 56 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 57 | * SUCH DAMAGE. |
| 58 | * |
| 59 | * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 |
| 60 | */ |
| 61 | |
| 62 | #include <sys/cdefs.h> |
| 63 | #include <sys/param.h> |
| 64 | #include <sys/systm.h> |
| 65 | #include <sys/kernel.h> |
| 66 | #include <sys/lock.h> |
| 67 | #include <sys/mount.h> |
| 68 | #include <sys/proc.h> |
| 69 | #include <sys/signalvar.h> |
| 70 | #include <sys/unistd.h> |
| 71 | #include <sys/user.h> |
| 72 | #include <sys/vnode.h> |
| 73 | #include <sys/vnode_internal.h> |
| 74 | #include <sys/vnode_if.h> |
| 75 | #include <sys/malloc.h> |
| 76 | #include <sys/fcntl.h> |
| 77 | #include <sys/lockf.h> |
| 78 | #include <sys/sdt.h> |
| 79 | #include <kern/policy_internal.h> |
| 80 | |
| 81 | #include <sys/file_internal.h> |
| 82 | |
| 83 | #if (DEVELOPMENT || DEBUG) |
| 84 | #define LOCKF_DEBUGGING 1 |
| 85 | #endif |
| 86 | |
| 87 | #ifdef LOCKF_DEBUGGING |
| 88 | #include <sys/sysctl.h> |
| 89 | void lf_print(const char *tag, struct lockf *lock); |
| 90 | void lf_printlist(const char *tag, struct lockf *lock); |
| 91 | |
| 92 | #define LF_DBG_LOCKOP (1 << 0) /* setlk, getlk, clearlk */ |
| 93 | #define LF_DBG_LIST (1 << 1) /* split, coalesce */ |
| 94 | #define LF_DBG_IMPINH (1 << 2) /* importance inheritance */ |
| 95 | #define LF_DBG_TRACE (1 << 3) /* errors, exit */ |
| 96 | #define LF_DBG_DEADLOCK (1 << 4) /* deadlock detection */ |
| 97 | |
| 98 | static int lockf_debug = 0; /* was 2, could be 3 ;-) */ |
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");
| 100 | |
| 101 | /* |
| 102 | * If the selector is set, then output the debugging diagnostic. |
| 103 | */ |
| 104 | #define LOCKF_DEBUG(mask, ...) \ |
| 105 | do { \ |
| 106 | if ((mask) & lockf_debug) { \ |
| 107 | printf("%s>", __FUNCTION__); \ |
| 108 | printf(__VA_ARGS__); \ |
| 109 | } \ |
| 110 | } while(0) |
| 111 | |
| 112 | #define LOCKF_DEBUGP(mask) \ |
| 113 | ({ \ |
| 114 | ((mask) & lockf_debug); \ |
| 115 | }) |
| 116 | #else /* !LOCKF_DEBUGGING */ |
| 117 | #define LOCKF_DEBUG(mask, ...) /* mask */ |
| 118 | #endif /* !LOCKF_DEBUGGING */ |
| 119 | |
| 120 | KALLOC_TYPE_DEFINE(KT_LOCKF, struct lockf, KT_PRIV_ACCT); |
| 121 | |
| 122 | #define NOLOCKF (struct lockf *)0 |
| 123 | #define SELF 0x1 |
| 124 | #define OTHERS 0x2 |
| 125 | #define OFF_MAX 0x7fffffffffffffffULL /* max off_t */ |
| 126 | |
| 127 | /* return the effective end of a 'struct lockf': lf_end == -1 is OFF_MAX */ |
| 128 | #define LF_END(l) ((l)->lf_end == -1 ? OFF_MAX : (l)->lf_end) |
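
/*
 * Worked example (illustrative, not from the original sources): a lock
 * taken "to end of file" is stored with lf_end == -1, so for a struct
 * lockf l with l.lf_start == 100 and l.lf_end == -1:
 *
 *	assert(LF_END(&l) == OFF_MAX);	// effective range [100, OFF_MAX]
 *
 * whereas a bounded lock with l.lf_end == 199 gives LF_END(&l) == 199.
 */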
| 129 | |
| 130 | /* |
| 131 | * Overlapping lock states |
| 132 | * |
 * For lf_findoverlap(..., SELF, ...), the possible sequences are a single:
| 134 | * - OVERLAP_NONE, |
| 135 | * - OVERLAP_EQUALS_LOCK, or |
| 136 | * - OVERLAP_CONTAINS_LOCK |
| 137 | * |
| 138 | * or the following sequence: |
| 139 | * - optional OVERLAP_STARTS_BEFORE_LOCK |
| 140 | * - zero or more OVERLAP_CONTAINED_BY_LOCK |
| 141 | * - optional OVERLAP_ENDS_AFTER_LOCK |
| 142 | * - OVERLAP_NONE |
| 143 | * |
| 144 | * In the annotations: |
| 145 | * - the search lock is [SS, SE] and |
| 146 | * - the returned overlap lock is [OS,OE]. |
| 147 | */ |
| 148 | typedef enum { |
| 149 | OVERLAP_NONE = 0, |
| 150 | OVERLAP_EQUALS_LOCK, /* OS == SS && OE == SE */ |
| 151 | OVERLAP_CONTAINS_LOCK, /* OS <= SS && OE >= SE */ |
| 152 | OVERLAP_CONTAINED_BY_LOCK, /* OS >= SS && OE <= SE */ |
| 153 | OVERLAP_STARTS_BEFORE_LOCK, /* OS < SS && OE >= SS */ |
| 154 | OVERLAP_ENDS_AFTER_LOCK /* OS > SS && OE > SE */ |
| 155 | } overlap_t; |
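
/*
 * Example classifications (illustrative): with the search lock
 * [SS, SE] = [100, 199], candidate locks classify as:
 *
 *	[100, 199] -> OVERLAP_EQUALS_LOCK
 *	[ 50, 250] -> OVERLAP_CONTAINS_LOCK
 *	[120, 150] -> OVERLAP_CONTAINED_BY_LOCK
 *	[ 50, 120] -> OVERLAP_STARTS_BEFORE_LOCK
 *	[150, 250] -> OVERLAP_ENDS_AFTER_LOCK
 *	[200, 300] -> OVERLAP_NONE (adjacent, not overlapping)
 */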
| 156 | |
| 157 | static int lf_clearlock(struct lockf *); |
| 158 | static int lf_transferlock(struct lockf *); |
| 159 | static overlap_t lf_findoverlap(struct lockf *, |
| 160 | struct lockf *, int, struct lockf ***, struct lockf **); |
| 161 | static struct lockf *lf_getblock(struct lockf *, pid_t); |
| 162 | static int lf_getlock(struct lockf *, struct flock *, pid_t); |
| 163 | static int lf_setlock(struct lockf *, struct timespec *); |
| 164 | static int lf_split(struct lockf *, struct lockf *); |
| 165 | static void lf_wakelock(struct lockf *, boolean_t); |
| 166 | #if IMPORTANCE_INHERITANCE |
| 167 | static void lf_hold_assertion(task_t, struct lockf *); |
| 168 | static void lf_jump_to_queue_head(struct lockf *, struct lockf *); |
| 169 | static void lf_drop_assertion(struct lockf *); |
| 170 | static void lf_boost_blocking_proc(struct lockf *, struct lockf *); |
| 171 | static void lf_adjust_assertion(struct lockf *block); |
| 172 | #endif /* IMPORTANCE_INHERITANCE */ |
| 173 | |
static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock");
| 175 | static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp); |
| 176 | |
| 177 | /* |
| 178 | * lf_advlock |
| 179 | * |
| 180 | * Description: Advisory record locking support |
| 181 | * |
| 182 | * Parameters: ap Argument pointer to a vnop_advlock_args |
| 183 | * argument descriptor structure for the |
| 184 | * lock operation to be attempted. |
| 185 | * |
| 186 | * Returns: 0 Success |
| 187 | * EOVERFLOW |
| 188 | * EINVAL |
| 189 | * ENOLCK Number of locked regions exceeds limit |
| 190 | * lf_setlock:EAGAIN |
| 191 | * lf_setlock:EDEADLK |
| 192 | * lf_setlock:EINTR |
| 193 | * lf_setlock:ENOLCK |
| 194 | * lf_setlock:ETIMEDOUT |
| 195 | * lf_clearlock:ENOLCK |
| 196 | * vnode_size:??? |
| 197 | * |
| 198 | * Notes: We return ENOLCK when we run out of memory to support locks; as |
| 199 | * such, there is no specific expectation limit other than the |
| 200 | * amount of available resources. |
| 201 | */ |
| 202 | int |
| 203 | lf_advlock(struct vnop_advlock_args *ap) |
| 204 | { |
| 205 | struct vnode *vp = ap->a_vp; |
| 206 | struct flock *fl = ap->a_fl; |
| 207 | vfs_context_t context = ap->a_context; |
| 208 | struct lockf *lock; |
| 209 | off_t start, end, oadd; |
| 210 | u_quad_t size; |
| 211 | int error; |
| 212 | struct lockf **head = &vp->v_lockf; |
| 213 | |
| 214 | /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */ |
| 215 | |
| 216 | /* |
| 217 | * Avoid the common case of unlocking when inode has no locks. |
| 218 | */ |
| 219 | if (*head == (struct lockf *)0) { |
| 220 | if (ap->a_op != F_SETLK) { |
| 221 | fl->l_type = F_UNLCK; |
| 222 | LOCKF_DEBUG(LF_DBG_TRACE, |
| 223 | "lf_advlock: '%s' unlock without lock\n" , |
| 224 | vfs_context_proc(context)->p_comm); |
| 225 | return 0; |
| 226 | } |
| 227 | } |
| 228 | |
| 229 | /* |
| 230 | * Convert the flock structure into a start and end. |
| 231 | */ |
| 232 | switch (fl->l_whence) { |
| 233 | case SEEK_SET: |
| 234 | case SEEK_CUR: |
| 235 | /* |
| 236 | * Caller is responsible for adding any necessary offset |
| 237 | * when SEEK_CUR is used. |
| 238 | */ |
| 239 | start = fl->l_start; |
| 240 | break; |
| 241 | |
| 242 | case SEEK_END: |
| 243 | |
| 244 | /* |
		 * It's OK to cast the u_quad_t to an off_t here, since they
| 246 | * are the same storage size, and the value of the returned |
| 247 | * contents will never overflow into the sign bit. We need to |
| 248 | * do this because we will use size to force range checks. |
| 249 | */ |
| 250 | if ((error = vnode_size(vp, (off_t *)&size, context))) { |
| 251 | LOCKF_DEBUG(LF_DBG_TRACE, |
| 252 | "lf_advlock: vnode_getattr failed: %d\n" , error); |
| 253 | return error; |
| 254 | } |
| 255 | |
| 256 | if (size > OFF_MAX || |
| 257 | (fl->l_start > 0 && |
| 258 | size > (u_quad_t)(OFF_MAX - fl->l_start))) { |
| 259 | return EOVERFLOW; |
| 260 | } |
| 261 | start = size + fl->l_start; |
| 262 | break; |
| 263 | |
| 264 | default: |
| 265 | LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n" , |
| 266 | fl->l_whence); |
| 267 | return EINVAL; |
| 268 | } |
| 269 | if (start < 0) { |
| 270 | LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n" , |
| 271 | start); |
| 272 | return EINVAL; |
| 273 | } |
| 274 | if (fl->l_len < 0) { |
| 275 | if (start == 0) { |
| 276 | LOCKF_DEBUG(LF_DBG_TRACE, |
| 277 | "lf_advlock: len < 0 & start == 0\n" ); |
| 278 | return EINVAL; |
| 279 | } |
| 280 | end = start - 1; |
| 281 | start += fl->l_len; |
| 282 | if (start < 0) { |
| 283 | LOCKF_DEBUG(LF_DBG_TRACE, |
| 284 | "lf_advlock: start < 0 (%qd)\n" , start); |
| 285 | return EINVAL; |
| 286 | } |
| 287 | } else if (fl->l_len == 0) { |
| 288 | end = -1; |
| 289 | } else { |
| 290 | oadd = fl->l_len - 1; |
| 291 | if (oadd > (off_t)(OFF_MAX - start)) { |
| 292 | LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n" ); |
| 293 | return EOVERFLOW; |
| 294 | } |
| 295 | end = start + oadd; |
| 296 | } |
| 297 | /* |
| 298 | * Create the lockf structure |
| 299 | */ |
| 300 | lock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL); |
| 301 | lock->lf_start = start; |
| 302 | lock->lf_end = end; |
| 303 | lock->lf_id = ap->a_id; |
| 304 | lock->lf_vnode = vp; |
| 305 | lock->lf_type = fl->l_type; |
| 306 | lock->lf_head = head; |
| 307 | lock->lf_next = (struct lockf *)0; |
| 308 | TAILQ_INIT(&lock->lf_blkhd); |
| 309 | lock->lf_flags = (short)ap->a_flags; |
| 310 | #if IMPORTANCE_INHERITANCE |
| 311 | lock->lf_boosted = LF_NOT_BOOSTED; |
| 312 | #endif |
| 313 | if (ap->a_flags & F_POSIX) { |
| 314 | lock->lf_owner = (struct proc *)lock->lf_id; |
| 315 | } else { |
| 316 | lock->lf_owner = NULL; |
| 317 | } |
| 318 | |
| 319 | if (ap->a_flags & F_FLOCK) { |
| 320 | lock->lf_flags |= F_WAKE1_SAFE; |
| 321 | } |
| 322 | |
	lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
| 324 | /* |
| 325 | * Do the requested operation. |
| 326 | */ |
| 327 | switch (ap->a_op) { |
| 328 | case F_SETLK: |
| 329 | /* |
| 330 | * For OFD locks, lf_id is derived from the fileglob. |
| 331 | * Record an "lf_owner" iff this is a confined fd |
| 332 | * i.e. it cannot escape this process and will be |
| 333 | * F_UNLCKed before the owner exits. (This is |
| 334 | * the implicit guarantee needed to ensure lf_owner |
| 335 | * remains a valid reference.) |
| 336 | */ |
| 337 | if ((ap->a_flags & F_OFD_LOCK) && (ap->a_flags & F_CONFINED)) { |
| 338 | lock->lf_owner = current_proc(); |
| 339 | } |
| 340 | error = lf_setlock(lock, ap->a_timeout); |
| 341 | break; |
| 342 | |
| 343 | case F_UNLCK: |
| 344 | error = lf_clearlock(lock); |
| 345 | zfree(KT_LOCKF, lock); |
| 346 | break; |
| 347 | |
| 348 | case F_TRANSFER: |
| 349 | /* |
| 350 | * The new owner is passed in the context, set the new owner |
| 351 | * in the lf_owner field. |
| 352 | */ |
		lock->lf_owner = vfs_context_proc(context);
| 354 | assert(lock->lf_owner != current_proc()); |
| 355 | error = lf_transferlock(lock); |
| 356 | zfree(KT_LOCKF, lock); |
| 357 | break; |
| 358 | |
| 359 | case F_GETLK: |
| 360 | error = lf_getlock(lock, fl, -1); |
| 361 | zfree(KT_LOCKF, lock); |
| 362 | break; |
| 363 | |
| 364 | case F_GETLKPID: |
| 365 | error = lf_getlock(lock, fl, fl->l_pid); |
| 366 | zfree(KT_LOCKF, lock); |
| 367 | break; |
| 368 | |
| 369 | default: |
| 370 | zfree(KT_LOCKF, lock); |
| 371 | error = EINVAL; |
| 372 | break; |
| 373 | } |
	lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */
| 375 | |
| 376 | LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n" , error); |
| 377 | return error; |
| 378 | } |
| 379 | |
| 380 | /* |
| 381 | * Empty the queue of msleeping requests for a lock on the given vnode. |
| 382 | * Called with the vnode already locked. Used for forced unmount, where |
| 383 | * a flock(2) invoker sleeping on a blocked lock holds an iocount reference |
| 384 | * that prevents the vnode from ever being drained. Force unmounting wins. |
| 385 | */ |
| 386 | void |
| 387 | lf_abort_advlocks(vnode_t vp) |
| 388 | { |
| 389 | struct lockf *lock; |
| 390 | |
| 391 | if ((lock = vp->v_lockf) == NULL) { |
| 392 | return; |
| 393 | } |
| 394 | |
	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
| 396 | |
| 397 | if (!TAILQ_EMPTY(&lock->lf_blkhd)) { |
| 398 | struct lockf *tlock; |
| 399 | |
| 400 | TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) { |
| 401 | /* |
| 402 | * Setting this flag should cause all |
			 * currently blocked F_SETLK requests to
| 404 | * return to userland with an errno. |
| 405 | */ |
| 406 | tlock->lf_flags |= F_ABORT; |
| 407 | } |
| 408 | lf_wakelock(lock, TRUE); |
| 409 | } |
| 410 | } |
| 411 | |
| 412 | /* |
| 413 | * Take any lock attempts which are currently blocked by a given lock ("from") |
| 414 | * and mark them as blocked by a different lock ("to"). Used in the case |
| 415 | * where a byte range currently occupied by "from" is to be occupied by "to." |
| 416 | */ |
| 417 | static void |
| 418 | lf_move_blocked(struct lockf *to, struct lockf *from) |
| 419 | { |
| 420 | struct lockf *tlock; |
| 421 | |
| 422 | TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) { |
| 423 | tlock->lf_next = to; |
| 424 | } |
| 425 | |
| 426 | TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block); |
| 427 | } |
| 428 | |
| 429 | /* |
| 430 | * lf_coalesce_adjacent |
| 431 | * |
| 432 | * Description: Helper function: when setting a lock, coalesce adjacent |
| 433 | * locks. Needed because adjacent locks are not overlapping, |
| 434 | * but POSIX requires that they be coalesced. |
| 435 | * |
| 436 | * Parameters: lock The new lock which may be adjacent |
| 437 | * to already locked regions, and which |
| 438 | * should therefore be coalesced with them |
| 439 | * |
| 440 | * Returns: <void> |
| 441 | */ |
| 442 | static void |
| 443 | lf_coalesce_adjacent(struct lockf *lock) |
| 444 | { |
| 445 | struct lockf **lf = lock->lf_head; |
| 446 | |
| 447 | while (*lf != NOLOCKF) { |
| 448 | /* reject locks that obviously could not be coalesced */ |
| 449 | if ((*lf == lock) || |
| 450 | ((*lf)->lf_id != lock->lf_id) || |
| 451 | ((*lf)->lf_type != lock->lf_type)) { |
| 452 | lf = &(*lf)->lf_next; |
| 453 | continue; |
| 454 | } |
| 455 | |
| 456 | /* |
| 457 | * NOTE: Assumes that if two locks are adjacent on the number line |
| 458 | * and belong to the same owner, then they are adjacent on the list. |
| 459 | */ |
| 460 | if (LF_END(*lf) < OFF_MAX && |
| 461 | (LF_END(*lf) + 1) == lock->lf_start) { |
| 462 | struct lockf *adjacent = *lf; |
| 463 | |
| 464 | LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n" ); |
| 465 | lock->lf_start = (*lf)->lf_start; |
| 466 | *lf = lock; |
| 467 | lf = &(*lf)->lf_next; |
| 468 | |
			lf_move_blocked(lock, adjacent);
| 470 | |
| 471 | zfree(KT_LOCKF, adjacent); |
| 472 | continue; |
| 473 | } |
| 474 | /* If the lock starts adjacent to us, we can coalesce it */ |
| 475 | if (LF_END(lock) < OFF_MAX && |
| 476 | (LF_END(lock) + 1) == (*lf)->lf_start) { |
| 477 | struct lockf *adjacent = *lf; |
| 478 | |
| 479 | LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n" ); |
| 480 | lock->lf_end = (*lf)->lf_end; |
| 481 | lock->lf_next = (*lf)->lf_next; |
| 482 | lf = &lock->lf_next; |
| 483 | |
			lf_move_blocked(lock, adjacent);
| 485 | |
| 486 | zfree(KT_LOCKF, adjacent); |
| 487 | continue; |
| 488 | } |
| 489 | |
| 490 | /* no matching conditions; go on to next lock */ |
| 491 | lf = &(*lf)->lf_next; |
| 492 | } |
| 493 | } |
| 494 | |
| 495 | /* |
| 496 | * lf_setlock |
| 497 | * |
| 498 | * Description: Set a byte-range lock. |
| 499 | * |
| 500 | * Parameters: lock The lock structure describing the lock |
| 501 | * to be set; allocated by the caller, it |
| 502 | * will be linked into the lock list if |
| 503 | * the set is successful, and freed if the |
| 504 | * set is unsuccessful. |
| 505 | * |
| 506 | * timeout Timeout specified in the case of |
| 507 | * SETLKWTIMEOUT. |
| 508 | * |
| 509 | * Returns: 0 Success |
| 510 | * EAGAIN |
| 511 | * EDEADLK |
| 512 | * lf_split:ENOLCK |
| 513 | * lf_clearlock:ENOLCK |
| 514 | * msleep:EINTR |
| 515 | * msleep:ETIMEDOUT |
| 516 | * |
| 517 | * Notes: We add the lock to the provisional lock list. We do not |
| 518 | * coalesce at this time; this has implications for other lock |
| 519 | * requestors in the blocker search mechanism. |
| 520 | */ |
| 521 | static int |
| 522 | lf_setlock(struct lockf *lock, struct timespec *timeout) |
| 523 | { |
| 524 | struct lockf *block; |
| 525 | struct lockf **head = lock->lf_head; |
| 526 | struct lockf **prev, *overlap; |
| 527 | static const char lockstr[] = "lockf" ; |
| 528 | int priority, needtolink, error; |
| 529 | struct vnode *vp = lock->lf_vnode; |
| 530 | overlap_t ovcase; |
| 531 | |
| 532 | #ifdef LOCKF_DEBUGGING |
| 533 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 534 | lf_print("lf_setlock" , lock); |
| 535 | lf_printlist("lf_setlock(in)" , lock); |
| 536 | } |
| 537 | #endif /* LOCKF_DEBUGGING */ |
| 538 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n" , lock, lock->lf_vnode); |
| 539 | |
| 540 | /* |
| 541 | * Set the priority |
| 542 | */ |
| 543 | priority = PLOCK; |
| 544 | if (lock->lf_type == F_WRLCK) { |
| 545 | priority += 4; |
| 546 | } |
| 547 | priority |= PCATCH; |
| 548 | scan: |
| 549 | /* |
| 550 | * Scan lock list for this file looking for locks that would block us. |
| 551 | */ |
| 552 | while ((block = lf_getblock(lock, -1))) { |
| 553 | /* |
| 554 | * Free the structure and return if nonblocking. |
| 555 | */ |
| 556 | if ((lock->lf_flags & F_WAIT) == 0) { |
| 557 | DTRACE_FSINFO(advlock__nowait, vnode_t, vp); |
| 558 | zfree(KT_LOCKF, lock); |
| 559 | return EAGAIN; |
| 560 | } |
| 561 | |
| 562 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n" , lock, block); |
| 563 | |
| 564 | /* |
| 565 | * We are blocked. Since flock style locks cover |
| 566 | * the whole file, there is no chance for deadlock. |
| 567 | * |
| 568 | * OFD byte-range locks currently do NOT support |
| 569 | * deadlock detection. |
| 570 | * |
| 571 | * For POSIX byte-range locks we must check for deadlock. |
| 572 | * |
| 573 | * Deadlock detection is done by looking through the |
| 574 | * wait channels to see if there are any cycles that |
| 575 | * involve us. |
| 576 | */ |
| 577 | if ((lock->lf_flags & F_POSIX) && |
| 578 | (block->lf_flags & F_POSIX)) { |
| 579 | lck_mtx_lock(lck: &lf_dead_lock); |
| 580 | |
| 581 | /* The blocked process is waiting on something */ |
| 582 | struct proc *wproc = block->lf_owner; |
| 583 | proc_lock(wproc); |
| 584 | |
| 585 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n" , lock, proc_pid(wproc)); |
| 586 | |
| 587 | struct uthread *ut; |
| 588 | TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) { |
| 589 | /* |
| 590 | * If the thread is (a) asleep (uu_wchan != 0) |
| 591 | * and (b) in this code (uu_wmesg == lockstr) |
| 592 | * then check to see if the lock is blocked behind |
| 593 | * someone blocked behind us. |
| 594 | * |
| 595 | * Note: (i) vp->v_lock is held, preventing other |
| 596 | * threads from mutating the blocking list for our vnode. |
| 597 | * and (ii) the proc_lock is held i.e the thread list |
| 598 | * is stable. |
| 599 | * |
| 600 | * HOWEVER some thread in wproc might be sleeping on a lockf |
| 601 | * structure for a different vnode, and be woken at any |
| 602 | * time. Thus the waitblock list could mutate while |
| 603 | * it's being inspected by this thread, and what |
| 604 | * ut->uu_wchan was just pointing at could even be freed. |
| 605 | * |
| 606 | * Nevertheless this is safe here because of lf_dead_lock; if |
| 607 | * any thread blocked with uu_wmesg == lockstr wakes (see below) |
| 608 | * it will try to acquire lf_dead_lock which is already held |
| 609 | * here. Holding that lock prevents the lockf structure being |
| 610 | * pointed at by ut->uu_wchan from going away. Thus the vnode |
| 611 | * involved can be found and locked, and the corresponding |
| 612 | * blocking chain can then be examined safely. |
| 613 | */ |
| 614 | const struct lockf *waitblock = (const void *)ut->uu_wchan; |
| 615 | if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) { |
| 616 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n" , lock, waitblock, waitblock->lf_vnode); |
| 617 | |
| 618 | vnode_t othervp = NULL; |
| 619 | if (waitblock->lf_vnode != vp) { |
| 620 | /* |
| 621 | * This thread in wproc is waiting for a lock |
| 622 | * on a different vnode; grab the lock on it |
| 623 | * that protects lf_next while we examine it. |
| 624 | */ |
| 625 | othervp = waitblock->lf_vnode; |
						if (!lck_mtx_try_lock(&othervp->v_lock)) {
| 627 | /* |
| 628 | * avoid kernel deadlock: drop all |
| 629 | * locks, pause for a bit to let the |
| 630 | * other thread do what it needs to do, |
| 631 | * then (because we drop and retake |
| 632 | * v_lock) retry the scan. |
| 633 | */ |
| 634 | proc_unlock(wproc); |
| 635 | lck_mtx_unlock(lck: &lf_dead_lock); |
| 636 | static struct timespec ts = { |
| 637 | .tv_sec = 0, |
| 638 | .tv_nsec = 2 * NSEC_PER_MSEC, |
| 639 | }; |
| 640 | static const char pausestr[] = "lockf:pause" ; |
| 641 | (void) msleep(chan: lock, mtx: &vp->v_lock, pri: priority, wmesg: pausestr, ts: &ts); |
| 642 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n" , lock, othervp); |
| 643 | goto scan; |
| 644 | } |
| 645 | } |
| 646 | |
| 647 | /* |
| 648 | * Get the lock blocking the lock |
| 649 | * which would block us, and make |
| 650 | * certain it hasn't become unblocked |
| 651 | * (been granted, e.g. between the time |
| 652 | * we called lf_getblock, and the time |
| 653 | * we successfully acquired the |
| 654 | * proc_lock). |
| 655 | */ |
| 656 | const struct lockf *nextblock = waitblock->lf_next; |
| 657 | if (nextblock == NULL) { |
| 658 | if (othervp) { |
| 659 | lck_mtx_unlock(lck: &othervp->v_lock); |
| 660 | } |
| 661 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n" , lock, waitblock, othervp); |
| 662 | continue; |
| 663 | } |
| 664 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n" , lock, nextblock, nextblock->lf_vnode); |
| 665 | |
| 666 | /* |
| 667 | * Make sure it's an advisory range |
| 668 | * lock and not any other kind of lock; |
| 669 | * if we mix lock types, it's our own |
| 670 | * fault. |
| 671 | */ |
| 672 | if ((nextblock->lf_flags & F_POSIX) == 0) { |
| 673 | if (othervp) { |
| 674 | lck_mtx_unlock(lck: &othervp->v_lock); |
| 675 | } |
| 676 | continue; |
| 677 | } |
| 678 | |
| 679 | /* |
| 680 | * If the owner of the lock that's |
| 681 | * blocking a lock that's blocking us |
| 682 | * getting the requested lock, then we |
| 683 | * would deadlock, so error out. |
| 684 | */ |
| 685 | struct proc *bproc = nextblock->lf_owner; |
| 686 | const boolean_t deadlocked = bproc == lock->lf_owner; |
| 687 | |
| 688 | if (othervp) { |
| 689 | lck_mtx_unlock(lck: &othervp->v_lock); |
| 690 | } |
| 691 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n" , lock, proc_pid(bproc)); |
| 692 | if (deadlocked) { |
| 693 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n" , lock); |
| 694 | proc_unlock(wproc); |
| 695 | lck_mtx_unlock(lck: &lf_dead_lock); |
| 696 | zfree(KT_LOCKF, lock); |
| 697 | return EDEADLK; |
| 698 | } |
| 699 | } |
| 700 | LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n" , lock); |
| 701 | } |
| 702 | proc_unlock(wproc); |
| 703 | lck_mtx_unlock(lck: &lf_dead_lock); |
| 704 | } |
| 705 | |
| 706 | /* |
| 707 | * For flock type locks, we must first remove |
| 708 | * any shared locks that we hold before we sleep |
| 709 | * waiting for an exclusive lock. |
| 710 | */ |
| 711 | if ((lock->lf_flags & F_FLOCK) && |
| 712 | lock->lf_type == F_WRLCK) { |
| 713 | lock->lf_type = F_UNLCK; |
| 714 | if ((error = lf_clearlock(lock)) != 0) { |
| 715 | zfree(KT_LOCKF, lock); |
| 716 | return error; |
| 717 | } |
| 718 | lock->lf_type = F_WRLCK; |
| 719 | } |
| 720 | /* |
| 721 | * Add our lock to the blocked list and sleep until we're free. |
| 722 | * Remember who blocked us (for deadlock detection). |
| 723 | */ |
| 724 | lock->lf_next = block; |
| 725 | TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); |
| 726 | |
| 727 | if (!(lock->lf_flags & F_FLOCK)) { |
| 728 | block->lf_flags &= ~F_WAKE1_SAFE; |
| 729 | } |
| 730 | |
| 731 | #if IMPORTANCE_INHERITANCE |
| 732 | /* |
| 733 | * Importance donation is done only for cases where the |
| 734 | * owning task can be unambiguously determined. |
| 735 | * |
| 736 | * POSIX type locks are not inherited by child processes; |
| 737 | * we maintain a 1:1 mapping between a lock and its owning |
| 738 | * process. |
| 739 | * |
| 740 | * Flock type locks are inherited across fork() and there is |
| 741 | * no 1:1 mapping in the general case. However, the fileglobs |
| 742 | * used by OFD locks *may* be confined to the process that |
| 743 | * created them, and thus have an "owner", in which case |
| 744 | * we also attempt importance donation. |
| 745 | */ |
| 746 | if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) { |
| 747 | lf_boost_blocking_proc(lock, block); |
| 748 | } else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) && |
| 749 | lock->lf_owner != block->lf_owner && |
| 750 | NULL != lock->lf_owner && NULL != block->lf_owner) { |
| 751 | lf_boost_blocking_proc(lock, block); |
| 752 | } |
| 753 | #endif /* IMPORTANCE_INHERITANCE */ |
| 754 | |
| 755 | #ifdef LOCKF_DEBUGGING |
| 756 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 757 | lf_print("lf_setlock: blocking on" , block); |
| 758 | lf_printlist("lf_setlock(block)" , block); |
| 759 | } |
| 760 | #endif /* LOCKF_DEBUGGING */ |
| 761 | DTRACE_FSINFO(advlock__wait, vnode_t, vp); |
| 762 | |
| 763 | if (lock->lf_flags & F_POSIX) { |
			error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
| 765 | /* |
| 766 | * Ensure that 'lock' doesn't get mutated or freed if a |
| 767 | * wakeup occurs while hunting for deadlocks (and holding |
| 768 | * lf_dead_lock - see above) |
| 769 | */ |
| 770 | lck_mtx_lock(lck: &lf_dead_lock); |
| 771 | lck_mtx_unlock(lck: &lf_dead_lock); |
| 772 | } else { |
| 773 | static const char lockstr_np[] = "lockf:np" ; |
| 774 | error = msleep(chan: lock, mtx: &vp->v_lock, pri: priority, wmesg: lockstr_np, ts: timeout); |
| 775 | } |
| 776 | |
| 777 | if (error == 0 && (lock->lf_flags & F_ABORT) != 0) { |
| 778 | error = EBADF; |
| 779 | } |
| 780 | |
| 781 | if (lock->lf_next) { |
| 782 | /* |
| 783 | * lf_wakelock() always sets wakelock->lf_next to |
| 784 | * NULL before a wakeup; so we've been woken early |
| 785 | * - perhaps by a debugger, signal or other event. |
| 786 | * |
| 787 | * Remove 'lock' from the block list (avoids double-add |
| 788 | * in the spurious case, which would create a cycle) |
| 789 | */ |
| 790 | TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); |
| 791 | #if IMPORTANCE_INHERITANCE |
| 792 | /* |
| 793 | * Adjust the boost on lf_next. |
| 794 | */ |
			lf_adjust_assertion(lock->lf_next);
| 796 | #endif /* IMPORTANCE_INHERITANCE */ |
| 797 | lock->lf_next = NULL; |
| 798 | |
| 799 | if (error == 0) { |
| 800 | /* |
| 801 | * If this was a spurious wakeup, retry |
| 802 | */ |
| 803 | printf("%s: spurious wakeup, retrying lock\n" , |
| 804 | __func__); |
| 805 | continue; |
| 806 | } |
| 807 | } |
| 808 | |
| 809 | if (!TAILQ_EMPTY(&lock->lf_blkhd)) { |
| 810 | if ((block = lf_getblock(lock, -1)) != NULL) { |
				lf_move_blocked(block, lock);
| 812 | } |
| 813 | } |
| 814 | |
| 815 | if (error) { |
| 816 | if (!TAILQ_EMPTY(&lock->lf_blkhd)) { |
| 817 | lf_wakelock(lock, TRUE); |
| 818 | } |
| 819 | zfree(KT_LOCKF, lock); |
			/* Return ETIMEDOUT if a timeout occurred. */
| 821 | if (error == EWOULDBLOCK) { |
| 822 | error = ETIMEDOUT; |
| 823 | } |
| 824 | return error; |
| 825 | } |
| 826 | } |
| 827 | |
| 828 | /* |
| 829 | * No blocks!! Add the lock. Note that we will |
| 830 | * downgrade or upgrade any overlapping locks this |
| 831 | * process already owns. |
| 832 | * |
| 833 | * Skip over locks owned by other processes. |
| 834 | * Handle any locks that overlap and are owned by ourselves. |
| 835 | */ |
| 836 | prev = head; |
| 837 | block = *head; |
| 838 | needtolink = 1; |
| 839 | for (;;) { |
| 840 | const off_t lkend = LF_END(lock); |
| 841 | ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); |
| 842 | if (ovcase) { |
| 843 | block = overlap->lf_next; |
| 844 | } |
| 845 | /* |
| 846 | * Six cases: |
| 847 | * 0) no overlap |
| 848 | * 1) overlap == lock |
| 849 | * 2) overlap contains lock |
| 850 | * 3) lock contains overlap |
| 851 | * 4) overlap starts before lock |
| 852 | * 5) overlap ends after lock |
| 853 | */ |
| 854 | switch (ovcase) { |
| 855 | case OVERLAP_NONE: |
| 856 | if (needtolink) { |
| 857 | *prev = lock; |
| 858 | lock->lf_next = overlap; |
| 859 | } |
| 860 | break; |
| 861 | |
| 862 | case OVERLAP_EQUALS_LOCK: |
| 863 | /* |
| 864 | * If downgrading lock, others may be |
| 865 | * able to acquire it. |
| 866 | */ |
| 867 | if (lock->lf_type == F_RDLCK && |
| 868 | overlap->lf_type == F_WRLCK) { |
| 869 | lf_wakelock(overlap, TRUE); |
| 870 | } |
| 871 | overlap->lf_type = lock->lf_type; |
			lf_move_blocked(overlap, lock);
| 873 | zfree(KT_LOCKF, lock); |
| 874 | lock = overlap; /* for lf_coalesce_adjacent() */ |
| 875 | break; |
| 876 | |
| 877 | case OVERLAP_CONTAINS_LOCK: |
| 878 | /* |
| 879 | * Check for common starting point and different types. |
| 880 | */ |
| 881 | if (overlap->lf_type == lock->lf_type) { |
				lf_move_blocked(overlap, lock);
| 883 | zfree(KT_LOCKF, lock); |
| 884 | lock = overlap; /* for lf_coalesce_adjacent() */ |
| 885 | break; |
| 886 | } |
| 887 | if (overlap->lf_start == lock->lf_start) { |
| 888 | *prev = lock; |
| 889 | lock->lf_next = overlap; |
| 890 | assert(lkend < OFF_MAX); |
| 891 | overlap->lf_start = lkend + 1; |
| 892 | } else { |
| 893 | /* |
| 894 | * If we can't split the lock, we can't |
| 895 | * grant it. Claim a system limit for the |
| 896 | * resource shortage. |
| 897 | */ |
| 898 | if (lf_split(overlap, lock)) { |
| 899 | zfree(KT_LOCKF, lock); |
| 900 | return ENOLCK; |
| 901 | } |
| 902 | } |
| 903 | lf_wakelock(overlap, TRUE); |
| 904 | break; |
| 905 | |
| 906 | case OVERLAP_CONTAINED_BY_LOCK: |
| 907 | /* |
| 908 | * If downgrading lock, others may be able to |
| 909 | * acquire it, otherwise take the list. |
| 910 | */ |
| 911 | if (lock->lf_type == F_RDLCK && |
| 912 | overlap->lf_type == F_WRLCK) { |
| 913 | lf_wakelock(overlap, TRUE); |
| 914 | } else { |
				lf_move_blocked(lock, overlap);
| 916 | } |
| 917 | /* |
| 918 | * Add the new lock if necessary and delete the overlap. |
| 919 | */ |
| 920 | if (needtolink) { |
| 921 | *prev = lock; |
| 922 | lock->lf_next = overlap->lf_next; |
| 923 | prev = &lock->lf_next; |
| 924 | needtolink = 0; |
| 925 | } else { |
| 926 | *prev = overlap->lf_next; |
| 927 | } |
| 928 | zfree(KT_LOCKF, overlap); |
| 929 | continue; |
| 930 | |
| 931 | case OVERLAP_STARTS_BEFORE_LOCK: |
| 932 | /* |
| 933 | * Add lock after overlap on the list. |
| 934 | */ |
| 935 | lock->lf_next = overlap->lf_next; |
| 936 | overlap->lf_next = lock; |
| 937 | assert(lock->lf_start > 0); |
| 938 | overlap->lf_end = lock->lf_start - 1; |
| 939 | prev = &lock->lf_next; |
| 940 | lf_wakelock(overlap, TRUE); |
| 941 | needtolink = 0; |
| 942 | continue; |
| 943 | |
| 944 | case OVERLAP_ENDS_AFTER_LOCK: |
| 945 | /* |
| 946 | * Add the new lock before overlap. |
| 947 | */ |
| 948 | if (needtolink) { |
| 949 | *prev = lock; |
| 950 | lock->lf_next = overlap; |
| 951 | } |
| 952 | assert(lkend < OFF_MAX); |
| 953 | overlap->lf_start = lkend + 1; |
| 954 | lf_wakelock(overlap, TRUE); |
| 955 | break; |
| 956 | } |
| 957 | break; |
| 958 | } |
| 959 | /* Coalesce adjacent locks with identical attributes */ |
| 960 | lf_coalesce_adjacent(lock); |
| 961 | #ifdef LOCKF_DEBUGGING |
| 962 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 963 | lf_print("lf_setlock: got the lock" , lock); |
| 964 | lf_printlist("lf_setlock(out)" , lock); |
| 965 | } |
| 966 | #endif /* LOCKF_DEBUGGING */ |
| 967 | return 0; |
| 968 | } |
| 969 | |
| 970 | |
| 971 | /* |
| 972 | * lf_clearlock |
| 973 | * |
 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
| 975 | * lock (or an overlap to that lock) and remove it (or shrink |
| 976 | * it), then wakeup anyone we can. |
| 977 | * |
| 978 | * Parameters: unlock The lock to clear |
| 979 | * |
| 980 | * Returns: 0 Success |
| 981 | * lf_split:ENOLCK |
| 982 | * |
| 983 | * Notes: A caller may unlock all the locks owned by the caller by |
| 984 | * specifying the entire file range; locks owned by other |
 *		callers are not affected by this operation.
| 986 | */ |
| 987 | static int |
| 988 | lf_clearlock(struct lockf *unlock) |
| 989 | { |
| 990 | struct lockf **head = unlock->lf_head; |
| 991 | struct lockf *lf = *head; |
| 992 | struct lockf *overlap, **prev; |
| 993 | overlap_t ovcase; |
| 994 | |
| 995 | if (lf == NOLOCKF) { |
| 996 | return 0; |
| 997 | } |
| 998 | #ifdef LOCKF_DEBUGGING |
| 999 | if (unlock->lf_type != F_UNLCK) { |
| 1000 | panic("lf_clearlock: bad type" ); |
| 1001 | } |
| 1002 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1003 | lf_print("lf_clearlock" , unlock); |
| 1004 | } |
| 1005 | #endif /* LOCKF_DEBUGGING */ |
| 1006 | prev = head; |
| 1007 | while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) { |
| 1008 | const off_t unlkend = LF_END(unlock); |
| 1009 | /* |
| 1010 | * Wakeup the list of locks to be retried. |
| 1011 | */ |
| 1012 | lf_wakelock(overlap, FALSE); |
| 1013 | #if IMPORTANCE_INHERITANCE |
| 1014 | if (overlap->lf_boosted == LF_BOOSTED) { |
| 1015 | lf_drop_assertion(overlap); |
| 1016 | } |
| 1017 | #endif /* IMPORTANCE_INHERITANCE */ |
| 1018 | |
| 1019 | switch (ovcase) { |
| 1020 | case OVERLAP_NONE: /* satisfy compiler enum/switch */ |
| 1021 | break; |
| 1022 | |
| 1023 | case OVERLAP_EQUALS_LOCK: |
| 1024 | *prev = overlap->lf_next; |
| 1025 | zfree(KT_LOCKF, overlap); |
| 1026 | break; |
| 1027 | |
| 1028 | case OVERLAP_CONTAINS_LOCK: /* split it */ |
| 1029 | if (overlap->lf_start == unlock->lf_start) { |
| 1030 | assert(unlkend < OFF_MAX); |
| 1031 | overlap->lf_start = unlkend + 1; |
| 1032 | break; |
| 1033 | } |
| 1034 | /* |
| 1035 | * If we can't split the lock, we can't grant it. |
| 1036 | * Claim a system limit for the resource shortage. |
| 1037 | */ |
| 1038 | if (lf_split(overlap, unlock)) { |
| 1039 | return ENOLCK; |
| 1040 | } |
| 1041 | overlap->lf_next = unlock->lf_next; |
| 1042 | break; |
| 1043 | |
| 1044 | case OVERLAP_CONTAINED_BY_LOCK: |
| 1045 | *prev = overlap->lf_next; |
| 1046 | lf = overlap->lf_next; |
| 1047 | zfree(KT_LOCKF, overlap); |
| 1048 | continue; |
| 1049 | |
| 1050 | case OVERLAP_STARTS_BEFORE_LOCK: |
| 1051 | assert(unlock->lf_start > 0); |
| 1052 | overlap->lf_end = unlock->lf_start - 1; |
| 1053 | prev = &overlap->lf_next; |
| 1054 | lf = overlap->lf_next; |
| 1055 | continue; |
| 1056 | |
| 1057 | case OVERLAP_ENDS_AFTER_LOCK: |
| 1058 | assert(unlkend < OFF_MAX); |
| 1059 | overlap->lf_start = unlkend + 1; |
| 1060 | break; |
| 1061 | } |
| 1062 | break; |
| 1063 | } |
| 1064 | #ifdef LOCKF_DEBUGGING |
| 1065 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1066 | lf_printlist("lf_clearlock" , unlock); |
| 1067 | } |
| 1068 | #endif /* LOCKF_DEBUGGING */ |
| 1069 | return 0; |
| 1070 | } |
| 1071 | |
| 1072 | |
| 1073 | /* |
| 1074 | * lf_transferlock |
| 1075 | * |
 * Description:	Transfer a given lock from the old proc to the new proc
 *		during exec
 *
 * Parameters:	transfer	The lock to transfer
| 1079 | * |
| 1080 | * Returns: 0 Success |
| 1081 | * |
| 1082 | * Notes: A caller may transfer all the locks owned by the caller by |
| 1083 | * specifying the entire file range; locks owned by other |
 *		callers are not affected by this operation.
| 1085 | */ |
| 1086 | static int |
| 1087 | lf_transferlock(struct lockf *transfer) |
| 1088 | { |
| 1089 | struct lockf **head = transfer->lf_head; |
| 1090 | struct lockf *lf = *head; |
| 1091 | struct lockf *overlap, **prev; |
| 1092 | overlap_t ovcase; |
| 1093 | |
| 1094 | if (lf == NOLOCKF) { |
| 1095 | return 0; |
| 1096 | } |
| 1097 | #ifdef LOCKF_DEBUGGING |
| 1098 | if (transfer->lf_type != F_TRANSFER) { |
| 1099 | panic("lf_transferlock: bad type" ); |
| 1100 | } |
| 1101 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1102 | lf_print("lf_transferlock" , transfer); |
| 1103 | } |
| 1104 | #endif /* LOCKF_DEBUGGING */ |
| 1105 | prev = head; |
| 1106 | while ((ovcase = lf_findoverlap(lf, transfer, SELF, &prev, &overlap)) != OVERLAP_NONE) { |
| 1107 | /* For POSIX Locks, change lf_id and lf_owner */ |
| 1108 | if (overlap->lf_flags & F_POSIX) { |
| 1109 | overlap->lf_id = (caddr_t)transfer->lf_owner; |
| 1110 | overlap->lf_owner = transfer->lf_owner; |
| 1111 | } else if (overlap->lf_flags & F_OFD_LOCK) { |
| 1112 | /* Change the owner of the ofd style lock, if there is an owner */ |
| 1113 | if (overlap->lf_owner) { |
| 1114 | overlap->lf_owner = transfer->lf_owner; |
| 1115 | } |
| 1116 | } |
| 1117 | /* Find the next lock */ |
| 1118 | lf = overlap->lf_next; |
| 1119 | } |
| 1120 | #ifdef LOCKF_DEBUGGING |
| 1121 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1122 | lf_printlist("lf_transferlock" , transfer); |
| 1123 | } |
| 1124 | #endif /* LOCKF_DEBUGGING */ |
| 1125 | return 0; |
| 1126 | } |
| 1127 | |
| 1128 | |
| 1129 | /* |
| 1130 | * lf_getlock |
| 1131 | * |
| 1132 | * Description: Check whether there is a blocking lock, and if so return |
| 1133 | * its process identifier into the lock being requested. |
| 1134 | * |
| 1135 | * Parameters: lock Pointer to lock to test for blocks |
| 1136 | * fl Pointer to flock structure to receive |
| 1137 | * the blocking lock information, if a |
| 1138 | * blocking lock is found. |
| 1139 | * matchpid -1, or pid value to match in lookup. |
| 1140 | * |
| 1141 | * Returns: 0 Success |
| 1142 | * |
| 1143 | * Implicit Returns: |
| 1144 | * *fl Contents modified to reflect the |
| 1145 | * blocking lock, if one is found; not |
| 1146 | * modified otherwise |
| 1147 | * |
| 1148 | * Notes: fl->l_pid will be (-1) for file locks and will only be set to |
| 1149 | * the blocking process ID for advisory record locks. |
| 1150 | */ |
| 1151 | static int |
| 1152 | lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid) |
| 1153 | { |
| 1154 | struct lockf *block; |
| 1155 | |
| 1156 | #ifdef LOCKF_DEBUGGING |
| 1157 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1158 | lf_print("lf_getlock" , lock); |
| 1159 | } |
| 1160 | #endif /* LOCKF_DEBUGGING */ |
| 1161 | |
| 1162 | if ((block = lf_getblock(lock, matchpid))) { |
| 1163 | fl->l_type = block->lf_type; |
| 1164 | fl->l_whence = SEEK_SET; |
| 1165 | fl->l_start = block->lf_start; |
| 1166 | if (block->lf_end == -1 || |
| 1167 | (block->lf_start == 0 && LF_END(block) == OFF_MAX)) { |
| 1168 | fl->l_len = 0; |
| 1169 | } else { |
| 1170 | fl->l_len = LF_END(block) - block->lf_start + 1; |
| 1171 | } |
| 1172 | if (NULL != block->lf_owner) { |
| 1173 | /* |
| 1174 | * lf_owner is only non-NULL when the lock |
| 1175 | * "owner" can be unambiguously determined |
| 1176 | */ |
| 1177 | fl->l_pid = proc_pid(block->lf_owner); |
| 1178 | } else { |
| 1179 | fl->l_pid = -1; |
| 1180 | } |
| 1181 | } else { |
| 1182 | fl->l_type = F_UNLCK; |
| 1183 | } |
| 1184 | return 0; |
| 1185 | } |
| 1186 | |
| 1187 | /* |
| 1188 | * lf_getblock |
| 1189 | * |
| 1190 | * Description: Walk the list of locks for an inode and return the first |
| 1191 | * blocking lock. A lock is considered blocking if we are not |
| 1192 | * the lock owner; otherwise, we are permitted to upgrade or |
| 1193 | * downgrade it, and it's not considered blocking. |
| 1194 | * |
| 1195 | * Parameters: lock The lock for which we are interested |
| 1196 | * in obtaining the blocking lock, if any |
| 1197 | * matchpid -1, or pid value to match in lookup. |
| 1198 | * |
| 1199 | * Returns: NOLOCKF No blocking lock exists |
| 1200 | * !NOLOCKF The address of the blocking lock's |
| 1201 | * struct lockf. |
| 1202 | */ |
| 1203 | static struct lockf * |
| 1204 | lf_getblock(struct lockf *lock, pid_t matchpid) |
| 1205 | { |
| 1206 | struct lockf **prev, *overlap, *lf = *(lock->lf_head); |
| 1207 | |
| 1208 | for (prev = lock->lf_head; |
| 1209 | lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE; |
| 1210 | lf = overlap->lf_next) { |
| 1211 | /* |
| 1212 | * Found an overlap. |
| 1213 | * |
| 1214 | * If we're matching pids, and it's a record lock, |
| 1215 | * or it's an OFD lock on a process-confined fd, |
| 1216 | * but the pid doesn't match, then keep on looking .. |
| 1217 | */ |
| 1218 | if (matchpid != -1 && |
| 1219 | (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 && |
| 1220 | proc_pid(overlap->lf_owner) != matchpid) { |
| 1221 | continue; |
| 1222 | } |
| 1223 | |
| 1224 | /* |
| 1225 | * does it block us? |
| 1226 | */ |
| 1227 | if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) { |
| 1228 | return overlap; |
| 1229 | } |
| 1230 | } |
| 1231 | return NOLOCKF; |
| 1232 | } |
| 1233 | |
| 1234 | |
| 1235 | /* |
| 1236 | * lf_findoverlap |
| 1237 | * |
| 1238 | * Description: Walk the list of locks to find an overlapping lock (if any). |
| 1239 | * |
| 1240 | * Parameters: lf First lock on lock list |
| 1241 | * lock The lock we are checking for an overlap |
| 1242 | * check Check type |
 *		prev			Pointer to pointer to contain the
 *					address of the next pointer in the
 *					lock previous to the overlapping
 *					lock, if an overlap is found
 *		overlap			Pointer to pointer to contain the
 *					address of the overlapping lock
| 1248 | * |
| 1249 | * Returns: OVERLAP_NONE |
| 1250 | * OVERLAP_EQUALS_LOCK |
| 1251 | * OVERLAP_CONTAINS_LOCK |
| 1252 | * OVERLAP_CONTAINED_BY_LOCK |
| 1253 | * OVERLAP_STARTS_BEFORE_LOCK |
| 1254 | * OVERLAP_ENDS_AFTER_LOCK |
| 1255 | * |
| 1256 | * Implicit Returns: |
| 1257 | * *prev The address of the next pointer in the |
| 1258 | * lock previous to the overlapping lock; |
| 1259 | * this is generally used to relink the |
| 1260 | * lock list, avoiding a second iteration. |
| 1261 | * *overlap The pointer to the overlapping lock |
| 1262 | * itself; this is used to return data in |
| 1263 | * the check == OTHERS case, and for the |
| 1264 | * caller to modify the overlapping lock, |
| 1265 | * in the check == SELF case |
| 1266 | * |
| 1267 | * Note: This returns only the FIRST overlapping lock. There may be |
| 1268 | * more than one. lf_getlock will return the first blocking lock, |
 *		while lf_setlock will iterate over all overlapping locks,
 *		resolving each of them in turn.
| 1270 | * |
| 1271 | * The check parameter can be SELF, meaning we are looking for |
| 1272 | * overlapping locks owned by us, or it can be OTHERS, meaning |
| 1273 | * we are looking for overlapping locks owned by someone else so |
| 1274 | * we can report a blocking lock on an F_GETLK request. |
| 1275 | * |
| 1276 | * The value of *overlap and *prev are modified, even if there is |
| 1277 | * no overlapping lock found; always check the return code. |
| 1278 | */ |
| 1279 | static overlap_t |
| 1280 | lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, |
| 1281 | struct lockf ***prev, struct lockf **overlap) |
| 1282 | { |
| 1283 | int found_self = 0; |
| 1284 | |
| 1285 | *overlap = lf; |
| 1286 | if (lf == NOLOCKF) { |
		return OVERLAP_NONE;
| 1288 | } |
| 1289 | #ifdef LOCKF_DEBUGGING |
| 1290 | if (LOCKF_DEBUGP(LF_DBG_LIST)) { |
| 1291 | lf_print("lf_findoverlap: looking for overlap in" , lock); |
| 1292 | } |
| 1293 | #endif /* LOCKF_DEBUGGING */ |
| 1294 | const off_t start = lock->lf_start; |
| 1295 | const off_t end = LF_END(lock); |
| 1296 | while (lf != NOLOCKF) { |
| 1297 | if (((type & SELF) && lf->lf_id != lock->lf_id) || |
| 1298 | ((type & OTHERS) && lf->lf_id == lock->lf_id)) { |
| 1299 | /* |
| 1300 | * Locks belonging to one process are adjacent on the |
| 1301 | * list, so if we've found any locks belonging to us, |
| 1302 | * and we're now seeing something else, then we've |
| 1303 | * examined all "self" locks. Note that bailing out |
| 1304 | * here is quite important; for coalescing, we assume |
| 1305 | * numerically adjacent locks from the same owner to |
| 1306 | * be adjacent on the list. |
| 1307 | */ |
| 1308 | if ((type & SELF) && found_self) { |
| 1309 | return OVERLAP_NONE; |
| 1310 | } |
| 1311 | |
| 1312 | *prev = &lf->lf_next; |
| 1313 | *overlap = lf = lf->lf_next; |
| 1314 | continue; |
| 1315 | } |
| 1316 | |
| 1317 | if ((type & SELF)) { |
| 1318 | found_self = 1; |
| 1319 | } |
| 1320 | |
| 1321 | #ifdef LOCKF_DEBUGGING |
| 1322 | if (LOCKF_DEBUGP(LF_DBG_LIST)) { |
| 1323 | lf_print("\tchecking" , lf); |
| 1324 | } |
| 1325 | #endif /* LOCKF_DEBUGGING */ |
| 1326 | /* |
| 1327 | * OK, check for overlap |
| 1328 | */ |
| 1329 | const off_t lfstart = lf->lf_start; |
| 1330 | const off_t lfend = LF_END(lf); |
| 1331 | |
| 1332 | if ((start > lfend) || (lfstart > end)) { |
| 1333 | /* Case 0 */ |
| 1334 | LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n" ); |
| 1335 | |
| 1336 | /* |
| 1337 | * NOTE: assumes that locks for the same process are |
| 1338 | * nonintersecting and ordered. |
| 1339 | */ |
| 1340 | if ((type & SELF) && lfstart > end) { |
| 1341 | return OVERLAP_NONE; |
| 1342 | } |
| 1343 | *prev = &lf->lf_next; |
| 1344 | *overlap = lf = lf->lf_next; |
| 1345 | continue; |
| 1346 | } |
| 1347 | if ((lfstart == start) && (lfend == end)) { |
| 1348 | LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n" ); |
| 1349 | return OVERLAP_EQUALS_LOCK; |
| 1350 | } |
| 1351 | if ((lfstart <= start) && (lfend >= end)) { |
| 1352 | LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n" ); |
| 1353 | return OVERLAP_CONTAINS_LOCK; |
| 1354 | } |
| 1355 | if ((start <= lfstart) && (end >= lfend)) { |
| 1356 | LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n" ); |
| 1357 | return OVERLAP_CONTAINED_BY_LOCK; |
| 1358 | } |
| 1359 | if ((lfstart < start) && (lfend >= start)) { |
| 1360 | LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n" ); |
| 1361 | return OVERLAP_STARTS_BEFORE_LOCK; |
| 1362 | } |
| 1363 | if ((lfstart > start) && (lfend > end)) { |
| 1364 | LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n" ); |
| 1365 | return OVERLAP_ENDS_AFTER_LOCK; |
| 1366 | } |
| 1367 | panic("lf_findoverlap: default" ); |
| 1368 | } |
| 1369 | return OVERLAP_NONE; |
| 1370 | } |
| 1371 | |
| 1372 | |
| 1373 | /* |
| 1374 | * lf_split |
| 1375 | * |
| 1376 | * Description: Split a lock and a contained region into two or three locks |
| 1377 | * as necessary. |
| 1378 | * |
| 1379 | * Parameters: lock1 Lock to split |
| 1380 | * lock2 Overlapping lock region requiring the |
| 1381 | * split (upgrade/downgrade/unlock) |
| 1382 | * |
| 1383 | * Returns: 0 Success |
| 1384 | * ENOLCK No memory for new lock |
| 1385 | * |
| 1386 | * Implicit Returns: |
| 1387 | * *lock1 Modified original lock |
| 1388 | * *lock2 Overlapping lock (inserted into list) |
| 1389 | * (new lock) Potential new lock inserted into list |
| 1390 | * if split results in 3 locks |
| 1391 | * |
| 1392 | * Notes: This operation can only fail if the split would result in three |
| 1393 | * locks, and there is insufficient memory to allocate the third |
| 1394 | * lock; in that case, neither of the locks will be modified. |
| 1395 | */ |
| 1396 | static int |
| 1397 | lf_split(struct lockf *lock1, struct lockf *lock2) |
| 1398 | { |
| 1399 | struct lockf *splitlock; |
| 1400 | |
| 1401 | #ifdef LOCKF_DEBUGGING |
| 1402 | if (LOCKF_DEBUGP(LF_DBG_LIST)) { |
| 1403 | lf_print("lf_split" , lock1); |
| 1404 | lf_print("splitting from" , lock2); |
| 1405 | } |
| 1406 | #endif /* LOCKF_DEBUGGING */ |
| 1407 | /* |
| 1408 | * Check to see if splitting into only two pieces. |
| 1409 | */ |
| 1410 | if (lock1->lf_start == lock2->lf_start) { |
| 1411 | assert(LF_END(lock2) < OFF_MAX); |
| 1412 | lock1->lf_start = LF_END(lock2) + 1; |
| 1413 | lock2->lf_next = lock1; |
| 1414 | return 0; |
| 1415 | } |
| 1416 | if (LF_END(lock1) == LF_END(lock2)) { |
| 1417 | assert(lock2->lf_start > 0); |
| 1418 | lock1->lf_end = lock2->lf_start - 1; |
| 1419 | lock2->lf_next = lock1->lf_next; |
| 1420 | lock1->lf_next = lock2; |
| 1421 | return 0; |
| 1422 | } |
| 1423 | /* |
| 1424 | * Make a new lock consisting of the last part of |
| 1425 | * the encompassing lock |
| 1426 | */ |
| 1427 | splitlock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL); |
	bcopy(lock1, splitlock, sizeof *splitlock);
| 1429 | assert(LF_END(lock2) < OFF_MAX); |
| 1430 | splitlock->lf_start = LF_END(lock2) + 1; |
| 1431 | TAILQ_INIT(&splitlock->lf_blkhd); |
| 1432 | assert(lock2->lf_start > 0); |
| 1433 | lock1->lf_end = lock2->lf_start - 1; |
| 1434 | /* |
| 1435 | * OK, now link it in |
| 1436 | */ |
| 1437 | splitlock->lf_next = lock1->lf_next; |
| 1438 | lock2->lf_next = splitlock; |
| 1439 | lock1->lf_next = lock2; |
| 1440 | |
| 1441 | return 0; |
| 1442 | } |
| 1443 | |
| 1444 | |
| 1445 | /* |
| 1446 | * lf_wakelock |
| 1447 | * |
| 1448 | * Wakeup a blocklist in the case of a downgrade or unlock, since others |
| 1449 | * waiting on the lock may now be able to acquire it. |
| 1450 | * |
| 1451 | * Parameters: listhead Lock list head on which waiters may |
| 1452 | * have pending locks |
| 1453 | * |
| 1454 | * Returns: <void> |
| 1455 | * |
| 1456 | * Notes: This function iterates a list of locks and wakes all waiters, |
| 1457 | * rather than only waiters for the contended regions. Because |
| 1458 | * of this, for heavily contended files, this can result in a |
| 1459 | * "thundering herd" situation. Refactoring the code could make |
| 1460 | * this operation more efficient, if heavy contention ever results |
| 1461 | * in a real-world performance problem. |
| 1462 | */ |
| 1463 | static void |
| 1464 | lf_wakelock(struct lockf *listhead, boolean_t force_all) |
| 1465 | { |
| 1466 | struct lockf *wakelock; |
| 1467 | boolean_t wake_all = TRUE; |
| 1468 | |
| 1469 | if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) { |
| 1470 | wake_all = FALSE; |
| 1471 | } |
| 1472 | |
| 1473 | while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { |
| 1474 | wakelock = TAILQ_FIRST(&listhead->lf_blkhd); |
| 1475 | TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); |
| 1476 | |
| 1477 | wakelock->lf_next = NOLOCKF; |
| 1478 | #ifdef LOCKF_DEBUGGING |
| 1479 | if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) { |
| 1480 | lf_print("lf_wakelock: awakening" , wakelock); |
| 1481 | } |
| 1482 | #endif /* LOCKF_DEBUGGING */ |
| 1483 | if (wake_all == FALSE) { |
| 1484 | /* |
| 1485 | * If there are items on the list head block list, |
| 1486 | * move them to the wakelock list instead, and then |
| 1487 | * correct their lf_next pointers. |
| 1488 | */ |
| 1489 | if (!TAILQ_EMPTY(&listhead->lf_blkhd)) { |
| 1490 | TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block); |
| 1491 | |
| 1492 | struct lockf *tlock; |
| 1493 | |
| 1494 | TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) { |
| 1495 | if (TAILQ_NEXT(tlock, lf_block) == tlock) { |
| 1496 | /* See rdar://10887303 */ |
| 1497 | panic("cycle in wakelock list" ); |
| 1498 | } |
| 1499 | tlock->lf_next = wakelock; |
| 1500 | } |
| 1501 | } |
| 1502 | } |
		wakeup(wakelock);
| 1504 | |
| 1505 | if (wake_all == FALSE) { |
| 1506 | break; |
| 1507 | } |
| 1508 | } |
| 1509 | } |
| 1510 | |
| 1511 | |
| 1512 | #ifdef LOCKF_DEBUGGING |
| 1513 | #define GET_LF_OWNER_PID(lf) (proc_pid((lf)->lf_owner)) |
| 1514 | |
| 1515 | /* |
| 1516 | * lf_print DEBUG |
| 1517 | * |
| 1518 | * Print out a lock; lock information is prefixed by the string in 'tag' |
| 1519 | * |
| 1520 | * Parameters: tag A string tag for debugging |
| 1521 | * lock The lock whose information should be |
| 1522 | * displayed |
| 1523 | * |
| 1524 | * Returns: <void> |
| 1525 | */ |
| 1526 | void |
| 1527 | lf_print(const char *tag, struct lockf *lock) |
| 1528 | { |
| 1529 | printf("%s: lock %p for " , tag, (void *)lock); |
| 1530 | if (lock->lf_flags & F_POSIX) { |
| 1531 | printf("proc %p (owner %d)" , |
| 1532 | lock->lf_id, GET_LF_OWNER_PID(lock)); |
| 1533 | } else if (lock->lf_flags & F_OFD_LOCK) { |
| 1534 | printf("fg %p (owner %d)" , |
| 1535 | lock->lf_id, GET_LF_OWNER_PID(lock)); |
| 1536 | } else { |
| 1537 | printf("id %p" , (void *)lock->lf_id); |
| 1538 | } |
| 1539 | if (lock->lf_vnode != 0) { |
| 1540 | printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx" , |
| 1541 | lock->lf_vnode, |
| 1542 | lock->lf_type == F_RDLCK ? "shared" : |
| 1543 | lock->lf_type == F_WRLCK ? "exclusive" : |
| 1544 | lock->lf_type == F_UNLCK ? "unlock" : "unknown" , |
| 1545 | (uint64_t)lock->lf_start, (uint64_t)lock->lf_end); |
| 1546 | } else { |
| 1547 | printf(" %s, start 0x%016llx, end 0x%016llx" , |
| 1548 | lock->lf_type == F_RDLCK ? "shared" : |
| 1549 | lock->lf_type == F_WRLCK ? "exclusive" : |
| 1550 | lock->lf_type == F_UNLCK ? "unlock" : "unknown" , |
| 1551 | (uint64_t)lock->lf_start, (uint64_t)lock->lf_end); |
| 1552 | } |
| 1553 | if (!TAILQ_EMPTY(&lock->lf_blkhd)) { |
| 1554 | printf(" block %p\n" , (void *)TAILQ_FIRST(&lock->lf_blkhd)); |
| 1555 | } else { |
| 1556 | printf("\n" ); |
| 1557 | } |
| 1558 | } |
| 1559 | |
| 1560 | |
| 1561 | /* |
| 1562 | * lf_printlist DEBUG |
| 1563 | * |
| 1564 | * Print out a lock list for the vnode associated with 'lock'; lock information |
| 1565 | * is prefixed by the string in 'tag' |
| 1566 | * |
| 1567 | * Parameters: tag A string tag for debugging |
| 1568 | * lock The lock whose vnode's lock list should |
| 1569 | * be displayed |
| 1570 | * |
| 1571 | * Returns: <void> |
| 1572 | */ |
| 1573 | void |
| 1574 | lf_printlist(const char *tag, struct lockf *lock) |
| 1575 | { |
| 1576 | struct lockf *lf, *blk; |
| 1577 | |
| 1578 | if (lock->lf_vnode == 0) { |
| 1579 | return; |
| 1580 | } |
| 1581 | |
| 1582 | printf("%s: Lock list for vno %p:\n" , |
| 1583 | tag, lock->lf_vnode); |
| 1584 | for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) { |
| 1585 | printf("\tlock %p for " , (void *)lf); |
| 1586 | if (lf->lf_flags & F_POSIX) { |
| 1587 | printf("proc %p (owner %d)" , |
| 1588 | lf->lf_id, GET_LF_OWNER_PID(lf)); |
| 1589 | } else if (lf->lf_flags & F_OFD_LOCK) { |
| 1590 | printf("fg %p (owner %d)" , |
| 1591 | lf->lf_id, GET_LF_OWNER_PID(lf)); |
| 1592 | } else { |
| 1593 | printf("id %p" , (void *)lf->lf_id); |
| 1594 | } |
| 1595 | printf(", %s, start 0x%016llx, end 0x%016llx" , |
| 1596 | lf->lf_type == F_RDLCK ? "shared" : |
| 1597 | lf->lf_type == F_WRLCK ? "exclusive" : |
| 1598 | lf->lf_type == F_UNLCK ? "unlock" : |
| 1599 | "unknown" , (uint64_t)lf->lf_start, (uint64_t)lf->lf_end); |
| 1600 | TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { |
| 1601 | printf("\n\t\tlock request %p for " , (void *)blk); |
| 1602 | if (blk->lf_flags & F_POSIX) { |
| 1603 | printf("proc %p (owner %d)" , |
| 1604 | blk->lf_id, GET_LF_OWNER_PID(blk)); |
| 1605 | } else if (blk->lf_flags & F_OFD_LOCK) { |
| 1606 | printf("fg %p (owner %d)" , |
| 1607 | blk->lf_id, GET_LF_OWNER_PID(blk)); |
| 1608 | } else { |
| 1609 | printf("id %p" , (void *)blk->lf_id); |
| 1610 | } |
| 1611 | printf(", %s, start 0x%016llx, end 0x%016llx" , |
| 1612 | blk->lf_type == F_RDLCK ? "shared" : |
| 1613 | blk->lf_type == F_WRLCK ? "exclusive" : |
| 1614 | blk->lf_type == F_UNLCK ? "unlock" : |
| 1615 | "unknown" , (uint64_t)blk->lf_start, |
| 1616 | (uint64_t)blk->lf_end); |
| 1617 | if (!TAILQ_EMPTY(&blk->lf_blkhd)) { |
| 1618 | panic("lf_printlist: bad list" ); |
| 1619 | } |
| 1620 | } |
| 1621 | printf("\n" ); |
| 1622 | } |
| 1623 | } |
| 1624 | #endif /* LOCKF_DEBUGGING */ |
| 1625 | |
| 1626 | #if IMPORTANCE_INHERITANCE |
| 1627 | |
| 1628 | /* |
| 1629 | * lf_hold_assertion |
| 1630 | * |
| 1631 | * Call task importance hold assertion on the owner of the lock. |
| 1632 | * |
* Parameters: block_task Owner of the lock that is blocking
* the current thread.
*
* block The lock on which the current
* thread is blocking.
| 1638 | * |
| 1639 | * Returns: <void> |
| 1640 | * |
* Notes: A task reference on block_task need not be held, since
* the current thread holds the vnode lock and block_task holds
* a file lock; removing that file lock at exit requires
* block_task to grab the vnode lock, so block_task cannot go
* away underneath us.
| 1645 | */ |
| 1646 | static void |
| 1647 | lf_hold_assertion(task_t block_task, struct lockf *block) |
| 1648 | { |
if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
block->lf_boosted = LF_BOOSTED;
LOCKF_DEBUG(LF_DBG_IMPINH,
"lf: importance hold file lock assert on pid %d lock %p\n",
proc_pid(block->lf_owner), block);
| 1654 | } |
| 1655 | } |
| 1656 | |
| 1657 | |
| 1658 | /* |
| 1659 | * lf_jump_to_queue_head |
| 1660 | * |
| 1661 | * Jump the lock from the tail of the block queue to the head of |
| 1662 | * the queue. |
| 1663 | * |
| 1664 | * Parameters: block lockf struct containing the |
| 1665 | * block queue. |
| 1666 | * lock lockf struct to be jumped to the |
| 1667 | * front. |
| 1668 | * |
| 1669 | * Returns: <void> |
| 1670 | */ |
| 1671 | static void |
| 1672 | lf_jump_to_queue_head(struct lockf *block, struct lockf *lock) |
| 1673 | { |
| 1674 | /* Move the lock to the head of the block queue. */ |
| 1675 | TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block); |
| 1676 | TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block); |
| 1677 | } |
| 1678 | |
| 1679 | |
| 1680 | /* |
| 1681 | * lf_drop_assertion |
| 1682 | * |
| 1683 | * Drops the task hold assertion. |
| 1684 | * |
| 1685 | * Parameters: block lockf struct holding the assertion. |
| 1686 | * |
| 1687 | * Returns: <void> |
| 1688 | */ |
| 1689 | static void |
| 1690 | lf_drop_assertion(struct lockf *block) |
| 1691 | { |
LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
proc_pid(block->lf_owner), block);

task_t current_task = proc_task(block->lf_owner);
task_importance_drop_file_lock_assertion(current_task, 1);
| 1697 | block->lf_boosted = LF_NOT_BOOSTED; |
| 1698 | } |
| 1699 | |
| 1700 | /* |
| 1701 | * lf_adjust_assertion |
| 1702 | * |
* Adjusts the importance assertion of a file lock: walks all of
* the locks blocked on it and checks whether the file lock still
* needs to be boosted; if not, the assertion is dropped.
| 1706 | * |
| 1707 | * Parameters: block lockf structure which needs to be adjusted. |
| 1708 | * |
| 1709 | * Returns: <void> |
| 1710 | */ |
| 1711 | static void |
| 1712 | lf_adjust_assertion(struct lockf *block) |
| 1713 | { |
| 1714 | boolean_t drop_boost = TRUE; |
| 1715 | struct lockf *next; |
| 1716 | |
| 1717 | /* Return if the lock is not boosted */ |
| 1718 | if (block->lf_boosted == LF_NOT_BOOSTED) { |
| 1719 | return; |
| 1720 | } |
| 1721 | |
| 1722 | TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) { |
| 1723 | /* Check if block and next are same type of locks */ |
| 1724 | if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) || |
| 1725 | ((block->lf_flags & next->lf_flags & F_OFD_LOCK) && |
| 1726 | (block->lf_owner != next->lf_owner) && |
| 1727 | (NULL != block->lf_owner && NULL != next->lf_owner))) { |
| 1728 | /* Check if next would be boosting block */ |
if (task_is_importance_donor(proc_task(next->lf_owner)) &&
task_is_importance_receiver_type(proc_task(block->lf_owner))) {
| 1731 | /* Found a lock boosting block */ |
| 1732 | drop_boost = FALSE; |
| 1733 | break; |
| 1734 | } |
| 1735 | } |
| 1736 | } |
| 1737 | |
| 1738 | if (drop_boost) { |
| 1739 | lf_drop_assertion(block); |
| 1740 | } |
| 1741 | } |
| 1742 | |
| 1743 | static void |
| 1744 | lf_boost_blocking_proc(struct lockf *lock, struct lockf *block) |
| 1745 | { |
| 1746 | task_t ltask = proc_task(lock->lf_owner); |
| 1747 | task_t btask = proc_task(block->lf_owner); |
| 1748 | |
| 1749 | /* |
* Check whether ltask can donate importance. The
* imp_donor bit is read without holding any lock, so
* the value may change after it is read; that is fine,
* since it is safe to boost a task while someone else
* is unboosting it.
| 1755 | * |
| 1756 | * TODO: Support live inheritance on file locks. |
| 1757 | */ |
if (task_is_importance_donor(ltask)) {
LOCKF_DEBUG(LF_DBG_IMPINH,
"lf: %d: attempt to boost pid %d that holds lock %p\n",
proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);

if (block->lf_boosted != LF_BOOSTED &&
task_is_importance_receiver_type(btask)) {
lf_hold_assertion(btask, block);
| 1766 | } |
| 1767 | lf_jump_to_queue_head(block, lock); |
| 1768 | } |
| 1769 | } |
| 1770 | #endif /* IMPORTANCE_INHERITANCE */ |
| 1771 | |