/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	ipc/ipc_pset.c
 *	Author:	Rich Draves
 *	Date:	1989
 *
 *	Functions to manipulate IPC port sets.
 */

#include <mach/port.h>
#include <mach/kern_return.h>
#include <mach/message.h>
#include <ipc/ipc_mqueue.h>
#include <ipc/ipc_object.h>
#include <ipc/ipc_pset.h>
#include <ipc/ipc_right.h>
#include <ipc/ipc_space.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_kmsg.h>
#include <kern/policy_internal.h>

#include <kern/kern_types.h>

#include <vm/vm_map.h>
#include <libkern/section_keywords.h>
#include <pthread/priority_private.h>

/* processor_set stole ipc_pset_init */
static void
ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy)
{
	waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO);
	klist_init(&pset->ips_klist);
	pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
}

/*
 * Routine: ipc_pset_alloc
 * Purpose:
 *	Allocate a port set.
 * Conditions:
 *	Nothing locked. If successful, the port set is returned
 *	locked. (The caller doesn't have a reference.)
 * Returns:
 *	KERN_SUCCESS		The port set is allocated.
 *	KERN_INVALID_TASK	The space is dead.
 *	KERN_NO_SPACE		No room for an entry in the space.
 */

kern_return_t
ipc_pset_alloc(
	ipc_space_t             space,
	mach_port_name_t        *namep,
	ipc_pset_t              *psetp)
{
	ipc_pset_t pset;
	mach_port_name_t name;
	kern_return_t kr;

	kr = ipc_object_alloc(space, IOT_PORT_SET,
	    MACH_PORT_TYPE_PORT_SET, 0,
	    &name, (ipc_object_t *)&pset);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* space is locked */

	ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED);
	/* port set is locked */

	is_write_unlock(space);

	*namep = name;
	*psetp = pset;
	return KERN_SUCCESS;
}
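
/*
 * A minimal sketch of the locking discipline the contract above implies
 * (hypothetical caller shown; the real allocation paths live in
 * osfmk/ipc/mach_port.c):
 *
 *	ipc_pset_t pset;
 *	mach_port_name_t name;
 *
 *	if (ipc_pset_alloc(space, &name, &pset) == KERN_SUCCESS) {
 *		// pset comes back locked and without a caller reference,
 *		// so finish any setup and unlock before publishing the name.
 *		ips_mq_unlock(pset);
 *	}
 */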

/*
 * Routine: ipc_pset_alloc_name
 * Purpose:
 *	Allocate a port set, with a specific name.
 * Conditions:
 *	Nothing locked. If successful, the port set is returned
 *	locked. (The caller doesn't have a reference.)
 * Returns:
 *	KERN_SUCCESS		The port set is allocated.
 *	KERN_INVALID_TASK	The space is dead.
 *	KERN_NAME_EXISTS	The name already denotes a right.
 */

kern_return_t
ipc_pset_alloc_name(
	ipc_space_t             space,
	mach_port_name_t        name,
	ipc_pset_t              *psetp)
{
	return ipc_object_alloc_name(space, IOT_PORT_SET,
	           MACH_PORT_TYPE_PORT_SET, 0,
	           name, (ipc_object_t *)psetp, ^(ipc_object_t object){
		ipc_port_set_init(ips_object_to_pset(object), name,
		    SYNC_POLICY_INIT_LOCKED);
	});
}


/*
 * Routine: ipc_pset_alloc_special
 * Purpose:
 *	Allocate a port set in a special space.
 *	The new port set is returned with one ref.
 *	If unsuccessful, IPS_NULL is returned.
 * Conditions:
 *	Nothing locked.
 */
ipc_pset_t
ipc_pset_alloc_special(
	__assert_only ipc_space_t space)
{
	ipc_pset_t pset;

	assert(space != IS_NULL);
	assert(!is_active(space));

	pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
	if (pset == IPS_NULL) {
		return IPS_NULL;
	}

	os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET));
	os_atomic_init(&pset->ips_object.io_references, 1);

	ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0);

	return pset;
}


/*
 * Routine: ipc_pset_destroy
 * Purpose:
 *	Destroys a port_set.
 * Conditions:
 *	The port_set is locked and alive.
 *	The caller has a reference, which is consumed.
 *	Afterwards, the port_set is unlocked and dead.
 */

void
ipc_pset_destroy(
	ipc_space_t     space,
	ipc_pset_t      pset)
{
	waitq_link_list_t free_l = { };

	assert(ips_active(pset));

	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset);
	waitq_invalidate(&pset->ips_wqset);
	waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);

	ips_mq_unlock(pset);

	ips_release(pset);      /* consume the ref our caller gave us */

	waitq_link_free_list(WQT_PORT_SET, &free_l);
}

/*
 * Routine: ipc_pset_finalize
 * Purpose:
 *	Called on last reference deallocate to
 *	free any remaining data associated with the pset.
 * Conditions:
 *	Nothing locked.
 */
void
ipc_pset_finalize(
	ipc_pset_t      pset)
{
	waitq_deinit(&pset->ips_wqset);
}


#pragma mark - kevent support

/*
 * Kqueue EVFILT_MACHPORT support
 *
 * - kn_ipc_{port,pset} points to the monitored ipc port or pset. If the knote
 *   is using a kqwl, it is eligible to participate in sync IPC overrides.
 *
 *   For the first such sync IPC message in the port, we set up the port's
 *   turnstile to directly push on the kqwl's turnstile (which is in turn set
 *   up during filt_machportattach). If userspace responds to the message, the
 *   turnstile push is severed at the point of reply. If userspace returns
 *   without responding to the message, we sever the turnstile push at the
 *   point of reenabling the knote to deliver the next message. This is why the
 *   knote needs to remember the port. For more details, see also
 *   filt_machport_turnstile_complete.
 *
 *   If there are multiple other sync IPC messages in the port, messages 2 to n
 *   redirect their turnstile push to the kqwl through an intermediary "knote"
 *   turnstile which, in turn, pushes on the kqwl turnstile. This knote
 *   turnstile is stored in the kn_hook. See also
 *   filt_machport_turnstile_prepare_lazily.
 *
 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
 *   that can be used to direct-deliver messages when
 *   MACH_RCV_MSG is set in kn_sfflags
 *
 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
 *   of the userspace buffer held in ext[0].
 *
 * - (out)    ext[2] is used to deliver qos information
 *   about the send queue to userspace.
 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override. For more details,
 *   see filt_machport_stash_port
 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
 *
 *   It is used to atomically neuter the sync IPC override when the knote is
 *   re-enabled.
 *
 */
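
/*
 * A hedged sketch of the userspace side of this contract, assuming `port`
 * names a receive right in the caller's space. EV_SET64's two trailing
 * arguments populate ext[0]/ext[1] as described above:
 *
 *	#include <sys/event.h>
 *	#include <mach/message.h>
 *
 *	char buf[2048];
 *	struct kevent64_s kev;
 *	int kq = kqueue();
 *
 *	EV_SET64(&kev, port, EVFILT_MACHPORT,
 *	    EV_ADD | EV_ENABLE | EV_DISPATCH,
 *	    MACH_RCV_MSG, 0, 0,
 *	    (uint64_t)(uintptr_t)buf,   // ext[0]: direct-delivery buffer
 *	    sizeof(buf));               // ext[1]: buffer size
 *	(void)kevent64(kq, &kev, 1, &kev, 1, 0, NULL);
 *
 * On delivery, ext[1] reports the size of the copied-out message. The
 * ext[2]/ext[3] outputs documented above are only visible through the
 * private kevent_qos interface, whose kevent_qos_s has a wider ext array.
 */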

#include <sys/event.h>
#include <sys/errno.h>

static int
filt_pset_filter_result(ipc_pset_t pset)
{
	ips_mq_lock_held(pset);

	if (!waitq_is_valid(&pset->ips_wqset)) {
		return 0;
	}

	return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
	       FILTER_ACTIVE : 0;
}

static int
filt_port_filter_result(struct knote *kn, ipc_port_t port)
{
	struct kqueue *kqwl = knote_get_kq(kn);
	ipc_kmsg_t first;
	int result = 0;

	ip_mq_lock_held(port);

	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result = FILTER_RESET_EVENT_QOS;
	}

	if (!waitq_is_valid(&port->ip_waitq)) {
		return result;
	}

	if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
		kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
		result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
	}

	first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
	if (!first) {
		return result;
	}

	result = FILTER_ACTIVE;
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
	}

#if CONFIG_PREADOPT_TG
	struct thread_group *tg = ipc_kmsg_get_thread_group(first);
	if (tg) {
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_preadopted_thread_group(kq, tg,
		    first->ikm_qos_override);
	}
#endif

	return result;
}

struct turnstile *
filt_ipc_kqueue_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
	return kqueue_turnstile(knote_get_kq(kn));
}

bool
filt_machport_kqueue_has_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT);
	return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
	       && (kn->kn_flags & EV_DISPATCH);
}

/*
 * Stashes a port that participates in sync IPC override on the knote until
 * the knote is re-enabled.
 *
 * It returns:
 * - the turnstile to use as an inheritor for the stashed port
 * - the kind of stash that happened as PORT_SYNC_* value among:
 *   o not stashed (no sync IPC support)
 *   o stashed in the knote (in kn_ext[3])
 *   o to be hooked to the kn_hook knote
 */
struct turnstile *
filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
{
	struct turnstile *ts = TURNSTILE_NULL;

	if (kn->kn_filter == EVFILT_WORKLOOP) {
		assert(kn->kn_ipc_port == NULL);
		kn->kn_ipc_port = port;
		ip_reference(port);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
		ts = filt_ipc_kqueue_turnstile(kn);
	} else if (!filt_machport_kqueue_has_turnstile(kn)) {
		if (link) {
			*link = PORT_SYNC_LINK_NO_LINKAGE;
		}
	} else if (kn->kn_ext[3] == 0) {
		ip_reference(port);
		kn->kn_ext[3] = (uintptr_t)port;
		ts = filt_ipc_kqueue_turnstile(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
	} else {
		ts = (struct turnstile *)knote_kn_hook_get_raw(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_STASH;
		}
	}

	return ts;
}

/*
 * Lazily prepare a turnstile so that filt_machport_stash_port()
 * can be called with the mqueue lock held.
 *
 * It will allocate a turnstile in kn_hook if:
 * - the knote supports sync IPC override,
 * - we already stashed a port in kn_ext[3],
 * - the object that will be copied out has a chance to ask to be stashed.
 *
 * It is set up so that its inheritor is the workloop turnstile that has been
 * allocated when this knote was attached.
 */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	if (kn->kn_ext[3] == 0 || knote_kn_hook_get_raw(kn)) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		struct turnstile *ts_store;
		kn_ts = turnstile_prepare((uintptr_t)kn, &ts_store, kn_ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}

static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		if (ts) {
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		if (ts) {
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	ip_release(port);
}

void
filt_wldetach_sync_ipc(struct knote *kn)
{
	ipc_port_t port = kn->kn_ipc_port;
	filt_machport_turnstile_complete_port(kn, port);
	kn->kn_ipc_port = IP_NULL;
}

/*
 * Other half of filt_machport_turnstile_prepare_lazily()
 *
 * This is serialized by the knote state machine.
 */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	struct turnstile *ts = knote_kn_hook_get_raw(kn);
	if (ts) {
		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		struct turnstile *ts_store = ts;
		turnstile_complete((uintptr_t)kn, (struct turnstile **)&ts_store, &ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}

static void
filt_machport_link(struct klist *klist, struct knote *kn)
{
	struct knote *hd = SLIST_FIRST(klist);

	if (hd && filt_machport_kqueue_has_turnstile(kn)) {
		SLIST_INSERT_AFTER(hd, kn, kn_selnext);
	} else {
		SLIST_INSERT_HEAD(klist, kn, kn_selnext);
	}
}

static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}

int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		port = ip_object_to_port(object);
		if (port->ip_specialreply) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		port = ip_object_to_port(object);
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only extend an
		 * existing one.
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_port == port);

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}

static int
filt_psetattach(struct knote *kn, ipc_pset_t pset)
{
	int result = 0;

	ips_reference(pset);
	kn->kn_ipc_pset = pset;

	filt_machport_link(&pset->ips_klist, kn);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}

static int
filt_portattach(struct knote *kn, ipc_port_t port)
{
	struct turnstile *send_turnstile = TURNSTILE_NULL;
	int result = 0;

	if (port->ip_specialreply) {
		/*
		 * Registering for kevents on special reply ports
		 * isn't supported for two reasons:
		 *
		 * 1. it really makes very little sense for a port that
		 *    is supposed to be used synchronously
		 *
		 * 2. their port's ip_klist field will be used to
		 *    store the receive turnstile, so we can't possibly
		 *    attach them anyway.
		 */
		ip_mq_unlock(port);
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	ip_reference(port);
	kn->kn_ipc_port = port;
	if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
		/*
		 * We're attaching a port that used to have an IMQ_KNOTE,
		 * clobber this state, we'll fix up its turnstile inheritor below.
		 */
		ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
	}

	filt_machport_link(&port->ip_klist, kn);
	result = filt_port_filter_result(kn, port);

	/*
	 * Update the port's turnstile inheritor
	 *
	 * Unlike filt_machportdetach(), we don't have to care about races for
	 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
	 * already pushing knotes, and if the current one becomes the new
	 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
	 * returns.
	 */
	send_turnstile = port_send_turnstile(port);
	if (send_turnstile) {
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);

		/*
		 * rdar://problem/48861190
		 *
		 * When a listener connection resumes a peer,
		 * updating the inheritor above has moved the push
		 * from the current thread to the workloop.
		 *
		 * However, we haven't told the workloop yet
		 * that it needs a thread request, and we risk
		 * being preempted as soon as we drop the space
		 * lock below.
		 *
		 * To avoid this, disable preemption and let kevent
		 * reenable it after it takes the kqlock.
		 */
		disable_preemption();
		result |= FILTER_THREADREQ_NODEFEER;
	}

	ip_mq_unlock(port);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate_safe(send_turnstile);
	}

	return result;
}

static int
filt_machportattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		knote_set_error(kn, ENOENT);
		return 0;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_PORT_SET) {
		kn->kn_filtid = EVFILTID_MACH_PORT_SET;
		return filt_psetattach(kn, ips_object_to_pset(object));
	}

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		kn->kn_filtid = EVFILTID_MACH_PORT;
		return filt_portattach(kn, ip_object_to_port(object));
	}

	io_unlock(object);
	knote_set_error(kn, ENOTSUP);
	return 0;
}

static void
filt_psetdetach(struct knote *kn)
{
	ipc_pset_t pset = kn->kn_ipc_pset;

	filt_machport_turnstile_complete(kn);

	ips_mq_lock(pset);

	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq.
		 */
	} else {
		filt_machport_unlink(&pset->ips_klist, kn);
	}

	kn->kn_ipc_pset = IPS_NULL;
	ips_mq_unlock(pset);
	ips_release(pset);
}

static void
filt_portdetach(struct knote *kn)
{
	ipc_port_t port = kn->kn_ipc_port;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	filt_machport_turnstile_complete(kn);

	ip_mq_lock(port);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq.
		 */
	} else {
		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
		if (kn == SLIST_FIRST(&port->ip_klist)) {
			send_turnstile = port_send_turnstile(port);
		}
		filt_machport_unlink(&port->ip_klist, kn);
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
	}

	if (send_turnstile) {
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_port = IP_NULL;
	ip_mq_unlock(port);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	ip_release(port);
}

/*
 * filt_{pset,port}event - deliver events into the mach port filter
 *
 * Mach port message arrival events are currently only posted via the
 * kqueue filter routine for ports.
 *
 * If there is a message at the head of the queue,
 * we indicate that the knote should go active. If
 * the message is to be direct-received, we adjust the
 * QoS of the knote according to the requested and override
 * QoS of that first message.
 *
 * When the knote is for a port-set, the hint is non-zero
 * and is the waitq which is posting.
 */
static int
filt_psetevent(struct knote *kn __unused, long hint __assert_only)
{
	/*
	 * When called for a port-set,
	 * the posting port waitq is locked.
	 *
	 * waitq_set_first_prepost()
	 * in filt_pset_filter_result()
	 * would try to lock it and be very sad.
	 *
	 * Just trust what we know to be true.
	 */
	assert(hint != 0);
	return FILTER_ACTIVE;
}

static int
filt_portevent(struct knote *kn, long hint __assert_only)
{
	assert(hint == 0);
	return filt_port_filter_result(kn, kn->kn_ipc_port);
}

void
ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
{
	KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
}

static void
filt_machporttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	/*
	 * Specifying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
	 * allocation of a turnstile. Modifying the filter flags to include these
	 * flags later, without a turnstile being allocated, leads to
	 * inconsistencies.
	 */
	if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return;
	}

	/* copy in new settings and save off new input fflags */
	kn->kn_sfflags = kev->fflags;
	kn->kn_ext[0] = kev->ext[0];
	kn->kn_ext[1] = kev->ext[1];

	if (kev->flags & EV_ENABLE) {
		/*
		 * If the knote is being enabled, make sure there are no lingering
		 * IPC overrides from the previous message delivery.
		 */
		filt_machport_turnstile_complete(kn);
	}
}

static int
filt_psettouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_pset_t pset = kn->kn_ipc_pset;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ips_mq_lock(pset);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}

static int
filt_porttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_port_t port = kn->kn_ipc_port;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ip_mq_lock(port);
	result = filt_port_filter_result(kn, port);
	ip_mq_unlock(port);

	return result;
}

static int
filt_machportprocess(
	struct knote            *kn,
	struct kevent_qos_s     *kev,
	ipc_object_t            object,
	ipc_object_type_t       otype)
{
	thread_t self = current_thread();
	kevent_ctx_t kectx = NULL;

	wait_result_t wresult;
	mach_msg_option64_t option64;
	mach_vm_address_t msg_addr;
	mach_msg_size_t max_msg_size, cpout_aux_size, cpout_msg_size;
	uint32_t ppri;
	mach_msg_qos_t oqos;

	int result = FILTER_ACTIVE;

	/* Capture current state */
	knote_fill_kevent(kn, kev, MACH_PORT_NULL);

	/* Clear port reference, use ext3 as size of msg aux data */
	kev->ext[3] = 0;

	/* If already deallocated/moved return one last EOF event */
	if (kev->flags & EV_EOF) {
		return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
	}

	/*
	 * Only honor supported receive options. If no options are
	 * provided, just force a MACH_RCV_LARGE to detect the
	 * name of the port and the size of the waiting message.
	 *
	 * Extend kn_sfflags to 64 bits.
	 */
	option64 = (mach_msg_option64_t)kn->kn_sfflags & (MACH_RCV_MSG |
	    MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
	    MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);

	if (option64 & MACH_RCV_MSG) {
		msg_addr = (mach_vm_address_t)kn->kn_ext[0];
		max_msg_size = (mach_msg_size_t)kn->kn_ext[1];

		/*
		 * Copy out the incoming message as a vector, and append aux data
		 * immediately after the message proper (if any) and report its
		 * size on ext3.
		 */
		option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);

		/*
		 * If the kevent didn't specify a buffer and length, carve a buffer
		 * from the filter processing data according to the flags.
		 */
		if (max_msg_size == 0) {
			kectx = kevent_get_context(self);
			msg_addr = (mach_vm_address_t)kectx->kec_data_out;
			max_msg_size = (mach_msg_size_t)kectx->kec_data_resid;
			option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
			/* Receive vector linearly onto stack */
			if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
				option64 |= MACH64_RCV_STACK;
			}
		}
	} else {
		/* just detect the port name (if a set) and size of the first message */
		option64 = MACH_RCV_LARGE;
		msg_addr = 0;
		max_msg_size = 0;
	}

	/*
	 * Set up to receive a message or the notification of a
	 * too-large message. But never allow this call to wait.
	 * If the user provided additional options, like trailer
	 * options, pass those through here. But we don't support
	 * scatter lists through this interface.
	 *
	 * Note: while in filt_machportprocess(),
	 * the knote has a reference on `object` that we can borrow.
	 */
	self->ith_object = object;

	/* Using msg_addr as combined buffer for message proper and aux */
	self->ith_msg_addr = msg_addr;
	self->ith_max_msize = max_msg_size;
	self->ith_msize = 0;

	self->ith_aux_addr = 0;
	self->ith_max_asize = 0;
	self->ith_asize = 0;

	self->ith_option = option64;
	self->ith_receiver_name = MACH_PORT_NULL;
	option64 |= MACH_RCV_TIMEOUT; /* never wait */
	self->ith_state = MACH_RCV_IN_PROGRESS;
	self->ith_knote = kn;

	ipc_object_lock(object, otype);

	wresult = ipc_mqueue_receive_on_thread_and_unlock(
		io_waitq(object),
		option64,
		self->ith_max_msize,    /* max msg size */
		0,                      /* max aux size 0, using combined buffer */
		0,                      /* immediate timeout */
		THREAD_INTERRUPTIBLE,
		self);
	/* port unlocked */

	/* If we timed out, or the process is exiting, just return zero. */
	if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
		assert(self->turnstile != TURNSTILE_NULL);
		self->ith_knote = ITH_KNOTE_NULL;
		return 0;
	}

	assert(wresult == THREAD_NOT_WAITING);
	assert(self->ith_state != MACH_RCV_IN_PROGRESS);

	/*
	 * If we weren't attempting to receive a message
	 * directly, we need to return the port name in
	 * the kevent structure.
	 */
	if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
		assert(self->ith_state == MACH_RCV_TOO_LARGE);
		assert(self->ith_kmsg == IKM_NULL);
		kev->data = self->ith_receiver_name;
		self->ith_knote = ITH_KNOTE_NULL;
		return result;
	}

#if CONFIG_PREADOPT_TG
	/*
	 * If we're the first EVFILT_MACHPORT knote that is being processed for
	 * this kqwl, then make sure to preadopt the thread group from the kmsg
	 * we're about to receive. This is to make sure that we fix up the
	 * preadoption thread group correctly on the receive side for the first
	 * message.
	 */
	struct kqueue *kq = knote_get_kq(kn);

	if (self->ith_kmsg) {
		struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);

		kqueue_process_preadopt_thread_group(self, kq, tg);
	}
#endif
	if (otype == IOT_PORT) {
		ipc_port_t port = ip_object_to_port(object);
		struct kqueue *kqwl = knote_get_kq(kn);
		if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
			/*
			 * Lock the port to make sure port->ip_kernel_iotier_override does
			 * not change while updating the kqueue override, else kqueue could
			 * have an old iotier value.
			 */
			ip_mq_lock(port);
			kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
			ip_mq_unlock(port);
			result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
		}
	}

	/*
	 * Attempt to receive the message directly, returning
	 * the results in the fflags field.
	 */
	io_reference(object);
	kev->fflags = mach_msg_receive_results_kevent(&cpout_msg_size,
	    &cpout_aux_size, &ppri, &oqos);

	/* kmsg and object reference consumed */

	/*
	 * If the user asked for the identity of ports containing a
	 * too-large message, return it in the data field (as we
	 * do for messages we didn't try to receive).
	 */
	if (kev->fflags == MACH_RCV_TOO_LARGE) {
		kev->ext[1] = self->ith_msize;
		kev->ext[3] = self->ith_asize;  /* Only lower 32 bits of ext3 are used */
		if (option64 & MACH_RCV_LARGE_IDENTITY) {
			kev->data = self->ith_receiver_name;
		} else {
			kev->data = MACH_PORT_NULL;
		}
	} else {
		kev->ext[1] = cpout_msg_size;
		kev->ext[3] = cpout_aux_size;   /* Only lower 32 bits of ext3 are used */
		kev->data = MACH_PORT_NULL;
	}

	/*
	 * If we used a data buffer carved out from the filt_process data,
	 * store the address used in the knote and adjust the residual and
	 * other parameters for future use.
	 */
	if (kectx) {
		assert(kectx->kec_data_resid >= cpout_msg_size + cpout_aux_size);
		kectx->kec_data_resid -= cpout_msg_size + cpout_aux_size;
		if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
			kev->ext[0] = kectx->kec_data_out;
			kectx->kec_data_out += cpout_msg_size + cpout_aux_size;
		} else {
			assert(option64 & MACH64_RCV_STACK);
			kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
		}
	}

	/*
	 * Apply message-based QoS values to output kevent as prescribed.
	 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
	 */
	if (kev->fflags == MACH_MSG_SUCCESS) {
		kev->ext[2] = ((uint64_t)ppri << 32) |
		    _pthread_priority_make_from_thread_qos(oqos, 0, 0);
	}

	self->ith_knote = ITH_KNOTE_NULL;
	return result;
}
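
/*
 * A hedged sketch of how a userspace consumer of the private kevent_qos
 * interface might split the ext[2] packing documented above (both halves
 * are pthread_priority_t-style values; names are illustrative):
 *
 *	uint32_t msg_ppri     = (uint32_t)(kev.ext[2] >> 32);
 *	uint32_t override_pp  = (uint32_t)(kev.ext[2] & 0xffffffff);
 */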

static int
filt_psetprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t io = ips_to_object(kn->kn_ipc_pset);

	return filt_machportprocess(kn, kev, io, IOT_PORT_SET);
}

static int
filt_portprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t io = ip_to_object(kn->kn_ipc_port);

	return filt_machportprocess(kn, kev, io, IOT_PORT);
}

static void
filt_machportsanitizedcopyout(struct knote *kn, struct kevent_qos_s *kev)
{
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;

	// We may have stashed the address of the port that is pushing on the
	// sync IPC, so clear it out.
	kev->ext[3] = 0;
}

const struct filterops machport_attach_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_attach = filt_machportattach,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};

const struct filterops mach_port_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_portdetach,
	.f_event = filt_portevent,
	.f_touch = filt_porttouch,
	.f_process = filt_portprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};

const struct filterops mach_port_set_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_psetdetach,
	.f_event = filt_psetevent,
	.f_touch = filt_psettouch,
	.f_process = filt_psetprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};