1/*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: ipc/ipc_pset.c
60 * Author: Rich Draves
61 * Date: 1989
62 *
63 * Functions to manipulate IPC port sets.
64 */
65
66#include <mach/port.h>
67#include <mach/kern_return.h>
68#include <mach/message.h>
69#include <ipc/ipc_mqueue.h>
70#include <ipc/ipc_object.h>
71#include <ipc/ipc_pset.h>
72#include <ipc/ipc_right.h>
73#include <ipc/ipc_space.h>
74#include <ipc/ipc_port.h>
75
76#include <kern/kern_types.h>
77
78#include <vm/vm_map.h>
79#include <libkern/section_keywords.h>
80
81/*
82 * Routine: ipc_pset_alloc
83 * Purpose:
84 * Allocate a port set.
85 * Conditions:
86 * Nothing locked. If successful, the port set is returned
87 * locked. (The caller doesn't have a reference.)
88 * Returns:
89 * KERN_SUCCESS The port set is allocated.
90 * KERN_INVALID_TASK The space is dead.
91 * KERN_NO_SPACE No room for an entry in the space.
92 * KERN_RESOURCE_SHORTAGE Couldn't allocate memory.
93 */
94
95kern_return_t
96ipc_pset_alloc(
97 ipc_space_t space,
98 mach_port_name_t *namep,
99 ipc_pset_t *psetp)
100{
101 ipc_pset_t pset;
102 mach_port_name_t name;
103 kern_return_t kr;
104
105 kr = ipc_object_alloc(space, IOT_PORT_SET,
106 MACH_PORT_TYPE_PORT_SET, 0,
107 &name, (ipc_object_t *) &pset);
108 if (kr != KERN_SUCCESS) {
109 return kr;
110 }
111 /* pset and space are locked */
112
113 ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
114 is_write_unlock(space);
115
116 *namep = name;
117 *psetp = pset;
118 return KERN_SUCCESS;
119}
120
121/*
122 * Routine: ipc_pset_alloc_name
123 * Purpose:
124 * Allocate a port set, with a specific name.
125 * Conditions:
126 * Nothing locked. If successful, the port set is returned
127 * locked. (The caller doesn't have a reference.)
128 * Returns:
129 * KERN_SUCCESS The port set is allocated.
130 * KERN_INVALID_TASK The space is dead.
131 * KERN_NAME_EXISTS The name already denotes a right.
132 * KERN_RESOURCE_SHORTAGE Couldn't allocate memory.
133 */
134
135kern_return_t
136ipc_pset_alloc_name(
137 ipc_space_t space,
138 mach_port_name_t name,
139 ipc_pset_t *psetp)
140{
141 ipc_pset_t pset;
142 kern_return_t kr;
143
144 kr = ipc_object_alloc_name(space, IOT_PORT_SET,
145 MACH_PORT_TYPE_PORT_SET, 0,
146 name, (ipc_object_t *) &pset);
147 if (kr != KERN_SUCCESS) {
148 return kr;
149 }
150 /* pset is locked */
151
152 ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
153
154 *psetp = pset;
155 return KERN_SUCCESS;
156}
157
158
159/*
160 * Routine: ipc_pset_alloc_special
161 * Purpose:
162 * Allocate a port set in a special space.
163 * The new port set is returned with one ref.
164 * If unsuccessful, IPS_NULL is returned.
165 * Conditions:
166 * Nothing locked.
167 */
168ipc_pset_t
169ipc_pset_alloc_special(
170 __assert_only ipc_space_t space)
171{
172 ipc_pset_t pset;
173
174 assert(space != IS_NULL);
175 assert(space->is_table == IE_NULL);
176 assert(!is_active(space));
177
178 __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET));
179 if (pset == IPS_NULL) {
180 return IPS_NULL;
181 }
182
183 bzero((char *)pset, sizeof(*pset));
184
185 io_lock_init(&pset->ips_object);
186 pset->ips_references = 1;
187 pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0);
188
189 ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
190
191 return pset;
192}
193
194
195/*
196 * Routine: ipc_pset_member
197 * Purpose:
198 * Checks to see if a port is a member of a pset
199 * Conditions:
200 * Both port and port set are locked.
201 * The port must be active.
202 */
203boolean_t
204ipc_pset_member(
205 ipc_pset_t pset,
206 ipc_port_t port)
207{
208 assert(ip_active(port));
209
210 return (ipc_mqueue_member(&port->ip_messages, &pset->ips_messages));
211}
212
213
214/*
215 * Routine: ipc_pset_add
216 * Purpose:
217 * Puts a port into a port set.
218 * Conditions:
219 * Both port and port set are locked and active.
220 * The owner of the port set is also receiver for the port.
221 */
222
223kern_return_t
224ipc_pset_add(
225 ipc_pset_t pset,
226 ipc_port_t port,
227 uint64_t *reserved_link,
228 uint64_t *reserved_prepost)
229{
230 kern_return_t kr;
231
232 assert(ips_active(pset));
233 assert(ip_active(port));
234
235 kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages,
236 reserved_link, reserved_prepost);
237
238 return kr;
239}
240
241
242
243/*
244 * Routine: ipc_pset_remove
245 * Purpose:
246 * Removes a port from a port set.
247 * The port set loses a reference.
248 * Conditions:
249 * Both port and port set are locked.
250 * The port must be active.
251 */
252
253kern_return_t
254ipc_pset_remove(
255 ipc_pset_t pset,
256 ipc_port_t port)
257{
258 kern_return_t kr;
259
260 assert(ip_active(port));
261
262 if (port->ip_in_pset == 0)
263 return KERN_NOT_IN_SET;
264
265 kr = ipc_mqueue_remove(&port->ip_messages, &pset->ips_messages);
266
267 return kr;
268}
269
270/*
271 * Routine: ipc_pset_lazy_allocate
272 * Purpose:
273 * lazily initialize the wqset of a port set.
274 * Conditions:
275 * Nothing locked.
276 */
277
278kern_return_t
279ipc_pset_lazy_allocate(
280 ipc_space_t space,
281 mach_port_name_t psname)
282{
283 kern_return_t kr;
284 ipc_entry_t entry;
285 ipc_object_t psobj;
286 ipc_pset_t pset;
287
288 kr = ipc_right_lookup_read(space, psname, &entry);
289 if (kr != KERN_SUCCESS)
290 return kr;
291
292 /* space is read-locked and active */
293 if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) {
294 is_read_unlock(space);
295 kr = KERN_INVALID_RIGHT;
296 return kr;
297 }
298
299 psobj = entry->ie_object;
300 __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj);
301 assert(pset != NULL);
302 ipc_mqueue_t set_mqueue = &pset->ips_messages;
303 struct waitq_set *wqset = &set_mqueue->imq_set_queue;
304
305 io_reference(psobj);
306 is_read_unlock(space);
307
308 /*
309 * lazily initialize the wqset to avoid
310 * possible allocation while linking
311 * under spinlocks.
312 */
313 waitq_set_lazy_init_link(wqset);
314 io_release(psobj);
315
316 return KERN_SUCCESS;
317}
318
319/*
320 * Routine: ipc_pset_remove_from_all
321 * Purpose:
322 * Removes a port from all it's port sets.
323 * Conditions:
324 * port is locked and active.
325 */
326
327kern_return_t
328ipc_pset_remove_from_all(
329 ipc_port_t port)
330{
331 if (port->ip_in_pset == 0)
332 return KERN_NOT_IN_SET;
333
334 /*
335 * Remove the port's mqueue from all sets
336 */
337 ipc_mqueue_remove_from_all(&port->ip_messages);
338 return KERN_SUCCESS;
339}
340
341
342/*
343 * Routine: ipc_pset_destroy
344 * Purpose:
345 * Destroys a port_set.
346 * Conditions:
347 * The port_set is locked and alive.
348 * The caller has a reference, which is consumed.
349 * Afterwards, the port_set is unlocked and dead.
350 */
351
352void
353ipc_pset_destroy(
354 ipc_pset_t pset)
355{
356 assert(ips_active(pset));
357
358 pset->ips_object.io_bits &= ~IO_BITS_ACTIVE;
359
360 /*
361 * remove all the member message queues
362 * AND remove this message queue from any containing sets
363 */
364 ipc_mqueue_remove_all(&pset->ips_messages);
365
366 /*
367 * Set all waiters on the portset running to
368 * discover the change.
369 */
370 imq_lock(&pset->ips_messages);
371 ipc_mqueue_changed(&pset->ips_messages);
372 imq_unlock(&pset->ips_messages);
373
374 ipc_mqueue_deinit(&pset->ips_messages);
375
376 ips_unlock(pset);
377 ips_release(pset); /* consume the ref our caller gave us */
378}
379
/*
 * Kqueue EVFILT_MACHPORT support
 *
 * - kn_ptr.p_mqueue points to the monitored mqueue
 *
 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
 *   that can be used to direct-deliver messages when
 *   MACH_RCV_MSG is set in kn_sfflags
 *
 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
 *   of the userspace buffer held in ext[0].
 *
 * - (out)    ext[2] is used to deliver qos information
 *   about the send queue to userspace.
 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override.
 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
 *
 *   It is used to atomically neuter the sync IPC override when the knote is
 *   re-enabled.
 *
 */
406
407#include <sys/event.h>
408#include <sys/errno.h>
409
410static int
411filt_machport_adjust_qos(struct knote *kn, ipc_kmsg_t first)
412{
413 if (kn->kn_sfflags & MACH_RCV_MSG) {
414 int qos = _pthread_priority_thread_qos(first->ikm_qos_override);
415 return FILTER_ADJUST_EVENT_QOS(qos);
416 }
417 return 0;
418}
419
420struct turnstile *
421filt_machport_kqueue_turnstile(struct knote *kn)
422{
423 if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
424 return kqueue_turnstile(knote_get_kq(kn));
425 }
426 return TURNSTILE_NULL;
427}
428
429/*
430 * Stashes a port that participate to sync IPC override until the knote
431 * is being re-enabled.
432 *
433 * It returns:
434 * - the turnstile to use as an inheritor for the stashed port
435 * - the kind of stash that happened as PORT_SYNC_* value among:
436 * o not stashed (no sync IPC support)
437 * o stashed in the knote (in kn_ext[3])
438 * o to be hooked to the kn_hook knote
439 */
440struct turnstile *
441filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
442{
443 struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
444
445 if (!ts) {
446 if (link) *link = PORT_SYNC_LINK_NO_LINKAGE;
447 } else if (kn->kn_ext[3] == 0) {
448 ip_reference(port);
449 kn->kn_ext[3] = (uintptr_t)port;
450 if (link) *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
451 } else {
452 ts = (struct turnstile *)kn->kn_hook;
453 if (link) *link = PORT_SYNC_LINK_WORKLOOP_STASH;
454 }
455
456 return ts;
457}
458
459struct turnstile *
460filt_machport_stashed_special_reply_port_turnstile(ipc_port_t port)
461{
462 struct knote *kn = port->ip_sync_inheritor_knote;
463
464 assert(port->ip_specialreply);
465 assert(port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE);
466 if (kn->kn_ext[3] == (uint64_t)port) {
467 return kqueue_turnstile(knote_get_kq(kn));
468 }
469 return kn->kn_hook;
470}
471
472/*
473 * Lazily prepare a turnstile so that filt_machport_stash_port()
474 * can be called with the mqueue lock held.
475 *
476 * It will allocate a turnstile in kn_hook if:
477 * - the knote supports sync IPC override,
478 * - we already stashed a port in kn_ext[3],
479 * - the object that will be copied out has a chance to ask to be stashed.
480 *
481 * It is setup so that its inheritor is the workloop turnstile that has been
482 * allocated when this knote was attached.
483 */
484void
485filt_machport_turnstile_prepare_lazily(
486 struct knote *kn,
487 mach_msg_type_name_t msgt_name,
488 ipc_port_t port)
489{
490 /* This is called from within filt_machportprocess */
491 assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));
492
493 struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
494 if (ts == TURNSTILE_NULL || kn->kn_ext[3] == 0 || kn->kn_hook)
495 return;
496
497 if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
498 (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
499 struct turnstile *kn_ts = turnstile_alloc();
500 kn_ts = turnstile_prepare((uintptr_t)kn,
501 (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE);
502 turnstile_update_inheritor(kn_ts, ts,
503 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
504 turnstile_cleanup();
505 }
506}
507
508/*
509 * Other half of filt_machport_turnstile_prepare_lazily()
510 *
511 * This is serialized by the knote state machine.
512 */
513static void
514filt_machport_turnstile_complete(struct knote *kn)
515{
516 struct turnstile *ts = TURNSTILE_NULL;
517
518 if (kn->kn_ext[3]) {
519 ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
520 ipc_mqueue_t mqueue = &port->ip_messages;
521
522 ip_lock(port);
523 if (port->ip_specialreply) {
524 /*
525 * If the reply has been sent to the special reply port already,
526 * then the special reply port may already be reused to do something
527 * entirely different.
528 *
529 * However, the only reason for it to still point to this knote is
530 * that it's still waiting for a reply, so when this is the case,
531 * neuter the linkage.
532 */
533 if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
534 port->ip_sync_inheritor_knote == kn) {
535 ipc_port_adjust_special_reply_port_locked(port, NULL,
536 (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
537 } else {
538 ip_unlock(port);
539 }
540 } else {
541 struct turnstile *kq_ts = kqueue_turnstile(knote_get_kq(kn));
542
543 /*
544 * For receive rights, if their IMQ_INHERITOR() is still this
545 * workloop, then sever the link.
546 *
547 * It has a theoretical hole: if the port is sent again to a new
548 * receive right that is also monitored by the same kqueue,
549 * we would sever the link incorrectly.
550 *
551 * However this would be a REALLY cumbersome thing to do.
552 */
553 imq_lock(mqueue);
554 if (!IMQ_KLIST_VALID(mqueue) && IMQ_INHERITOR(mqueue) == kq_ts) {
555 turnstile_deallocate_safe(kq_ts);
556 klist_init(&mqueue->imq_klist);
557 ts = port_send_turnstile(port);
558 }
559 if (ts) {
560 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
561 TURNSTILE_IMMEDIATE_UPDATE);
562 turnstile_reference(ts);
563 }
564 imq_unlock(mqueue);
565 ip_unlock(port);
566
567 if (ts) {
568 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
569 turnstile_deallocate(ts);
570 }
571 }
572
573 ip_release(port);
574 kn->kn_ext[3] = 0;
575 }
576
577 if (kn->kn_hook) {
578 ts = kn->kn_hook;
579
580 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
581 TURNSTILE_IMMEDIATE_UPDATE);
582 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
583
584 turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts);
585 turnstile_cleanup();
586
587 assert(ts);
588 turnstile_deallocate(ts);
589 }
590}
591
592static int
593filt_machportattach(
594 struct knote *kn,
595 __unused struct kevent_internal_s *kev)
596{
597 mach_port_name_t name = (mach_port_name_t)kn->kn_kevent.ident;
598 uint64_t wq_link_id = waitq_link_reserve(NULL);
599 ipc_space_t space = current_space();
600 ipc_kmsg_t first;
601 struct turnstile *turnstile = TURNSTILE_NULL;
602 struct turnstile *send_turnstile = TURNSTILE_NULL;
603
604 int error;
605 int result = 0;
606 kern_return_t kr;
607 ipc_entry_t entry;
608 ipc_mqueue_t mqueue;
609
610 kn->kn_flags &= ~EV_EOF;
611 kn->kn_ext[3] = 0;
612
613 if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
614 /*
615 * If the filter is likely to support sync IPC override,
616 * and it happens to be attaching to a workloop,
617 * make sure the workloop has an allocated turnstile.
618 */
619 turnstile = kqueue_alloc_turnstile(knote_get_kq(kn));
620 }
621
622 kr = ipc_right_lookup_read(space, name, &entry);
623
624check_lookup:
625 if (kr == KERN_SUCCESS) {
626 /* space is read-locked and active */
627
628 if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) {
629 ipc_pset_t pset;
630
631 if (knote_link_waitqset_should_lazy_alloc(kn)) {
632 is_read_unlock(space);
633
634 /*
635 * We need to link the portset of the kn,
636 * to insure that the link is allocated before taking
637 * any spinlocks.
638 */
639 knote_link_waitqset_lazy_alloc(kn);
640
641 /*
642 * We had to drop the space lock because knote_link_waitqset_lazy_alloc()
643 * could have allocated memory. The ipc_right_lookup_read()
644 * function returns with the space locked, so we need to revalidate state.
645 */
646 kr = ipc_right_lookup_read(space, name, &entry);
647 if (!(kr == KERN_SUCCESS) || !(entry->ie_bits & MACH_PORT_TYPE_PORT_SET)) {
648 goto check_lookup;
649 }
650 }
651
652 __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object);
653 mqueue = &pset->ips_messages;
654 ips_reference(pset);
655
656 imq_lock(mqueue);
657 kn->kn_ptr.p_mqueue = mqueue;
658
659 /*
660 * Bind the portset wait queue directly to knote/kqueue.
661 * This allows us to just use wait_queue foo to effect a wakeup,
662 * rather than having to call knote() from the Mach code on each
663 * message. We still attach the knote to the mqueue klist for
664 * NOTE_REVOKE purposes only.
665 */
666 error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id);
667 if (!error) {
668 assert(IMQ_KLIST_VALID(mqueue));
669 KNOTE_ATTACH(&mqueue->imq_klist, kn);
670 imq_unlock(mqueue);
671 } else {
672 kn->kn_ptr.p_mqueue = IMQ_NULL;
673 imq_unlock(mqueue);
674 ips_release(pset);
675 }
676
677 is_read_unlock(space);
678
679 /*
680 * linked knotes are marked stay-active and therefore don't
681 * need an indication of their fired state to be returned
682 * from the attach operation.
683 */
684
685 } else if (entry->ie_bits & MACH_PORT_TYPE_RECEIVE) {
686 ipc_port_t port;
687
688 __IGNORE_WCASTALIGN(port = (ipc_port_t)entry->ie_object);
689 mqueue = &port->ip_messages;
690 ip_reference(port);
691
692 /*
693 * attach knote to port and determine result
694 * If the filter requested direct message receipt,
695 * we may need to adjust the qos of the knote to
696 * reflect the requested and override qos of the
697 * first message in the queue.
698 */
699 imq_lock(mqueue);
700 kn->kn_ptr.p_mqueue = mqueue;
701 if (!IMQ_KLIST_VALID(mqueue)) {
702 /*
703 * We're attaching a port that used to have an IMQ_INHERITOR,
704 * clobber this state, and set the inheritor of its turnstile
705 * to the kqueue it's now attached to.
706 */
707 turnstile_deallocate_safe(IMQ_INHERITOR(mqueue));
708 klist_init(&mqueue->imq_klist);
709 }
710 KNOTE_ATTACH(&mqueue->imq_klist, kn);
711
712 /* Update the port's turnstile inheritor */
713 send_turnstile = port_send_turnstile(port);
714 if (send_turnstile) {
715 turnstile_reference(send_turnstile);
716 turnstile_update_inheritor(send_turnstile, turnstile,
717 (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
718 }
719
720 if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
721 result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
722 }
723 imq_unlock(mqueue);
724 is_read_unlock(space);
725 if (send_turnstile) {
726 turnstile_update_inheritor_complete(send_turnstile,
727 TURNSTILE_INTERLOCK_NOT_HELD);
728 turnstile_deallocate(send_turnstile);
729 }
730
731 error = 0;
732 } else {
733 is_read_unlock(space);
734 error = ENOTSUP;
735 }
736 } else {
737 error = ENOENT;
738 }
739
740 waitq_link_release(wq_link_id);
741
742 /* bail out on errors */
743 if (error) {
744 knote_set_error(kn, error);
745 return 0;
746 }
747
748 return result;
749}
750
/*
 * NOT proud of these - we should have a stricter relationship between
 * mqueue and ipc object.
 *
 * mqueue_to_pset / mqueue_to_port recover the enclosing object from a
 * pointer to its embedded message queue by subtracting the member offset.
 * mqueue_to_object steps back one struct's worth from the mqueue pointer,
 * i.e. it assumes the mqueue sits immediately after the object header.
 *
 * The macro argument is parenthesized everywhere so that expression
 * arguments (e.g. `cond ? a : b`, `base + n`) are converted as a whole.
 */
#define mqueue_to_pset(mq) ((ipc_pset_t)((uintptr_t)(mq) - offsetof(struct ipc_pset, ips_messages)))
#define mqueue_to_port(mq) ((ipc_port_t)((uintptr_t)(mq) - offsetof(struct ipc_port, ip_messages)))
#define mqueue_to_object(mq) (((ipc_object_t)(mq)) - 1)
755
756
757static void
758filt_machportdetach(
759 struct knote *kn)
760{
761 ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
762 ipc_object_t object = mqueue_to_object(mqueue);
763 struct turnstile *send_turnstile = TURNSTILE_NULL;
764
765 filt_machport_turnstile_complete(kn);
766
767 imq_lock(mqueue);
768 if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
769 /*
770 * ipc_mqueue_changed() already unhooked this knote from the mqueue,
771 */
772 } else {
773 assert(IMQ_KLIST_VALID(mqueue));
774 KNOTE_DETACH(&mqueue->imq_klist, kn);
775 }
776
777 if (io_otype(object) == IOT_PORT) {
778 ipc_port_t port = ip_from_mq(mqueue);
779
780 send_turnstile = port_send_turnstile(port);
781 if (send_turnstile) {
782 turnstile_reference(send_turnstile);
783 turnstile_update_inheritor(send_turnstile,
784 ipc_port_get_inheritor(port),
785 TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE);
786 }
787 }
788
789 /* Clear the knote pointer once the knote has been removed from turnstile */
790 kn->kn_ptr.p_mqueue = IMQ_NULL;
791 imq_unlock(mqueue);
792
793 if (send_turnstile) {
794 turnstile_update_inheritor_complete(send_turnstile,
795 TURNSTILE_INTERLOCK_NOT_HELD);
796 turnstile_deallocate(send_turnstile);
797 }
798
799 if (io_otype(object) == IOT_PORT_SET) {
800 /*
801 * Unlink the portset wait queue from knote/kqueue.
802 * JMM - Does this need to be atomic under the mq lock?
803 */
804 (void)knote_unlink_waitq(kn, &mqueue->imq_wait_queue);
805 }
806 io_release(object);
807}
808
809/*
810 * filt_machportevent - deliver events into the mach port filter
811 *
812 * Mach port message arrival events are currently only posted via the
813 * kqueue filter routine for ports. Port sets are marked stay-active
814 * and the wait queue code will break any kqueue waiters out to go
815 * poll the stay-queued knotes again.
816 *
817 * If there is a message at the head of the queue,
818 * we indicate that the knote should go active. If
819 * the message is to be direct-received, we adjust the
820 * QoS of the knote according the requested and override
821 * QoS of that first message.
822 *
823 * NOTE_REVOKE events are a legacy way to indicate that the port/portset
824 * was deallocated or left the current Mach portspace (modern technique
825 * is with an EV_VANISHED protocol). If we see NOTE_REVOKE, deliver an
826 * EV_EOF event for these changes (hopefully it will get delivered before
827 * the port name recycles to the same generation count and someone tries
828 * to re-register a kevent for it or the events are udata-specific -
829 * avoiding a conflict).
830 */
831static int
832filt_machportevent(
833 struct knote *kn,
834 long hint)
835{
836 ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
837 ipc_kmsg_t first;
838 int result = 0;
839
840 /* mqueue locked by caller */
841 assert(imq_held(mqueue));
842
843 if (hint == NOTE_REVOKE) {
844 kn->kn_flags |= EV_EOF | EV_ONESHOT;
845 result = FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
846 } else if (imq_is_valid(mqueue)) {
847 assert(!imq_is_set(mqueue));
848 if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
849 result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
850 }
851 }
852
853 return result;
854}
855
856static int
857filt_machporttouch(
858 struct knote *kn,
859 struct kevent_internal_s *kev)
860{
861 ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
862 ipc_kmsg_t first;
863 int result = 0;
864
865 /* copy in new settings and save off new input fflags */
866 kn->kn_sfflags = kev->fflags;
867 kn->kn_ext[0] = kev->ext[0];
868 kn->kn_ext[1] = kev->ext[1];
869
870 if (kev->flags & EV_ENABLE) {
871 /*
872 * If the knote is being enabled, make sure there's no lingering
873 * IPC overrides from the previous message delivery.
874 */
875 filt_machport_turnstile_complete(kn);
876 }
877
878 /*
879 * If the mqueue is a valid port and there is a message
880 * that will be direct-received from the knote, update
881 * the knote qos based on the first message and trigger
882 * the event. If there are no more messages, reset the
883 * QoS to the value provided by the kevent.
884 */
885 imq_lock(mqueue);
886 if (imq_is_valid(mqueue) && !imq_is_set(mqueue) &&
887 (first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
888 result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
889 } else if (kn->kn_sfflags & MACH_RCV_MSG) {
890 result = FILTER_RESET_EVENT_QOS;
891 }
892 imq_unlock(mqueue);
893
894 return result;
895}
896
897static int
898filt_machportprocess(
899 struct knote *kn,
900 struct filt_process_s *process_data,
901 struct kevent_internal_s *kev)
902{
903 ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
904 ipc_object_t object = mqueue_to_object(mqueue);
905 thread_t self = current_thread();
906 boolean_t used_filtprocess_data = FALSE;
907
908 wait_result_t wresult;
909 mach_msg_option_t option;
910 mach_vm_address_t addr;
911 mach_msg_size_t size;
912
913 /* Capture current state */
914 *kev = kn->kn_kevent;
915 kev->ext[3] = 0; /* hide our port reference from userspace */
916
917 /* If already deallocated/moved return one last EOF event */
918 if (kev->flags & EV_EOF) {
919 return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
920 }
921
922 /*
923 * Only honor supported receive options. If no options are
924 * provided, just force a MACH_RCV_TOO_LARGE to detect the
925 * name of the port and sizeof the waiting message.
926 */
927 option = kn->kn_sfflags & (MACH_RCV_MSG|MACH_RCV_LARGE|MACH_RCV_LARGE_IDENTITY|
928 MACH_RCV_TRAILER_MASK|MACH_RCV_VOUCHER);
929
930 if (option & MACH_RCV_MSG) {
931 addr = (mach_vm_address_t) kn->kn_ext[0];
932 size = (mach_msg_size_t) kn->kn_ext[1];
933
934 /*
935 * If the kevent didn't specify a buffer and length, carve a buffer
936 * from the filter processing data according to the flags.
937 */
938 if (size == 0 && process_data != NULL) {
939 used_filtprocess_data = TRUE;
940
941 addr = (mach_vm_address_t)process_data->fp_data_out;
942 size = (mach_msg_size_t)process_data->fp_data_resid;
943 option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
944 if (process_data->fp_flags & KEVENT_FLAG_STACK_DATA)
945 option |= MACH_RCV_STACK;
946 }
947 } else {
948 /* just detect the port name (if a set) and size of the first message */
949 option = MACH_RCV_LARGE;
950 addr = 0;
951 size = 0;
952 }
953
954 imq_lock(mqueue);
955
956 /* just use the reference from here on out */
957 io_reference(object);
958
959 /*
960 * Set up to receive a message or the notification of a
961 * too large message. But never allow this call to wait.
962 * If the user provided aditional options, like trailer
963 * options, pass those through here. But we don't support
964 * scatter lists through this interface.
965 */
966 self->ith_object = object;
967 self->ith_msg_addr = addr;
968 self->ith_rsize = size;
969 self->ith_msize = 0;
970 self->ith_option = option;
971 self->ith_receiver_name = MACH_PORT_NULL;
972 self->ith_continuation = NULL;
973 option |= MACH_RCV_TIMEOUT; // never wait
974 self->ith_state = MACH_RCV_IN_PROGRESS;
975 self->ith_knote = kn;
976
977 wresult = ipc_mqueue_receive_on_thread(
978 mqueue,
979 option,
980 size, /* max_size */
981 0, /* immediate timeout */
982 THREAD_INTERRUPTIBLE,
983 self);
984 /* mqueue unlocked */
985
986 /*
987 * If we timed out, or the process is exiting, just release the
988 * reference on the ipc_object and return zero.
989 */
990 if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
991 assert(self->turnstile != TURNSTILE_NULL);
992 io_release(object);
993 return 0;
994 }
995
996 assert(wresult == THREAD_NOT_WAITING);
997 assert(self->ith_state != MACH_RCV_IN_PROGRESS);
998
999 /*
1000 * If we weren't attempting to receive a message
1001 * directly, we need to return the port name in
1002 * the kevent structure.
1003 */
1004 if ((option & MACH_RCV_MSG) != MACH_RCV_MSG) {
1005 assert(self->ith_state == MACH_RCV_TOO_LARGE);
1006 assert(self->ith_kmsg == IKM_NULL);
1007 kev->data = self->ith_receiver_name;
1008 io_release(object);
1009 return FILTER_ACTIVE;
1010 }
1011
1012 /*
1013 * Attempt to receive the message directly, returning
1014 * the results in the fflags field.
1015 */
1016 kev->fflags = mach_msg_receive_results(&size);
1017
1018 /* kmsg and object reference consumed */
1019
1020 /*
1021 * if the user asked for the identity of ports containing a
1022 * a too-large message, return it in the data field (as we
1023 * do for messages we didn't try to receive).
1024 */
1025 if (kev->fflags == MACH_RCV_TOO_LARGE) {
1026 kev->ext[1] = self->ith_msize;
1027 if (option & MACH_RCV_LARGE_IDENTITY)
1028 kev->data = self->ith_receiver_name;
1029 else
1030 kev->data = MACH_PORT_NULL;
1031 } else {
1032 kev->ext[1] = size;
1033 kev->data = MACH_PORT_NULL;
1034 }
1035
1036 /*
1037 * If we used a data buffer carved out from the filt_process data,
1038 * store the address used in the knote and adjust the residual and
1039 * other parameters for future use.
1040 */
1041 if (used_filtprocess_data) {
1042 assert(process_data->fp_data_resid >= size);
1043 process_data->fp_data_resid -= size;
1044 if ((process_data->fp_flags & KEVENT_FLAG_STACK_DATA) == 0) {
1045 kev->ext[0] = process_data->fp_data_out;
1046 process_data->fp_data_out += size;
1047 } else {
1048 assert(option & MACH_RCV_STACK);
1049 kev->ext[0] = process_data->fp_data_out +
1050 process_data->fp_data_resid;
1051 }
1052 }
1053
1054 /*
1055 * Apply message-based QoS values to output kevent as prescribed.
1056 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
1057 *
1058 * The mach_msg_receive_results() call saved off the message
1059 * QoS values in the continuation save area on successful receive.
1060 */
1061 if (kev->fflags == MACH_MSG_SUCCESS) {
1062 kev->ext[2] = ((uint64_t)self->ith_qos << 32) |
1063 (uint64_t)self->ith_qos_override;
1064 }
1065
1066 return FILTER_ACTIVE;
1067}
1068
1069/*
1070 * Peek to see if the message queue associated with the knote has any
1071 * events. This pre-hook is called when a filter uses the stay-
1072 * on-queue mechanism (as the knote_link_waitq mechanism does for
1073 * portsets) and someone calls select() against the containing kqueue.
1074 *
1075 * Just peek at the pre-post status of the portset's wait queue
1076 * to determine if it has anything interesting. We can do it
1077 * without holding the lock, as it is just a snapshot in time
1078 * (if this is used as part of really waiting for events, we
1079 * will catch changes in this status when the event gets posted
1080 * up to the knote's kqueue).
1081 */
1082static int
1083filt_machportpeek(struct knote *kn)
1084{
1085 ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
1086
1087 return ipc_mqueue_set_peek(mqueue) ? FILTER_ACTIVE : 0;
1088}
1089
1090SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
1091 .f_adjusts_qos = true,
1092 .f_extended_codes = true,
1093 .f_attach = filt_machportattach,
1094 .f_detach = filt_machportdetach,
1095 .f_event = filt_machportevent,
1096 .f_touch = filt_machporttouch,
1097 .f_process = filt_machportprocess,
1098 .f_peek = filt_machportpeek,
1099};
1100