1/*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Flow Control and Feedback Advisory
31 *
32 * Each mbuf that is being sent out through an interface is tagged with a
33 * unique 32-bit ID which will help to identify all the packets that belong
34 * to a particular flow at the interface layer. Packets carrying such ID
35 * would need to be marked with PKTF_FLOW_ID. Normally, this ID is computed
36 * by the module that generates the flow. There are 3 kinds of flow sources
37 * that are currently recognized:
38 *
39 * a. INPCB (INET/INET6 Protocol Control Block). When a socket is
40 * connected, the flow hash for the socket is computed and stored in
41 * the PCB. Further transmissions on the socket will cause the hash
42 * value to be carried within the mbuf as the flow ID.
43 *
44 * b. Interface. When an interface is attached, the flow hash for the
45 * interface is computed and stored in the ifnet. This value is
46 * normally ignored for most network drivers, except for those that
47 * reside atop another driver, e.g. a virtual interface performing
48 * encapsulation/encryption on the original packet and sending the
 * newly-generated packet to another interface. Such an interface needs
50 * to associate all generated packets with the interface flow hash
51 * value as the flow ID.
52 *
53 * c. PF (Packet Filter). When a packet goes through PF and it is not
54 * already associated with a flow ID, PF will compute a flow hash and
55 * store it in the packet as flow ID. When the packet is associated
56 * with a PF state, the state record will have the flow ID stored
57 * within, in order to avoid recalculating the flow hash. Although PF
58 * is capable of generating flow IDs, it does not participate in flow
59 * advisory, and therefore packets whose IDs are computed by PF will
60 * not have their PKTF_FLOW_ADV packet flag set.
61 *
62 * Activation of flow advisory mechanism is done by setting the PKTF_FLOW_ADV
63 * packet flag; because a flow ID is required, the mechanism will not take
64 * place unless PKTF_FLOW_ID is set as well. The packet must also carry one
65 * of the flow source types FLOWSRC_{INPCB,IFNET} in order to identify where
66 * the flow advisory notification should be delivered to. As noted above,
67 * FLOWSRC_PF does not participate in this mechanism.
68 *
69 * The classq module configured on the interface is responsible for exerting
70 * flow control to the upper layers. This occurs when the number of packets
71 * queued for a flow reaches a limit. The module generating the flow will
72 * cease transmission until further flow advisory notice, and the flow will
73 * be inserted into the classq's flow control list.
74 *
75 * When packets are dequeued from the classq and the number of packets for
76 * a flow goes below a limit, the classq will transfer its flow control list
77 * to the global fadv_list. This will then trigger the flow advisory thread
78 * to run, which will cause the flow source modules to be notified that data
79 * can now be generated for those previously flow-controlled flows.
80 */
81
82#include <sys/param.h>
83#include <sys/systm.h>
84#include <sys/kernel.h>
85#include <sys/mcache.h> /* for VERIFY() */
86#include <sys/mbuf.h>
87#include <sys/proc_internal.h>
88#include <sys/socketvar.h>
89
90#include <kern/assert.h>
91#include <kern/thread.h>
92#include <kern/locks.h>
93#include <kern/zalloc.h>
94
95#include <netinet/in_pcb.h>
96#include <net/flowadv.h>
97#if SKYWALK
98#include <skywalk/os_channel.h>
99#endif /* SKYWALK */
100
/*
 * Lock group and mutex for fadv_lock.  fadv_lock serializes access to the
 * flow advisory state declared below.
 */
static LCK_GRP_DECLARE(fadv_lock_grp, "fadv_lock");
static LCK_MTX_DECLARE(fadv_lock, &fadv_lock_grp);

/* protected by fadv_lock */
/*
 * fadv_list:   advisory entries waiting to be processed; filled by
 *              flowadv_add() / flowadv_add_entry() and drained by
 *              flowadv_thread_cont().  Its address doubles as the
 *              sleep/wakeup channel of the advisory thread.
 * fadv_thread: the advisory thread created by flowadv_init().
 * fadv_active: set to 1 by flowadv_thread_cont() while it is draining
 *              fadv_list; producers skip the wakeup while it is set.
 */
static STAILQ_HEAD(fadv_head, flowadv_fcentry) fadv_list =
    STAILQ_HEAD_INITIALIZER(fadv_list);
static thread_t fadv_thread = THREAD_NULL;
static uint32_t fadv_active;

/* NOTE(review): not referenced anywhere in the visible file -- confirm before removing */
#define FADV_CACHE_NAME "flowadv" /* cache name */

/* Advisory thread entry point and its msleep0() continuation. */
static int flowadv_thread_cont(int);
static void flowadv_thread_func(void *, wait_result_t);
115
116void
117flowadv_init(void)
118{
119 if (kernel_thread_start(continuation: flowadv_thread_func, NULL, new_thread: &fadv_thread) !=
120 KERN_SUCCESS) {
121 panic("%s: couldn't create flow event advisory thread",
122 __func__);
123 /* NOTREACHED */
124 }
125 thread_deallocate(thread: fadv_thread);
126}
127
128struct flowadv_fcentry *
129flowadv_alloc_entry(int how)
130{
131 return kalloc_type(struct flowadv_fcentry, how | Z_ZERO);
132}
133
/*
 * Release a flow advisory entry by handing it to kfree_type().
 *
 * Called by flowadv_thread_cont() once an entry has been processed.
 */
void
flowadv_free_entry(struct flowadv_fcentry *fce)
{
	kfree_type(struct flowadv_fcentry, fce);
}
139
140void
141flowadv_add(struct flowadv_fclist *fcl)
142{
143 if (STAILQ_EMPTY(fcl)) {
144 return;
145 }
146
147 lck_mtx_lock_spin(lck: &fadv_lock);
148
149 STAILQ_CONCAT(&fadv_list, fcl);
150 VERIFY(!STAILQ_EMPTY(&fadv_list));
151
152 if (!fadv_active && fadv_thread != THREAD_NULL) {
153 wakeup_one(chan: (caddr_t)&fadv_list);
154 }
155
156 lck_mtx_unlock(lck: &fadv_lock);
157}
158
159void
160flowadv_add_entry(struct flowadv_fcentry *fce)
161{
162 lck_mtx_lock_spin(lck: &fadv_lock);
163 STAILQ_INSERT_HEAD(&fadv_list, fce, fce_link);
164 VERIFY(!STAILQ_EMPTY(&fadv_list));
165
166 if (!fadv_active && fadv_thread != THREAD_NULL) {
167 wakeup_one(chan: (caddr_t)&fadv_list);
168 }
169
170 lck_mtx_unlock(lck: &fadv_lock);
171}
172
173static int
174flowadv_thread_cont(int err)
175{
176#pragma unused(err)
177 for (;;) {
178 LCK_MTX_ASSERT(&fadv_lock, LCK_MTX_ASSERT_OWNED);
179 while (STAILQ_EMPTY(&fadv_list)) {
180 VERIFY(!fadv_active);
181 (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN),
182 wmesg: "flowadv_cont", timo: 0, continuation: flowadv_thread_cont);
183 /* NOTREACHED */
184 }
185
186 fadv_active = 1;
187 for (;;) {
188 struct flowadv_fcentry *fce;
189
190 VERIFY(!STAILQ_EMPTY(&fadv_list));
191 fce = STAILQ_FIRST(&fadv_list);
192 STAILQ_REMOVE(&fadv_list, fce,
193 flowadv_fcentry, fce_link);
194 STAILQ_NEXT(fce, fce_link) = NULL;
195
196 lck_mtx_unlock(lck: &fadv_lock);
197
198 if (fce->fce_event_type == FCE_EVENT_TYPE_CONGESTION_EXPERIENCED) {
199 switch (fce->fce_flowsrc_type) {
200 case FLOWSRC_CHANNEL:
201 kern_channel_flowadv_report_ce_event(fce, fce->fce_ce_cnt,
202 fce->fce_pkts_since_last_report);
203 break;
204 case FLOWSRC_INPCB:
205 case FLOWSRC_IFNET:
206 case FLOWSRC_PF:
207 default:
208 break;
209 }
210
211 goto next;
212 }
213
214 switch (fce->fce_flowsrc_type) {
215 case FLOWSRC_INPCB:
216 inp_flowadv(fce->fce_flowid);
217 break;
218
219 case FLOWSRC_IFNET:
220#if SKYWALK
221 /*
222 * when using the flowID allocator, IPSec
223 * driver uses the "pkt_flowid" field in mbuf
224 * packet header for the globally unique flowID
225 * and the "pkt_mpriv_srcid" field carries the
226 * interface flow control id (if_flowhash).
227 * For IPSec flows, it is the IPSec driver
228 * network interface which is flow controlled,
229 * instead of the IPSec SA flow.
230 */
231 ifnet_flowadv(fce->fce_flowsrc_token);
232#else /* !SKYWALK */
233 ifnet_flowadv(fce->fce_flowid);
234#endif /* !SKYWALK */
235 break;
236
237#if SKYWALK
238 case FLOWSRC_CHANNEL:
239 kern_channel_flowadv_clear(fce);
240 break;
241#endif /* SKYWALK */
242
243 case FLOWSRC_PF:
244 default:
245 break;
246 }
247next:
248 flowadv_free_entry(fce);
249 lck_mtx_lock_spin(lck: &fadv_lock);
250
251 /* if there's no pending request, we're done */
252 if (STAILQ_EMPTY(&fadv_list)) {
253 break;
254 }
255 }
256 fadv_active = 0;
257 }
258}
259
260__dead2
261static void
262flowadv_thread_func(void *v, wait_result_t w)
263{
264#pragma unused(v, w)
265 lck_mtx_lock(lck: &fadv_lock);
266 (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN),
267 wmesg: "flowadv", timo: 0, continuation: flowadv_thread_cont);
268 /*
269 * msleep0() shouldn't have returned as PCATCH was not set;
270 * therefore assert in this case.
271 */
272 lck_mtx_unlock(lck: &fadv_lock);
273 VERIFY(0);
274}
275