1 | /* |
2 | * Copyright (c) 2012-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Flow Control and Feedback Advisory |
31 | * |
32 | * Each mbuf that is being sent out through an interface is tagged with a |
33 | * unique 32-bit ID which will help to identify all the packets that belong |
34 | * to a particular flow at the interface layer. Packets carrying such ID |
35 | * would need to be marked with PKTF_FLOW_ID. Normally, this ID is computed |
36 | * by the module that generates the flow. There are 3 kinds of flow sources |
37 | * that are currently recognized: |
38 | * |
39 | * a. INPCB (INET/INET6 Protocol Control Block). When a socket is |
40 | * connected, the flow hash for the socket is computed and stored in |
41 | * the PCB. Further transmissions on the socket will cause the hash |
42 | * value to be carried within the mbuf as the flow ID. |
43 | * |
44 | * b. Interface. When an interface is attached, the flow hash for the |
45 | * interface is computed and stored in the ifnet. This value is |
46 | * normally ignored for most network drivers, except for those that |
47 | * reside atop another driver, e.g. a virtual interface performing |
48 | * encapsulation/encryption on the original packet and sending the |
49 | * newly-generated packet to another interface. Such interface needs |
50 | * to associate all generated packets with the interface flow hash |
51 | * value as the flow ID. |
52 | * |
53 | * c. PF (Packet Filter). When a packet goes through PF and it is not |
54 | * already associated with a flow ID, PF will compute a flow hash and |
55 | * store it in the packet as flow ID. When the packet is associated |
56 | * with a PF state, the state record will have the flow ID stored |
57 | * within, in order to avoid recalculating the flow hash. Although PF |
58 | * is capable of generating flow IDs, it does not participate in flow |
59 | * advisory, and therefore packets whose IDs are computed by PF will |
60 | * not have their PKTF_FLOW_ADV packet flag set. |
61 | * |
62 | * Activation of flow advisory mechanism is done by setting the PKTF_FLOW_ADV |
63 | * packet flag; because a flow ID is required, the mechanism will not take |
64 | * place unless PKTF_FLOW_ID is set as well. The packet must also carry one |
65 | * of the flow source types FLOWSRC_{INPCB,IFNET} in order to identify where |
66 | * the flow advisory notification should be delivered to. As noted above, |
67 | * FLOWSRC_PF does not participate in this mechanism. |
68 | * |
69 | * The classq module configured on the interface is responsible for exerting |
70 | * flow control to the upper layers. This occurs when the number of packets |
71 | * queued for a flow reaches a limit. The module generating the flow will |
72 | * cease transmission until further flow advisory notice, and the flow will |
73 | * be inserted into the classq's flow control list. |
74 | * |
75 | * When packets are dequeued from the classq and the number of packets for |
76 | * a flow goes below a limit, the classq will transfer its flow control list |
77 | * to the global fadv_list. This will then trigger the flow advisory thread |
78 | * to run, which will cause the flow source modules to be notified that data |
79 | * can now be generated for those previously flow-controlled flows. |
80 | */ |
81 | |
82 | #include <sys/param.h> |
83 | #include <sys/systm.h> |
84 | #include <sys/kernel.h> |
85 | #include <sys/mcache.h> /* for VERIFY() */ |
86 | #include <sys/mbuf.h> |
87 | #include <sys/proc_internal.h> |
88 | #include <sys/socketvar.h> |
89 | |
90 | #include <kern/assert.h> |
91 | #include <kern/thread.h> |
92 | #include <kern/locks.h> |
93 | #include <kern/zalloc.h> |
94 | |
95 | #include <netinet/in_pcb.h> |
96 | #include <net/flowadv.h> |
97 | #if SKYWALK |
98 | #include <skywalk/os_channel.h> |
99 | #endif /* SKYWALK */ |
100 | |
101 | /* Lock group and attribute for fadv_lock */ |
102 | static LCK_GRP_DECLARE(fadv_lock_grp, "fadv_lock" ); |
103 | static LCK_MTX_DECLARE(fadv_lock, &fadv_lock_grp); |
104 | |
105 | /* protected by fadv_lock */ |
106 | static STAILQ_HEAD(fadv_head, flowadv_fcentry) fadv_list = |
107 | STAILQ_HEAD_INITIALIZER(fadv_list); |
108 | static thread_t fadv_thread = THREAD_NULL; |
109 | static uint32_t fadv_active; |
110 | |
111 | #define FADV_CACHE_NAME "flowadv" /* cache name */ |
112 | |
113 | static int flowadv_thread_cont(int); |
114 | static void flowadv_thread_func(void *, wait_result_t); |
115 | |
116 | void |
117 | flowadv_init(void) |
118 | { |
119 | if (kernel_thread_start(continuation: flowadv_thread_func, NULL, new_thread: &fadv_thread) != |
120 | KERN_SUCCESS) { |
121 | panic("%s: couldn't create flow event advisory thread" , |
122 | __func__); |
123 | /* NOTREACHED */ |
124 | } |
125 | thread_deallocate(thread: fadv_thread); |
126 | } |
127 | |
128 | struct flowadv_fcentry * |
129 | flowadv_alloc_entry(int how) |
130 | { |
131 | return kalloc_type(struct flowadv_fcentry, how | Z_ZERO); |
132 | } |
133 | |
134 | void |
135 | flowadv_free_entry(struct flowadv_fcentry *fce) |
136 | { |
137 | kfree_type(struct flowadv_fcentry, fce); |
138 | } |
139 | |
140 | void |
141 | flowadv_add(struct flowadv_fclist *fcl) |
142 | { |
143 | if (STAILQ_EMPTY(fcl)) { |
144 | return; |
145 | } |
146 | |
147 | lck_mtx_lock_spin(lck: &fadv_lock); |
148 | |
149 | STAILQ_CONCAT(&fadv_list, fcl); |
150 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
151 | |
152 | if (!fadv_active && fadv_thread != THREAD_NULL) { |
153 | wakeup_one(chan: (caddr_t)&fadv_list); |
154 | } |
155 | |
156 | lck_mtx_unlock(lck: &fadv_lock); |
157 | } |
158 | |
159 | void |
160 | flowadv_add_entry(struct flowadv_fcentry *fce) |
161 | { |
162 | lck_mtx_lock_spin(lck: &fadv_lock); |
163 | STAILQ_INSERT_HEAD(&fadv_list, fce, fce_link); |
164 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
165 | |
166 | if (!fadv_active && fadv_thread != THREAD_NULL) { |
167 | wakeup_one(chan: (caddr_t)&fadv_list); |
168 | } |
169 | |
170 | lck_mtx_unlock(lck: &fadv_lock); |
171 | } |
172 | |
173 | static int |
174 | flowadv_thread_cont(int err) |
175 | { |
176 | #pragma unused(err) |
177 | for (;;) { |
178 | LCK_MTX_ASSERT(&fadv_lock, LCK_MTX_ASSERT_OWNED); |
179 | while (STAILQ_EMPTY(&fadv_list)) { |
180 | VERIFY(!fadv_active); |
181 | (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN), |
182 | wmesg: "flowadv_cont" , timo: 0, continuation: flowadv_thread_cont); |
183 | /* NOTREACHED */ |
184 | } |
185 | |
186 | fadv_active = 1; |
187 | for (;;) { |
188 | struct flowadv_fcentry *fce; |
189 | |
190 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
191 | fce = STAILQ_FIRST(&fadv_list); |
192 | STAILQ_REMOVE(&fadv_list, fce, |
193 | flowadv_fcentry, fce_link); |
194 | STAILQ_NEXT(fce, fce_link) = NULL; |
195 | |
196 | lck_mtx_unlock(lck: &fadv_lock); |
197 | |
198 | if (fce->fce_event_type == FCE_EVENT_TYPE_CONGESTION_EXPERIENCED) { |
199 | switch (fce->fce_flowsrc_type) { |
200 | case FLOWSRC_CHANNEL: |
201 | kern_channel_flowadv_report_ce_event(fce, fce->fce_ce_cnt, |
202 | fce->fce_pkts_since_last_report); |
203 | break; |
204 | case FLOWSRC_INPCB: |
205 | case FLOWSRC_IFNET: |
206 | case FLOWSRC_PF: |
207 | default: |
208 | break; |
209 | } |
210 | |
211 | goto next; |
212 | } |
213 | |
214 | switch (fce->fce_flowsrc_type) { |
215 | case FLOWSRC_INPCB: |
216 | inp_flowadv(fce->fce_flowid); |
217 | break; |
218 | |
219 | case FLOWSRC_IFNET: |
220 | #if SKYWALK |
221 | /* |
222 | * when using the flowID allocator, IPSec |
223 | * driver uses the "pkt_flowid" field in mbuf |
224 | * packet header for the globally unique flowID |
225 | * and the "pkt_mpriv_srcid" field carries the |
226 | * interface flow control id (if_flowhash). |
227 | * For IPSec flows, it is the IPSec driver |
228 | * network interface which is flow controlled, |
229 | * instead of the IPSec SA flow. |
230 | */ |
231 | ifnet_flowadv(fce->fce_flowsrc_token); |
232 | #else /* !SKYWALK */ |
233 | ifnet_flowadv(fce->fce_flowid); |
234 | #endif /* !SKYWALK */ |
235 | break; |
236 | |
237 | #if SKYWALK |
238 | case FLOWSRC_CHANNEL: |
239 | kern_channel_flowadv_clear(fce); |
240 | break; |
241 | #endif /* SKYWALK */ |
242 | |
243 | case FLOWSRC_PF: |
244 | default: |
245 | break; |
246 | } |
247 | next: |
248 | flowadv_free_entry(fce); |
249 | lck_mtx_lock_spin(lck: &fadv_lock); |
250 | |
251 | /* if there's no pending request, we're done */ |
252 | if (STAILQ_EMPTY(&fadv_list)) { |
253 | break; |
254 | } |
255 | } |
256 | fadv_active = 0; |
257 | } |
258 | } |
259 | |
260 | __dead2 |
261 | static void |
262 | flowadv_thread_func(void *v, wait_result_t w) |
263 | { |
264 | #pragma unused(v, w) |
265 | lck_mtx_lock(lck: &fadv_lock); |
266 | (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN), |
267 | wmesg: "flowadv" , timo: 0, continuation: flowadv_thread_cont); |
268 | /* |
269 | * msleep0() shouldn't have returned as PCATCH was not set; |
270 | * therefore assert in this case. |
271 | */ |
272 | lck_mtx_unlock(lck: &fadv_lock); |
273 | VERIFY(0); |
274 | } |
275 | |